In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)

print("\nLoading and preprocessing dataset...")


df = pd.read_csv('dataset_phishing.csv')
print(f"Loaded dataset_phishing.csv with {len(df)} rows")


print("\nPreprocessing steps:")


print("1. Converting labels...")
df['label'] = (df['status'] == 'phishing').astype(int)


print("2. Cleaning URLs...")
df['text'] = df['url'].fillna('').astype(str)


print("3. Validating URLs...")
def is_valid_url(url):
    """Basic URL validation"""
    if not isinstance(url, str):
        return False
    return len(url) > 5

print(f"Dataset before URL validation: {len(df)}")
df = df[df['text'].apply(is_valid_url)].copy()
print(f"Dataset after URL validation: {len(df)}")


print("\n4. Splitting dataset...")
train_df, test_df = train_test_split(
    df, 
    test_size=0.5,  
    random_state=42,
    stratify=df['label']  
)


print("\nDataset Statistics:")
print("-" * 50)
print(f"Total valid URLs: {len(df)}")
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

print("\nTraining Set Label Distribution:")
print(train_df['label'].value_counts(normalize=True))
print("\nTesting Set Label Distribution:")
print(test_df['label'].value_counts(normalize=True))


print("\nURL Length Statistics:")
url_lengths = train_df['text'].str.len()
print("Training set:")
print(url_lengths.describe())


Loading and preprocessing dataset...
Loaded dataset_phishing.csv with 11430 rows

Preprocessing steps:
1. Converting labels...
2. Cleaning URLs...
3. Validating URLs...
Dataset before URL validation: 11430
Dataset after URL validation: 11430

4. Splitting dataset...

Dataset Statistics:
--------------------------------------------------
Total valid URLs: 11430
Training set size: 5715
Testing set size: 5715

Training Set Label Distribution:
label
1    0.500087
0    0.499913
Name: proportion, dtype: float64

Testing Set Label Distribution:
label
0    0.500087
1    0.499913
Name: proportion, dtype: float64

URL Length Statistics:
Training set:
count    5715.000000
mean       61.677690
std        60.467361
min        12.000000
25%        32.500000
50%        47.000000
75%        70.000000
max      1641.000000
Name: text, dtype: float64


Dataset link :- https://www.kaggle.com/datasets/shashwatwork/web-page-phishing-detection-dataset

In [2]:

print("Preparing datasets...")


import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


url_lengths = train_df['text'].str.len()
max_length = min(256, int(url_lengths.quantile(0.99)))  
print(f"\nOptimized max_length set to {max_length} based on URL length distribution")

def df_to_dataset(df, batch_size=2000):  
    """Convert DataFrame to Dataset with tokenization"""
    try:
        
        ds = Dataset.from_pandas(df[['text', 'label']].reset_index(drop=True))
        print(f"Created initial dataset with {len(ds)} samples")
        
        
        def tokenize(batch):
            return tokenizer(
                batch['text'], 
                truncation=True, 
                padding='max_length', 
                max_length=max_length,
                return_attention_mask=True,
                return_tensors=None  
            )
        
        
        ds = ds.map(tokenize, batched=True, batch_size=batch_size, num_proc=4)
        print("Completed tokenization")
        
        
        ds = ds.rename_column('label', 'labels')
        ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        print("Dataset format set for training")
        
        return ds
    
    except Exception as e:
        print(f"Error in dataset preparation: {str(e)}")
        raise


print("\nInitializing model and tokenizer...")
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    hidden_dropout_prob=0.2,  
    attention_probs_dropout_prob=0.2
)

print("\nConverting datasets to PyTorch format...")
train_dataset = df_to_dataset(train_df)
test_dataset = df_to_dataset(test_df)
print(f"Created training dataset with {len(train_dataset)} samples")
print(f"Created testing dataset with {len(test_dataset)} samples")

labels = train_df['label'].values
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("\nClass weights for balanced training:", class_weights)

Preparing datasets...

Optimized max_length set to 256 based on URL length distribution

Initializing model and tokenizer...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Converting datasets to PyTorch format...
Created initial dataset with 5715 samples


Map (num_proc=4):   0%|          | 0/5715 [00:00<?, ? examples/s]

Completed tokenization
Dataset format set for training
Created initial dataset with 5715 samples


Map (num_proc=4):   0%|          | 0/5715 [00:00<?, ? examples/s]

Completed tokenization
Dataset format set for training
Created training dataset with 5715 samples
Created testing dataset with 5715 samples

Class weights for balanced training: tensor([1.0002, 0.9998])


In [3]:

model_path = './phishing_detection_model_new'
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
max_length = 128

print("Model loaded successfully from:", model_path)

Model loaded successfully from: ./phishing_detection_model_new


In [4]:
print("Setting up training configuration...")

def compute_metrics(pred):
    try:
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
        acc = accuracy_score(labels, preds)
        
        
        tn = ((preds == 0) & (labels == 0)).sum()
        tp = ((preds == 1) & (labels == 1)).sum()
        fn = ((preds == 0) & (labels == 1)).sum()
        fp = ((preds == 1) & (labels == 0)).sum()
        
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'specificity': specificity
        }
    except Exception as e:
        print(f"Error computing metrics: {str(e)}")
        return {'error': str(e)}


class AdaptiveTrainer(Trainer):
    def __init__(self, class_weights=None, max_grad_norm=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.max_grad_norm = max_grad_norm
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        try:
            
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to('cpu')
            
            outputs = model(**inputs)
            
           
            if return_outputs and not isinstance(outputs, dict):
                outputs = {'logits': outputs[0], 'hidden_states': outputs[1] if len(outputs) > 1 else None}
            
            if 'labels' in inputs:
                labels = inputs['labels']
                logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
                
                
                if self.class_weights is not None:
                    loss_fct = torch.nn.CrossEntropyLoss(
                        weight=self.class_weights,
                        label_smoothing=0.1  
                    )
                    loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
                else:
                    loss = outputs['loss'] if isinstance(outputs, dict) else outputs[1]
            else:
                loss = outputs['loss'] if isinstance(outputs, dict) else outputs[1]
            
            return (loss, outputs) if return_outputs else loss
            
        except Exception as e:
            print(f"Error in loss computation: {str(e)}")
            raise


os.makedirs('./results', exist_ok=True)
os.makedirs('./logs', exist_ok=True)


batch_size = 32  
grad_acc_steps = 2  
num_epochs = 4  


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,  
    gradient_accumulation_steps=grad_acc_steps,
    learning_rate=5e-5,  
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  
    save_total_limit=2,
    eval_steps=100,  
    save_steps=100, 
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    remove_unused_columns=True,
    dataloader_num_workers=4,  
    warmup_ratio=0.1,
    lr_scheduler_type='cosine_with_restarts',  
    report_to='none',  
    do_eval=True,  
    save_strategy='steps',  
    eval_strategy='steps'  
)


print("\nAdvanced Training Configuration:")
print(f"Total training samples: {len(train_df)}")
print(f"Total testing samples: {len(test_df)}")
print(f"Batch size: {batch_size}")
print(f"Gradient accumulation steps: {grad_acc_steps}")
print(f"Effective batch size: {batch_size * grad_acc_steps}")
print(f"Number of epochs: {num_epochs}")
print(f"Initial learning rate: {training_args.learning_rate}")
print(f"Learning rate scheduler: {training_args.lr_scheduler_type}")


total_steps = len(train_df) * num_epochs / (batch_size * grad_acc_steps)
print(f"Total training steps: {int(total_steps)}")

print("\nOptimizations enabled:")
print("- Adaptive learning rates with cosine restarts")
print("- Label smoothing")
print("- Increased worker threads")
print("- Regular evaluation and model selection")
print(f"- More frequent checkpoints (every {training_args.save_steps} steps)")
print("- Memory optimizations")

Setting up training configuration...

Advanced Training Configuration:
Total training samples: 5715
Total testing samples: 5715
Batch size: 32
Gradient accumulation steps: 2
Effective batch size: 64
Number of epochs: 4
Initial learning rate: 5e-05
Learning rate scheduler: SchedulerType.COSINE_WITH_RESTARTS
Total training steps: 357

Optimizations enabled:
- Adaptive learning rates with cosine restarts
- Label smoothing
- Increased worker threads
- Regular evaluation and model selection
- More frequent checkpoints (every 100 steps)
- Memory optimizations

Advanced Training Configuration:
Total training samples: 5715
Total testing samples: 5715
Batch size: 32
Gradient accumulation steps: 2
Effective batch size: 64
Number of epochs: 4
Initial learning rate: 5e-05
Learning rate scheduler: SchedulerType.COSINE_WITH_RESTARTS
Total training steps: 357

Optimizations enabled:
- Adaptive learning rates with cosine restarts
- Label smoothing
- Increased worker threads
- Regular evaluation and mo



In [None]:
print("Starting training process...")


print("\nDataset Information:")
print(f"Training samples: {len(train_df)}")
print(f"└── Phishing URLs: {len(train_df[train_df['label'] == 1])}")
print(f"└── Legitimate URLs: {len(train_df[train_df['label'] == 0])}")
print(f"\nTesting samples: {len(test_df)}")
print(f"└── Phishing URLs: {len(test_df[test_df['label'] == 1])}")
print(f"└── Legitimate URLs: {len(test_df[test_df['label'] == 0])}")


trainer = AdaptiveTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    max_grad_norm=1.0  
)


print("\nStarting training...")
train_result = trainer.train()


print("\nTraining Statistics:")
print(f"Total steps completed: {train_result.global_step}")
print(f"Total training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Training samples/second: {train_result.metrics['train_samples_per_second']:.2f}")
print(f"Final training loss: {train_result.training_loss:.4f}")


print("\nEvaluating model on test set...")
eval_result = trainer.evaluate()
print("\nTest Set Metrics:")
print(f"Accuracy: {eval_result['eval_accuracy']:.4f}")
print(f"F1 Score: {eval_result['eval_f1']:.4f}")
print(f"Precision: {eval_result['eval_precision']:.4f}")
print(f"Recall: {eval_result['eval_recall']:.4f}")
print(f"Specificity: {eval_result['eval_specificity']:.4f}")


output_dir = './phishing_detection_model_new'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"\nBest model saved to {output_dir}")


try:
    import matplotlib.pyplot as plt
    
    
    history = pd.DataFrame(trainer.state.log_history)
    
    
    plt.figure(figsize=(10, 5))
    plt.plot(history['step'], history['loss'], label='Training Loss')
    plt.title('Training Loss over Time')
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
    
    
    eval_history = history[history['eval_accuracy'].notna()]
    metrics = ['eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall']
    plt.figure(figsize=(12, 6))
    for metric in metrics:
        plt.plot(eval_history['step'], eval_history[metric], label=metric.replace('eval_', ''))
    plt.title('Evaluation Metrics over Time')
    plt.xlabel('Step')
    plt.ylabel('Score')
    plt.legend()
    plt.show()
    
except ImportError:
    print("Matplotlib not available for plotting learning curves")

Starting training process...

Dataset Information:
Training samples: 5715
└── Phishing URLs: 2858
└── Legitimate URLs: 2857

Testing samples: 5715
└── Phishing URLs: 2857
└── Legitimate URLs: 2858

Starting training...




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Specificity
100,0.2748,0.359939,0.918985,0.923306,0.876415,0.975499,0.862491
200,0.2675,0.322151,0.934383,0.931831,0.969365,0.897095,0.971659
300,0.2404,0.307311,0.947507,0.947183,0.952887,0.941547,0.953464





Training Statistics:
Total steps completed: 360
Total training time: 18719.51 seconds
Training samples/second: 1.22
Final training loss: 0.2685

Evaluating model on test set...





Test Set Metrics:
Accuracy: 0.9475
F1 Score: 0.9472
Precision: 0.9529
Recall: 0.9415
Specificity: 0.9535

Best model saved to ./phishing_detection_model_new
Matplotlib not available for plotting learning curves

Best model saved to ./phishing_detection_model_new
Matplotlib not available for plotting learning curves


In [5]:
def preprocess_url(url):
    """Preprocess URL for consistent format"""
    url = url.lower().strip()
    
    # Add http:// if no protocol is specified
    if not url.startswith(('http://', 'https://')):
        if url.startswith('www.'):
            url = 'http://' + url
        else:
            url = 'http://www.' + url.replace('www.', '')
    
    return url

def extract_url_features(url):
    """Extract features that might indicate phishing"""
    suspicious_patterns = [
        'login', 'signin', 'verify', 'secure', 'account',
        'update', 'confirm', 'banking', 'paypal', 'password'
    ]
    
    url_lower = url.lower()
    suspicious_count = sum(1 for pattern in suspicious_patterns if pattern in url_lower)
    has_suspicious_chars = '@' in url or '%' in url
    dots_count = url.count('.')
    
    return suspicious_count, has_suspicious_chars, dots_count

def predict_url(url, max_length=None):
    """Enhanced URL prediction with better detection"""
    if max_length is None:
        max_length = min(256, len(url) + 10)
    
    # Preprocess URL
    processed_url = preprocess_url(url)
    
    # Get additional features
    suspicious_count, has_suspicious_chars, dots_count = extract_url_features(processed_url)
    
    inputs = tokenizer(
        processed_url,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    
    with torch.no_grad():
        outputs = model(**inputs)
        
    predictions = torch.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(predictions).item()
    confidence = predictions[0][predicted_class].item()
    
    # Adjust confidence based on additional features
    if predicted_class == 1:  # If model predicts phishing
        if suspicious_count >= 2 or (has_suspicious_chars and dots_count > 2):
            confidence = min(confidence + 0.1, 1.0)
    else:  # If model predicts legitimate
        if suspicious_count >= 2 and dots_count > 3:
            confidence = max(confidence - 0.1, 0.0)
            predicted_class = 1  # Change to phishing if highly suspicious
    
    threshold = 0.8
    prediction = 'phishing' if predicted_class == 1 else 'legitimate'
    if confidence < threshold:
        confidence_label = "LOW"
    elif confidence < 0.9:
        confidence_label = "MEDIUM"
    else:
        confidence_label = "HIGH"
    
    return {
        'url': url,
        'processed_url': processed_url,
        'prediction': prediction,
        'confidence': f"{confidence:.2%}",
        'confidence_level': confidence_label,
        'sequence_length': max_length,
        'suspicious_patterns': suspicious_count,
        'suspicious_chars': has_suspicious_chars,
        'dots_count': dots_count
    }

# Test with various URL formats and suspicious patterns
test_urls = [
    "http://www.google.com",  # Legitimate
    "http://paypal-secure-login.com/verify",  # Phishing
    "www.facebook.com",  # Legitimate, no protocol
    "secure-banking.com.suspicious.net/login",  # Phishing
    "https://account-verify.paypal.com.suspicious.net",  # Phishing
    "http://mybank.com@phishing.com",  # Phishing with @ symbol
    "https://ku.ac.bd",  # Legitimate
    "http://verify-account.secure-login.com",  # Phishing
    "www.prothomalo.com",  # Legitimate news site
    "http://banking.update.security.mydomain.com"  # Phishing with multiple subdomains
]

print("Enhanced URL Testing:")
print("=" * 80)

results = []
for url in test_urls:
    result = predict_url(url)
    results.append(result)
    print(f"\nOriginal URL: {result['url']}")
    print(f"Processed URL: {result['processed_url']}")
    print(f"Prediction: {result['prediction']}")
    print(f"Confidence: {result['confidence']} ({result['confidence_level']})")
    print(f"Suspicious Patterns: {result['suspicious_patterns']}")
    print(f"Suspicious Characters: {'Yes' if result['suspicious_chars'] else 'No'}")
    print(f"Number of Dots: {result['dots_count']}")
    print("-" * 60)

predictions_summary = {
    'total': len(results),
    'phishing': len([r for r in results if r['prediction'] == 'phishing']),
    'legitimate': len([r for r in results if r['prediction'] == 'legitimate']),
    'high_confidence': len([r for r in results if r['confidence_level'] == 'HIGH']),
    'medium_confidence': len([r for r in results if r['confidence_level'] == 'MEDIUM']),
    'low_confidence': len([r for r in results if r['confidence_level'] == 'LOW'])
}

print("\nPrediction Statistics:")
print(f"Total URLs tested: {predictions_summary['total']}")
print(f"Phishing predictions: {predictions_summary['phishing']}")
print(f"Legitimate predictions: {predictions_summary['legitimate']}")
print(f"\nConfidence Levels:")
print(f"High confidence: {predictions_summary['high_confidence']}")
print(f"Medium confidence: {predictions_summary['medium_confidence']}")
print(f"Low confidence: {predictions_summary['low_confidence']}")

Enhanced URL Testing:

Original URL: http://www.google.com
Processed URL: http://www.google.com
Prediction: legitimate
Confidence: 95.83% (HIGH)
Suspicious Patterns: 0
Suspicious Characters: No
Number of Dots: 2
------------------------------------------------------------

Original URL: http://paypal-secure-login.com/verify
Processed URL: http://paypal-secure-login.com/verify
Prediction: phishing
Confidence: 100.00% (HIGH)
Suspicious Patterns: 4
Suspicious Characters: No
Number of Dots: 1
------------------------------------------------------------

Original URL: www.facebook.com
Processed URL: http://www.facebook.com
Prediction: legitimate
Confidence: 95.85% (HIGH)
Suspicious Patterns: 0
Suspicious Characters: No
Number of Dots: 2
------------------------------------------------------------

Original URL: secure-banking.com.suspicious.net/login
Processed URL: http://www.secure-banking.com.suspicious.net/login
Prediction: phishing
Confidence: 100.00% (HIGH)
Suspicious Patterns: 3
Suspi