In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configuration
MODEL_NAME = "distilbert-base-uncased"
TEXT_COLS = ['url', 'domain', 'path', 'subdomain', 'top_level_domain']
NUMERICAL_COLS = ['path_length', 'num_path_segments', 'query_params', 
                'num_query_params', 'has_https', 'has_fragment',
                'has_special_chars_in_path', 'has_port', 'port_number', 'is_ip_address']
MAX_LENGTH = 128
BATCH_SIZE = 16

In [3]:
class UrlDataset(Dataset):
    def __init__(self, texts, numericals, labels, tokenizer, max_length):
        # Process text features
        self.texts = [" ".join([str(t) for t in text_tuple]) 
                     for text_tuple in zip(*[texts[col] for col in TEXT_COLS])]
        
        # Process numerical features
        self.numericals = numericals
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        numerical = self.numericals[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            **{k: v.squeeze(0) for k, v in encoding.items()},
            "numerical_features": torch.tensor(numerical, dtype=torch.float),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [4]:
class CustomModel(torch.nn.Module):
    def __init__(self, model_name, num_numerical, num_labels):
        super().__init__()
        self.transformer = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.numerical_layer = torch.nn.Linear(num_numerical, 128)
        self.combined_layer = torch.nn.Linear(
            self.transformer.config.hidden_size + 128,
            num_labels
        )

    def forward(self, input_ids, attention_mask, numerical_features, labels=None):
        # Text features
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        text_features = outputs.hidden_states[-1][:, 0, :]  # CLS token
        
        # Numerical features
        numerical = torch.relu(self.numerical_layer(numerical_features))
        
        # Combine features
        combined = torch.cat([text_features, numerical], dim=1)
        logits = self.combined_layer(combined)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.combined_layer.out_features), 
                           labels.view(-1))
            
        return {"loss": loss, "logits": logits}

In [5]:
# Training function
def train_custom(model, train_loader, val_loader, optimizer, num_epochs=5, device='cuda'):
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)
    best_f1 = 0.0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        total_loss, total_correct = 0, 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerical_features = batch['numerical_features'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask, numerical_features, labels)
            loss = outputs['loss']
            logits = outputs['logits']
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            total_correct += (logits.argmax(dim=1) == labels).sum().item()
        
        avg_loss = total_loss / len(train_loader)
        accuracy = total_correct / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
        
        acc, f1, precision, recall = evaluate(model, val_loader, device)
        if f1 > best_f1:
            best_f1 = f1
            best_model_state = model.state_dict()
    
    if best_model_state:
        torch.save(best_model_state, "./Models/best_distil_bert_model.pth")
        print("Best model saved with F1 score: {:.4f}".format(best_f1))

# Evaluation function
def evaluate(model, test_loader, device):
    model.eval()
    total_correct, total_samples = 0, 0
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].unsqueeze(0).to(device)
            attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
            numerical_features = batch['numerical_features'].unsqueeze(0).to(device)
            labels = batch['labels'].unsqueeze(0).to(device)
            
            outputs = model(input_ids, attention_mask, numerical_features, labels)
            logits = outputs['logits']
            
            preds = logits.argmax(dim=1).cpu().numpy()
            labels = labels.cpu().numpy()
            
            all_preds.extend(preds)
            all_labels.extend(labels)
            
            total_correct += (preds == labels).sum()
            total_samples += labels.shape[0]
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print(f"Test - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    return accuracy, f1, precision, recall

# Main function
def train_model(df):
    le = LabelEncoder()
    labels = le.fit_transform(df['type'])
    num_labels = len(le.classes_)
    
    scaler = StandardScaler()
    numericals = scaler.fit_transform(df[NUMERICAL_COLS])
    
    train_idx, val_idx, test_idx = np.split(
        np.random.permutation(len(df)), [int(.7 * len(df)), int(.85 * len(df))]
    )
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = UrlDataset({col: df[col].iloc[train_idx] for col in TEXT_COLS}, numericals[train_idx], labels[train_idx], tokenizer, MAX_LENGTH)
    val_dataset = UrlDataset({col: df[col].iloc[val_idx] for col in TEXT_COLS}, numericals[val_idx], labels[val_idx], tokenizer, MAX_LENGTH)
    test_dataset = UrlDataset({col: df[col].iloc[test_idx] for col in TEXT_COLS}, numericals[test_idx], labels[test_idx], tokenizer, MAX_LENGTH)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    model = CustomModel(MODEL_NAME, len(NUMERICAL_COLS), num_labels)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    
    train_custom(model, train_loader, test_dataset, optimizer, num_epochs=5, device='cuda' if torch.cuda.is_available() else 'cpu')
    evaluate(model, test_dataset, device='cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
df = pd.read_csv('./Data/Balanced_dataset.csv')
train_model(df)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [03:24<00:00,  6.37it/s]


Epoch 1/5, Loss: 0.2272, Accuracy: 0.9197


100%|█████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:11<00:00, 125.02it/s]


Test - Accuracy: 0.9576, Precision: 0.9581, Recall: 0.9576, F1 Score: 0.9577


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [03:01<00:00,  7.19it/s]


Epoch 2/5, Loss: 0.0980, Accuracy: 0.9650


100%|█████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:12<00:00, 123.23it/s]


Test - Accuracy: 0.9616, Precision: 0.9627, Recall: 0.9616, F1 Score: 0.9618


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [03:23<00:00,  6.39it/s]


Epoch 3/5, Loss: 0.0539, Accuracy: 0.9815


100%|█████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:07<00:00, 132.03it/s]


Test - Accuracy: 0.9635, Precision: 0.9641, Recall: 0.9635, F1 Score: 0.9635


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [03:30<00:00,  6.20it/s]


Epoch 4/5, Loss: 0.0316, Accuracy: 0.9897


100%|█████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:05<00:00, 137.31it/s]


Test - Accuracy: 0.9672, Precision: 0.9673, Recall: 0.9672, F1 Score: 0.9672


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [03:33<00:00,  6.11it/s]


Epoch 5/5, Loss: 0.0167, Accuracy: 0.9942


100%|█████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:01<00:00, 145.57it/s]


Test - Accuracy: 0.9669, Precision: 0.9677, Recall: 0.9669, F1 Score: 0.9670
Best model saved with F1 score: 0.9672


100%|█████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:07<00:00, 131.66it/s]

Test - Accuracy: 0.9669, Precision: 0.9677, Recall: 0.9669, F1 Score: 0.9670



