In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configuration
MODEL_NAME = "google/electra-small-discriminator"
TEXT_COLS = ['url', 'domain', 'path', 'subdomain', 'top_level_domain']
NUMERICAL_COLS = ['path_length', 'num_path_segments', 'query_params', 
                'num_query_params', 'has_https', 'has_fragment',
                'has_special_chars_in_path', 'has_port', 'port_number', 'is_ip_address']
MAX_LENGTH = 128
BATCH_SIZE = 32

In [3]:
class UrlDataset(Dataset):
    def __init__(self, texts, numericals, labels, tokenizer, max_length):
        # Process text features
        self.texts = [" ".join([str(t) for t in text_tuple]) 
                     for text_tuple in zip(*[texts[col] for col in TEXT_COLS])]
        
        # Process numerical features
        self.numericals = numericals
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        numerical = self.numericals[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            **{k: v.squeeze(0) for k, v in encoding.items()},
            "numerical_features": torch.tensor(numerical, dtype=torch.float),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [4]:
class CustomModel(torch.nn.Module):
    def __init__(self, model_name, num_numerical, num_labels):
        super().__init__()
        self.transformer = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.numerical_layer = torch.nn.Linear(num_numerical, 128)
        self.combined_layer = torch.nn.Linear(
            self.transformer.config.hidden_size + 128,
            num_labels
        )

    def forward(self, input_ids, attention_mask, numerical_features, labels=None):
        # Text features
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        text_features = outputs.hidden_states[-1][:, 0, :]  # CLS token
        
        # Numerical features
        numerical = torch.relu(self.numerical_layer(numerical_features))
        
        # Combine features
        combined = torch.cat([text_features, numerical], dim=1)
        logits = self.combined_layer(combined)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.combined_layer.out_features), 
                           labels.view(-1))
            
        return {"loss": loss, "logits": logits}

In [5]:
# Evaluation function
def evaluate(model, test_loader, device):
    model.eval()
    total_correct, total_samples = 0, 0
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].unsqueeze(0).to(device)
            attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
            numerical_features = batch['numerical_features'].unsqueeze(0).to(device)
            labels = batch['labels'].unsqueeze(0).to(device)
            
            outputs = model(input_ids, attention_mask, numerical_features, labels)
            logits = outputs['logits']
            
            preds = logits.argmax(dim=1).cpu().numpy()
            labels = labels.cpu().numpy()
            
            all_preds.extend(preds)
            all_labels.extend(labels)
            
            total_correct += (preds == labels).sum()
            total_samples += labels.shape[0]
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print(f"Test - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    return accuracy, f1, precision, recall
    
def train_custom(model, train_loader, val_loader, optimizer, num_epochs=5, device='cuda'):
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)
    best_f1 = 0.0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        total_loss, total_correct = 0, 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerical_features = batch['numerical_features'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask, numerical_features, labels)
            loss = outputs['loss']
            logits = outputs['logits']
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            total_correct += (logits.argmax(dim=1) == labels).sum().item()
        
        avg_loss = total_loss / len(train_loader)
        accuracy = total_correct / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
        
        acc, f1, precision, recall = evaluate(model, val_loader, device)
        if f1 > best_f1:
            best_f1 = f1
            best_model_state = model.state_dict()
    
    if best_model_state:
        torch.save(best_model_state, "./Models/best_electra_model.pth")
        print("Best model saved with F1 score: {:.4f}".format(best_f1))

# Main function
def train_model(df):
    le = LabelEncoder()
    labels = le.fit_transform(df['type'])
    num_labels = len(le.classes_)
    
    scaler = StandardScaler()
    numericals = scaler.fit_transform(df[NUMERICAL_COLS])
    
    train_idx, val_idx, test_idx = np.split(
        np.random.permutation(len(df)), [int(.7 * len(df)), int(.85 * len(df))]
    )
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = UrlDataset({col: df[col].iloc[train_idx] for col in TEXT_COLS}, numericals[train_idx], labels[train_idx], tokenizer, MAX_LENGTH)
    val_dataset = UrlDataset({col: df[col].iloc[val_idx] for col in TEXT_COLS}, numericals[val_idx], labels[val_idx], tokenizer, MAX_LENGTH)
    test_dataset = UrlDataset({col: df[col].iloc[test_idx] for col in TEXT_COLS}, numericals[test_idx], labels[test_idx], tokenizer, MAX_LENGTH)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    model = CustomModel(MODEL_NAME, len(NUMERICAL_COLS), num_labels)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    
    train_custom(model, train_loader, test_dataset, optimizer, num_epochs=8, device='cuda' if torch.cuda.is_available() else 'cpu')
    evaluate(model, test_dataset, device='cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
df = pd.read_csv('./Data/Balanced_dataset.csv')
train_model(df)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:36<00:00, 13.51it/s]


Epoch 1/8, Loss: 0.3614, Accuracy: 0.8773


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [02:07<00:00, 70.05it/s]


Test - Accuracy: 0.9407, Precision: 0.9413, Recall: 0.9407, F1 Score: 0.9405


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:46<00:00, 12.22it/s]


Epoch 2/8, Loss: 0.1719, Accuracy: 0.9399


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [02:05<00:00, 71.12it/s]


Test - Accuracy: 0.9538, Precision: 0.9545, Recall: 0.9538, F1 Score: 0.9540


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:58<00:00, 10.97it/s]


Epoch 3/8, Loss: 0.1223, Accuracy: 0.9563


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [02:03<00:00, 72.69it/s]


Test - Accuracy: 0.9632, Precision: 0.9638, Recall: 0.9632, F1 Score: 0.9634


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [02:00<00:00, 10.78it/s]


Epoch 4/8, Loss: 0.0971, Accuracy: 0.9654


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:57<00:00, 76.21it/s]


Test - Accuracy: 0.9638, Precision: 0.9645, Recall: 0.9638, F1 Score: 0.9639


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:14<00:00, 17.61it/s]


Epoch 5/8, Loss: 0.0778, Accuracy: 0.9721


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:41<00:00, 88.47it/s]


Test - Accuracy: 0.9611, Precision: 0.9620, Recall: 0.9611, F1 Score: 0.9608


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:03<00:00, 20.55it/s]


Epoch 6/8, Loss: 0.0657, Accuracy: 0.9765


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:45<00:00, 85.10it/s]


Test - Accuracy: 0.9676, Precision: 0.9677, Recall: 0.9676, F1 Score: 0.9675


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:01<00:00, 21.32it/s]


Epoch 7/8, Loss: 0.0480, Accuracy: 0.9828


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:45<00:00, 84.67it/s]


Test - Accuracy: 0.9678, Precision: 0.9678, Recall: 0.9678, F1 Score: 0.9677


100%|██████████████████████████████████████████████████████████████████████████████| 1304/1304 [01:00<00:00, 21.62it/s]


Epoch 8/8, Loss: 0.0405, Accuracy: 0.9856


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:46<00:00, 84.18it/s]


Test - Accuracy: 0.9706, Precision: 0.9706, Recall: 0.9706, F1 Score: 0.9705
Best model saved with F1 score: 0.9705


100%|██████████████████████████████████████████████████████████████████████████████| 8941/8941 [01:46<00:00, 83.70it/s]

Test - Accuracy: 0.9706, Precision: 0.9706, Recall: 0.9706, F1 Score: 0.9705



