In [1]:
# ✅ Step 1: Mount Google Drive
# Expose the user's Drive under /content/drive; the dataset folder read in a
# later cell lives below this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

# ✅ Step 2: Import libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ✅ Step 3: Device setup
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ✅ Step 4: Load phishing datasets
# Folder on the mounted Drive that contains the CSV files listed below.
path = '/content/drive/MyDrive/dataset thesis/'

# Short source tag -> CSV file name inside `path`.
files = dict(
    phishing_email="phishing_email.csv",
    enron="Enron.csv",
    ling="Ling.csv",
    nazario="Nazario.csv",
    nigerian_fraud="Nigerian_Fraud.csv",
    spamassassin="SpamAssasin.csv",
    ceas_08="CEAS_08.csv",
)

# One DataFrame per CSV, each tagged with the source tag it was loaded under.
data = []
for name, file in files.items():
    df = pd.read_csv(f"{path}{file}").assign(source=name)
    data.append(df)


Using device: cuda


In [3]:
# ✅ Step 5: Combine and preprocess
# Stack the per-source frames, keep only the text and label columns, and drop
# rows with a missing value in either column.
# The steps are chained instead of calling dropna(inplace=True) on the column
# selection: every step returns a fresh frame, so neither this cell nor a later
# column assignment on df_all can trigger pandas' SettingWithCopyWarning.
# NOTE(review): a CSV without a 'text_combined' column contributes only NaN in
# that column after the concat, so dropna() removes all of its rows - confirm
# that every source file really provides 'text_combined'.
df_all = (
    pd.concat(data, ignore_index=True)
    .loc[:, ['text_combined', 'label']]  # Adjust if needed
    .dropna()
)


In [4]:
# ✅ Step 6: Encode labels if not numeric
# Only an object-dtype label column is re-encoded; any other dtype is left as is.
label_dtype = df_all['label'].dtype
if label_dtype == 'object':
    from sklearn.preprocessing import LabelEncoder

    # `le` stays available afterwards so the fitted mapping can be inspected.
    le = LabelEncoder()
    encoded_labels = le.fit_transform(df_all['label'])
    df_all['label'] = encoded_labels


In [5]:
# ✅ Step 7: Train/test split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_all['text_combined'].to_list(),
    df_all['label'].to_list(),
    test_size=0.2,
    random_state=42,
)


In [6]:
# ✅ Step 8: Tokenizer
# Load the tokenizer of the 'distilbert-base-uncased' checkpoint, the same
# checkpoint name the model cell loads its encoder from.
tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased',
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
# ✅ Step 9: Optimized Dataset class (on-the-fly tokenization)
class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time it is indexed.

    Args:
        texts: Indexable, sized collection of input texts.
        labels: Indexable, sized collection of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors="pt")``; its
            result must expose ``.items()`` yielding tensors that carry a
            leading batch dimension of size 1.
        max_length: Length every sequence is truncated/padded to. Defaults to
            128, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore trailing texts.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return a dict of the tokenizer's tensors plus a ``'labels'`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The tokenizer returns tensors with a batch dimension of 1; drop it so
        # the DataLoader can stack individual samples into a batch itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Force an integer class-index tensor: a label column that ended up as
        # float (e.g. 1.0) would otherwise yield a float target, which
        # nn.CrossEntropyLoss does not accept as class indices.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of samples (texts and labels are validated to match)."""
        return len(self.labels)


In [8]:
# ✅ Step 10: Load datasets
# Wrap each split in a TextDataset, then batch it. Only the training loader
# shuffles; the test loader keeps the original sample order.
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = TextDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


In [9]:
# ✅ Step 11: Define model
class DistilBertClassifier(nn.Module):
    """Pretrained DistilBERT encoder followed by dropout and a linear head.

    Args:
        num_labels: Number of output classes. Defaults to 2.
        dropout: Dropout probability applied before the linear head.
            Defaults to 0.3.
        pretrained_name: Checkpoint name or path handed to
            ``DistilBertModel.from_pretrained``. Defaults to
            ``'distilbert-base-uncased'``.

    The defaults reproduce the previously hard-coded configuration, so
    ``DistilBertClassifier()`` builds the same model as before.
    """

    def __init__(self, num_labels=2, dropout=0.3, pretrained_name='distilbert-base-uncased'):
        super().__init__()
        print("Loading DistilBERT model...")
        self.bert = DistilBertModel.from_pretrained(pretrained_name)
        print("Model loaded.")
        self.dropout = nn.Dropout(dropout)
        # Head width follows the encoder's hidden size, so it stays correct
        # for checkpoints with a different hidden dimension.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores, one row per input sequence.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sequence summary
        # (presumably the [CLS] token added by the BERT-style tokenizer).
        first_token_state = outputs.last_hidden_state[:, 0]
        return self.classifier(self.dropout(first_token_state))


In [10]:
# ✅ Step 12: Initialize model
# Seed PyTorch's global RNG before anything random happens, so that the
# classifier-head initialisation, the dropout masks and the DataLoader shuffle
# order drawn afterwards repeat on a fresh Restart & Run All. 42 mirrors the
# random_state used for the train/test split.
# NOTE: some GPU kernels are still non-deterministic, so results may differ in
# the last digits between runs.
torch.manual_seed(42)

model = DistilBertClassifier().to(device)
# Adam with a small learning rate (2e-5) over all parameters, encoder included.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# Cross-entropy over the raw class scores returned by the model.
loss_fn = nn.CrossEntropyLoss()


Loading DistilBERT model...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model loaded.


In [11]:
# ✅ Step 13: Training loop
# Three passes over train_loader. Each batch runs forward, loss, backward and
# one optimizer step; a progress line is printed every tenth batch.
print("Starting training...")
start_time = time.time()
num_batches = len(train_loader)
model.train()  # switch on training behaviour (e.g. dropout)
for epoch in range(3):
    print(f"\nEpoch {epoch + 1}")
    for step, batch in enumerate(train_loader):
        # Move the three tensors this loop needs onto `device`.
        on_device = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        logits = model(on_device['input_ids'], on_device['attention_mask'])
        loss = loss_fn(logits, on_device['labels'])
        loss.backward()
        optimizer.step()

        # Only batches 0, 10, 20, ... report progress.
        if step % 10:
            continue
        print(f"Batch {step}/{num_batches} - Loss: {loss.item():.4f} - Time elapsed: {time.time() - start_time:.2f}s")

print("\n✅ Training complete.")


Starting training...

Epoch 1
Batch 0/4125 - Loss: 0.7005 - Time elapsed: 0.96s
Batch 10/4125 - Loss: 0.4843 - Time elapsed: 3.27s
Batch 20/4125 - Loss: 0.3461 - Time elapsed: 4.88s
Batch 30/4125 - Loss: 0.2297 - Time elapsed: 6.47s
Batch 40/4125 - Loss: 0.1011 - Time elapsed: 8.15s
Batch 50/4125 - Loss: 0.2863 - Time elapsed: 9.80s
Batch 60/4125 - Loss: 0.3201 - Time elapsed: 11.60s
Batch 70/4125 - Loss: 0.2251 - Time elapsed: 13.29s
Batch 80/4125 - Loss: 0.1038 - Time elapsed: 14.93s
Batch 90/4125 - Loss: 0.1851 - Time elapsed: 16.71s
Batch 100/4125 - Loss: 0.0172 - Time elapsed: 18.39s
Batch 110/4125 - Loss: 0.2772 - Time elapsed: 20.03s
Batch 120/4125 - Loss: 0.1146 - Time elapsed: 21.77s
Batch 130/4125 - Loss: 0.1789 - Time elapsed: 23.42s
Batch 140/4125 - Loss: 0.2722 - Time elapsed: 25.36s
Batch 150/4125 - Loss: 0.2531 - Time elapsed: 27.18s
Batch 160/4125 - Loss: 0.0351 - Time elapsed: 29.03s
Batch 170/4125 - Loss: 0.0274 - Time elapsed: 30.70s
Batch 180/4125 - Loss: 0.2389 - T

In [13]:


# ✅ Step 14: Evaluation with Time Measurement
def evaluate(model, loader):
    """Run ``model`` over ``loader``, then print a classification report and timing.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per sample.
        loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        None. Results are printed only.

    Side effects:
        Switches ``model`` to eval mode and leaves it there. Inputs are moved
        to the notebook-level ``device``.
    """
    model.eval()
    all_preds = []
    all_labels = []

    start_time = time.time()  # Capture start time

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().tolist())
            # Labels are only needed on the host for the report, so they are
            # read straight from the batch instead of being copied to the
            # device and back on every iteration.
            all_labels.extend(batch['labels'].tolist())

    elapsed_time = time.time() - start_time  # Calculate elapsed time

    # An empty loader leaves nothing to score; say so instead of failing
    # inside the report computation.
    if not all_labels:
        print("\nNo samples to evaluate.")
        return

    print("\n✅ Classification Report:")
    print(classification_report(all_labels, all_preds, digits=4))
    print(f"\nTotal testing time: {elapsed_time:.2f} seconds")  # Print elapsed time

# Score the trained model on the held-out test batches.
evaluate(model=model, loader=test_loader)



✅ Classification Report:
              precision    recall  f1-score   support

           0     0.9923    0.9931    0.9927      7935
           1     0.9936    0.9929    0.9932      8563

    accuracy                         0.9930     16498
   macro avg     0.9929    0.9930    0.9930     16498
weighted avg     0.9930    0.9930    0.9930     16498


Total testing time: 128.08 seconds
