In [1]:
# ✅ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# ✅ Import libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import time

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ✅ Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ✅ Load phishing datasets
path = '/content/drive/MyDrive/dataset thesis/'
files = {
    "phishing_email": "phishing_email.csv",
    "enron": "Enron.csv",
    "ling": "Ling.csv",
    "nazario": "Nazario.csv",
    "nigerian_fraud": "Nigerian_Fraud.csv",
    "spamassassin": "SpamAssasin.csv",
    "ceas_08": "CEAS_08.csv"
}

data = []
for name, file in files.items():
    df = pd.read_csv(path + file)
    df['source'] = name
    data.append(df)

Using device: cuda


In [3]:
# ✅ Combine and clean
df_all = pd.concat(data, ignore_index=True)
df_all = df_all[['text_combined', 'label']]  # Adjust if needed
df_all.dropna(inplace=True)

# ✅ Encode labels
if df_all['label'].dtype == 'object':
    le = LabelEncoder()
    df_all['label'] = le.fit_transform(df_all['label'])

# ✅ Train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_all['text_combined'].tolist(), df_all['label'].tolist(), test_size=0.2, random_state=42
)

In [5]:
!pip install torchtext
# ✅ Tokenizer and vocab
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

Collecting torchtext
  Using cached torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Using cached torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: torchtext
Successfully installed torchtext-0.18.0




In [6]:
# ✅ Dataset class
MAX_LEN = 128

class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab

    def __getitem__(self, idx):
        tokens = tokenizer(self.texts[idx])
        token_ids = [self.vocab[token] for token in tokens][:MAX_LEN]
        if len(token_ids) < MAX_LEN:
            token_ids += [self.vocab["<pad>"]] * (MAX_LEN - len(token_ids))
        return torch.tensor(token_ids), torch.tensor(self.labels[idx])

    def __len__(self):
        return len(self.labels)

# ✅ Load datasets
train_dataset = TextDataset(train_texts, train_labels, vocab)
test_dataset = TextDataset(test_texts, test_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [7]:
# ✅ CNN model
class CNNTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_classes=2):
        super(CNNTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab["<pad>"])
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128 * (MAX_LEN // 2), num_classes)

    def forward(self, x):
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        x = x.permute(0, 2, 1)  # [batch_size, embed_dim, seq_len]
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.fc(x)

In [8]:
# ✅ Initialize model
model = CNNTextClassifier(vocab_size=len(vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# ✅ Training loop
print("Starting training...")
model.train()
for epoch in range(3):
    print(f"\nEpoch {epoch + 1}")
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f"Batch {i}/{len(train_loader)} - Loss: {loss.item():.4f}")

print("\n✅ Training complete.")

Starting training...

Epoch 1
Batch 0/2063 - Loss: 0.7059
Batch 10/2063 - Loss: 0.6153
Batch 20/2063 - Loss: 0.6473
Batch 30/2063 - Loss: 0.6020
Batch 40/2063 - Loss: 0.5224
Batch 50/2063 - Loss: 0.6358
Batch 60/2063 - Loss: 0.5008
Batch 70/2063 - Loss: 0.4552
Batch 80/2063 - Loss: 0.2763
Batch 90/2063 - Loss: 0.6753
Batch 100/2063 - Loss: 0.2836
Batch 110/2063 - Loss: 0.4700
Batch 120/2063 - Loss: 0.2780
Batch 130/2063 - Loss: 0.6869
Batch 140/2063 - Loss: 0.2729
Batch 150/2063 - Loss: 0.5833
Batch 160/2063 - Loss: 0.2671
Batch 170/2063 - Loss: 0.2130
Batch 180/2063 - Loss: 0.5099
Batch 190/2063 - Loss: 0.2907
Batch 200/2063 - Loss: 0.2347
Batch 210/2063 - Loss: 0.0887
Batch 220/2063 - Loss: 0.3539
Batch 230/2063 - Loss: 0.2421
Batch 240/2063 - Loss: 0.3170
Batch 250/2063 - Loss: 0.4277
Batch 260/2063 - Loss: 0.4793
Batch 270/2063 - Loss: 0.3779
Batch 280/2063 - Loss: 0.0627
Batch 290/2063 - Loss: 0.1931
Batch 300/2063 - Loss: 0.1873
Batch 310/2063 - Loss: 0.2551
Batch 320/2063 - Loss

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # 🔍 Metrics
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='weighted')
    rec = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print("\n📊 Evaluation Metrics:")
    print(f"Accuracy       : {acc:.4f}")
    print(f"Precision      : {prec:.4f}")
    print(f"Recall         : {rec:.4f}")
    print(f"F1 Score       : {f1:.4f}")
    print("\n🧾 Classification Report:")
    print(classification_report(all_labels, all_preds, digits=4))

# ✅ Run evaluation
evaluate_model(model, test_loader)


📊 Evaluation Metrics:
Accuracy       : 0.9821
Precision      : 0.9822
Recall         : 0.9821
F1 Score       : 0.9821

🧾 Classification Report:
              precision    recall  f1-score   support

           0     0.9886    0.9740    0.9813      7935
           1     0.9763    0.9896    0.9829      8563

    accuracy                         0.9821     16498
   macro avg     0.9824    0.9818    0.9821     16498
weighted avg     0.9822    0.9821    0.9821     16498

