In [None]:
# ✅ Step 1: Mount Google Drive
from google.colab import drive
# Colab-only: attach Google Drive at /content/drive. The dataset folder read later
# in this cell lives under /content/drive/MyDrive/, so this has to succeed first.
drive.mount('/content/drive')

# ✅ Step 2: Import libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import time

# ✅ Step 3: Device setup
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ✅ Step 4: Load phishing datasets
# Folder on the mounted Drive that holds every CSV read by this notebook.
path = '/content/drive/MyDrive/dataset thesis/'

# Short source tag -> CSV file name inside `path`.
files = dict(
    phishing_email="phishing_email.csv",
    enron="Enron.csv",
    ling="Ling.csv",
    nazario="Nazario.csv",
    nigerian_fraud="Nigerian_Fraud.csv",
    spamassassin="SpamAssasin.csv",
    ceas_08="CEAS_08.csv",
)

# One DataFrame per file, each tagged with the key it was loaded under.
data = [
    pd.read_csv(f"{path}{file}").assign(source=name)
    for name, file in files.items()
]

# ✅ Step 5: Combine and preprocess
# Stack every source into one frame. pd.concat aligns on column names, so a file
# that lacks 'text_combined' or 'label' contributes NaN in that column.
combined = pd.concat(data, ignore_index=True)

# Keep only rows that have both a text and a label. Filtering with `subset=` on the
# full frame (instead of dropna(inplace=True) on a column slice) avoids pandas'
# SettingWithCopyWarning and keeps the 'source' tag available for the report below.
usable = combined.dropna(subset=['text_combined', 'label'])

# Make the filtering visible: a source whose 'kept' count is 0 was dropped entirely,
# which usually means its CSV uses different column names.
if 'source' in combined.columns:
    loaded_per_source = combined['source'].value_counts()
    kept_per_source = (
        usable['source'].value_counts().reindex(loaded_per_source.index, fill_value=0)
    )
    print(pd.DataFrame({'loaded': loaded_per_source, 'kept': kept_per_source}))

df_all = usable[['text_combined', 'label']].reset_index(drop=True)  # Adjust if needed

# ✅ Step 6: Encode labels
# Any non-numeric label column (object, string, categorical) is mapped to 0..K-1.
# Integer-valued float labels (a side effect of NaN upcasting during concat) are
# cast back to int so they can be used as class indices.
labels = df_all['label']
if not pd.api.types.is_numeric_dtype(labels):
    le = LabelEncoder()
    df_all['label'] = le.fit_transform(labels)
elif pd.api.types.is_float_dtype(labels) and (labels % 1 == 0).all():
    df_all['label'] = labels.astype('int64')

# ✅ Step 7: Train/test split
# 80/20 split with a fixed random_state so the partition is the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_all['text_combined'].tolist(), df_all['label'].tolist(), test_size=0.2, random_state=42
)

# ✅ Step 8: Tokenization using torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# torchtext's built-in "basic_english" tokenizer, shared by vocab building and the dataset.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily yield the token list of each text in `data_iter`, in order."""
    yield from map(tokenizer, data_iter)


# The vocabulary is fitted on the training texts only; the two special tokens are
# listed first, and any token missing from the vocab resolves to "<unk>".
vocab = build_vocab_from_iterator(
    yield_tokens(train_texts),
    specials=["<pad>", "<unk>"],
)
vocab.set_default_index(vocab["<unk>"])

# ✅ Step 9: TextDataset for CNN
MAX_LEN = 128  # default number of token ids per example (texts are truncated / right-padded)


class TextDataset(Dataset):
    """Map-style dataset turning raw texts into fixed-length token-id tensors.

    Args:
        texts: sequence of strings.
        labels: sequence of class labels, parallel to `texts`.
        vocab: mapping from token to integer id; must contain "<pad>".
        tokenize: callable turning a string into a sequence of tokens. Defaults to
            the module-level `tokenizer`, resolved when the dataset is constructed.
        max_len: number of ids in every returned example. Defaults to `MAX_LEN`.

    Each item is a pair `(token_ids, label)` of int64 tensors, where `token_ids`
    has shape `(max_len,)`.
    """

    def __init__(self, texts, labels, vocab, tokenize=None, max_len=None):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        # Fall back to the notebook-level defaults only when nothing was passed, so
        # existing `TextDataset(texts, labels, vocab)` calls behave as before.
        self.tokenize = tokenizer if tokenize is None else tokenize
        self.max_len = MAX_LEN if max_len is None else max_len
        # Look the padding id up once instead of once per item.
        self.pad_id = vocab["<pad>"]

    def __getitem__(self, idx):
        # Truncate BEFORE the vocab lookup so long texts do not pay for ids that
        # would be thrown away immediately.
        tokens = list(self.tokenize(self.texts[idx]))[: self.max_len]
        token_ids = [self.vocab[token] for token in tokens]
        # Right-pad short (or empty) texts up to the fixed length.
        token_ids.extend([self.pad_id] * (self.max_len - len(token_ids)))
        # Explicit int64: embedding indices and CrossEntropyLoss class targets
        # both require long tensors.
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

    def __len__(self):
        return len(self.labels)

# ✅ Step 10: Load datasets
BATCH_SIZE = 32  # examples per batch for both loaders

train_dataset = TextDataset(texts=train_texts, labels=train_labels, vocab=vocab)
test_dataset = TextDataset(texts=test_texts, labels=test_labels, vocab=vocab)

# Only the training batches are reshuffled each epoch; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ✅ Step 11: Define CNN model
class CNNTextClassifier(nn.Module):
    """Single-layer 1-D CNN over token embeddings for text classification.

    Pipeline: embedding -> Conv1d(128 filters, kernel 5, padding 2) -> ReLU ->
    MaxPool1d(2) -> flatten -> dropout(0.3) -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width (also the Conv1d input channel count).
        num_classes: number of output logits.
        pad_idx: id of the padding token (its embedding receives no gradient).
            Defaults to the module-level `vocab["<pad>"]`.
        max_len: fixed sequence length of every input; it determines the size of
            the final linear layer. Defaults to the module-level `MAX_LEN`.
    """

    def __init__(self, vocab_size, embed_dim=128, num_classes=2, pad_idx=None, max_len=None):
        super().__init__()
        # Fall back to the notebook-level globals only when the caller did not pass
        # explicit values, so `CNNTextClassifier(vocab_size=...)` works as before.
        if pad_idx is None:
            pad_idx = vocab["<pad>"]
        if max_len is None:
            max_len = MAX_LEN
        self.max_len = max_len

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        # kernel_size=5 with padding=2 keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        # Halves the sequence length (floor division for odd lengths).
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(0.3)
        # 128 conv channels x pooled length, flattened.
        self.fc = nn.Linear(128 * (max_len // 2), num_classes)

    def forward(self, x):
        """Return logits of shape [batch_size, num_classes] for token ids `x`.

        Raises:
            ValueError: if `x` is not a [batch_size, max_len] tensor; the linear
                layer is sized for exactly `max_len` tokens.
        """
        if x.dim() != 2 or x.size(1) != self.max_len:
            raise ValueError(
                f"expected input of shape [batch_size, {self.max_len}], got {tuple(x.shape)}"
            )
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        x = x.permute(0, 2, 1)  # [batch_size, embed_dim, seq_len] (Conv1d wants channels first)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)  # [batch_size, 128, seq_len // 2]
        x = x.flatten(start_dim=1)
        x = self.dropout(x)
        return self.fc(x)

# ✅ Step 12: Initialize model
# Seed torch's global RNG before anything random happens: it drives the weight
# initialisation below, dropout, and the shuffle order the training DataLoader
# draws each epoch. Same value as the train/test split's random_state. (Some CUDA
# kernels are still nondeterministic, so GPU runs may differ slightly.)
torch.manual_seed(42)

model = CNNTextClassifier(vocab_size=len(vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Expects raw logits and integer class targets.
loss_fn = nn.CrossEntropyLoss()

# ✅ Step 13: Training loop
print("Starting training...")

NUM_EPOCHS = 3
# Batches between progress lines. Logging every 10 batches printed several hundred
# lines per epoch and buried the rest of the cell output.
LOG_EVERY = 100

num_batches = len(train_loader)
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}")
    # Re-assert training mode each epoch so dropout stays active even if an
    # evaluation pass switched the model to eval mode in between.
    model.train()
    epoch_start = time.time()
    running_loss = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        if i % LOG_EVERY == 0:
            print(f"Batch {i}/{num_batches} - Loss: {batch_loss:.4f}")

    # Single-batch losses are noisy; the epoch mean is the number to compare.
    mean_loss = running_loss / max(num_batches, 1)
    elapsed = time.time() - epoch_start
    print(f"Epoch {epoch + 1} finished in {elapsed:.1f}s - mean loss: {mean_loss:.4f}")

print("\n✅ Training complete.")

# ✅ Step 14: Evaluation
def evaluate(model, loader):
    """Score `model` on every batch of `loader` and print a classification report.

    The model is switched to eval mode (and left in it) and gradients are
    disabled. The predicted class is the argmax over dim 1 of the model output.
    Nothing is returned; the report is printed with 4 decimal digits.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)
            # Bring results back to the CPU before handing them to scikit-learn.
            predicted += list(logits.argmax(dim=1).cpu().numpy())
            expected += list(batch_labels.cpu().numpy())

    print("\n✅ Classification Report:")
    print(classification_report(expected, predicted, digits=4))

# Print the classification report for the trained model on the held-out test split
# (the 20% of rows set aside by train_test_split above).
evaluate(model, test_loader)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cpu




Starting training...

Epoch 1
Batch 0/2063 - Loss: 0.7178
Batch 10/2063 - Loss: 0.7244
Batch 20/2063 - Loss: 0.5892
Batch 30/2063 - Loss: 0.5856
Batch 40/2063 - Loss: 0.5453
Batch 50/2063 - Loss: 0.6374
Batch 60/2063 - Loss: 0.6170
Batch 70/2063 - Loss: 0.4813
Batch 80/2063 - Loss: 0.5488
Batch 90/2063 - Loss: 0.3698
Batch 100/2063 - Loss: 0.4280
Batch 110/2063 - Loss: 0.3413
Batch 120/2063 - Loss: 0.2979
Batch 130/2063 - Loss: 0.3827
Batch 140/2063 - Loss: 0.2626
Batch 150/2063 - Loss: 0.2049
Batch 160/2063 - Loss: 0.1481
Batch 170/2063 - Loss: 0.2747
Batch 180/2063 - Loss: 0.1786
Batch 190/2063 - Loss: 0.3802
Batch 200/2063 - Loss: 0.3071
Batch 210/2063 - Loss: 0.2079
Batch 220/2063 - Loss: 0.1725
Batch 230/2063 - Loss: 0.2600
Batch 240/2063 - Loss: 0.2516
Batch 250/2063 - Loss: 0.1569
Batch 260/2063 - Loss: 0.2740
Batch 270/2063 - Loss: 0.0492
Batch 280/2063 - Loss: 0.2357
Batch 290/2063 - Loss: 0.2489
Batch 300/2063 - Loss: 0.1869
Batch 310/2063 - Loss: 0.6217
Batch 320/2063 - Loss

In [None]:
!pip install torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [None]:
!pip install torch==2.3.0 torchtext==0.18.0

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.3.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylin