In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
print(torch.cuda.is_available())

True


## Loading Data

In [5]:
# Download latest version of data
path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")
print("Path to dataset files:", path)
data = pd.read_csv(path +"/phishing_email.csv")
data.head()

Path to dataset files: /s/bach/c/under/trevor04/.cache/kagglehub/datasets/naserabdullahalam/phishing-email-dataset/versions/1


Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [6]:
print("Shape of Data:", data.shape)
print("Number of phishing instances: ", sum(data['label'] == 1))
print("Number of legitimate instances: ", sum(data['label'] == 0))

print("\nAny Missing Values: ", pd.isna(data).any().any())

Shape of Data: (82486, 2)
Number of phishing instances:  42891
Number of legitimate instances:  39595

Any Missing Values:  False


## Splitting Data

In [7]:
train_and_val, test_df = train_test_split(data, test_size=0.2)
train_df, val_df = train_test_split(train_and_val, test_size=0.25)

## PyTorch Compatible EmailDataset Class

In [8]:
class EmailDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length = 128):
        self.dataset = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        text = self.dataset.iloc[idx]['text_combined']
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {'input_ids': encoding['input_ids'].squeeze(), 
                'attention_mask': encoding['attention_mask'].squeeze(),
                'labels': torch.tensor(self.dataset.iloc[idx]['label'])}

## Tokenization


In [9]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

train_dataset = EmailDataset(train_df, tokenizer)
val_dataset = EmailDataset(val_df, tokenizer)
test_dataset = EmailDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)
num_epochs = 3

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the Model

In [10]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    print("Warning: No GPU, I hope you have 15 days to spare")
    device = 'cpu'
    
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [11]:
scaler = GradScaler()
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with autocast():  # Mixed percision for memory saving
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
        
        scaler.scale(loss).backward()  
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    torch.cuda.empty_cache()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


  scaler = GradScaler()
  with autocast():  # Mixed percision for memory saving


Epoch 1, Loss: 0.0735292670539139
Epoch 2, Loss: 0.023474541683600036
Epoch 3, Loss: 0.016255785442261358


## Evaluate Model on Validation Set

In [12]:
model.eval()  
total_correct = 0
total_samples = 0

with torch.no_grad():  
    for batch in val_loader:  
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        

        _, preds = torch.max(outputs.logits, dim=1)  
        

        total_correct += (preds == batch["labels"]).sum().item()
        total_samples += batch["labels"].size(0)

accuracy = total_correct / total_samples
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9919


## Save Model Weights

In [13]:
torch.save(model.state_dict(), "roberta_sequence_classification_weights.pth")