# 04_bert_finetuning.ipynb

In [23]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [24]:
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

Load data

In [15]:
df = pd.read_csv("../data/cleaned_data.csv")

Train-test split

In [16]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42
)

In [17]:
# Ensure texts are strings and drop NaNs
train_texts = pd.Series(train_texts).astype(str).fillna("").tolist()
val_texts   = pd.Series(val_texts).astype(str).fillna("").tolist()


Tokenizer

In [18]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

Dataset class

In [40]:
class PhishingDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Convert labels to numpy array to avoid ValueError
        self.labels = torch.tensor(labels.to_numpy(), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
         return {
            key: val[idx] for key, val in self.encodings.items()
        } | {"labels": self.labels[idx]}


# create datasets
train_dataset = PhishingDataset(train_encodings, train_labels)
val_dataset = PhishingDataset(val_encodings, val_labels)


In [39]:
# The variables train_texts, val_texts, train_labels, and val_labels are already defined and processed in previous cells.
# No need to redefine or import anything here.
# If you want to check their types or reset index, you can do:

print(type(train_texts), type(val_texts), type(train_labels), type(val_labels))
print(len(train_texts), len(val_texts), len(train_labels), len(val_labels))

<class 'list'> <class 'list'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
65988 16498 65988 16498


In [33]:
train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt"
)

val_encodings = tokenizer(
    val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt"
)


DataLoader

In [28]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

Model + Optimizer

In [25]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BERT model with classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(set(df["label"]))   # number of unique labels in dataset
)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler for learning rate
num_training_steps = len(train_loader) * 3   # 3 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Loop

In [41]:
epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        # Ensure batch items are tensors and moved to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())


  0%|          | 0/4125 [00:00<?, ?it/s]



KeyError: 15399