### **Libraries used:**
* Pandas
* PyTorch
* NumPy
* re (regular expressions)
* scikit-learn
* Transformers (HuggingFace)
* Datasets (HuggingFace)


In [None]:
# Location of the reviews CSV and the names of the columns read from it.
CSV_PATH = "IMDB Dataset.csv"
TEXT_COL = "review"  # column holding the raw review text
LABEL_COL = "sentiment"  # column holding the string sentiment label

import os
import pandas as pd
import re
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import (
    DistilBertForSequenceClassification,
    DistilBertConfig,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
)

# Load the whole CSV into a DataFrame, then print its schema and preview the
# first rows (head() is the last expression of the cell, so it is displayed).
data_file = pd.read_csv(CSV_PATH)
data_file.info()
data_file.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### **Preprocessing**
The text is lowercased and stripped of HTML tags, embedded URLs, and punctuation so that it tokenizes cleanly. The string labels are then encoded as integers.
The data is split with scikit-learn's `train_test_split` function, holding out 25% of the reviews for testing.

In [25]:
class ReviewCleaner():
    """Normalize raw review text before it is tokenized.

    The cleaner lowercases the text, removes HTML tags and URLs, drops every
    character that is not an ASCII letter, digit, or whitespace, and collapses
    runs of whitespace into single spaces.
    """

    # Compiled once at class-definition time so that applying the cleaner to
    # every row of a DataFrame does not repeat the pattern lookup.
    _HTML_TAG = re.compile(r'<.*?>')
    _URL = re.compile(r'http\S+')
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9\s]")
    _WHITESPACE = re.compile(r"\s+")

    def clean_text(self, text):
        """Return a cleaned, lowercased version of ``text``.

        Args:
            text: The raw review. Non-string values are converted with
                ``str()`` first.

        Returns:
            The cleaned string, with no leading or trailing whitespace.
        """
        text = str(text).lower()
        # Tags and URLs are replaced with a space rather than removed outright:
        # reviews frequently contain "end.<br /><br />Start", and deleting the
        # tags would fuse the neighbouring words once punctuation is stripped.
        text = self._HTML_TAG.sub(' ', text)
        text = self._URL.sub(' ', text)
        text = self._NON_ALNUM.sub('', text)
        # Collapse the extra spaces introduced above (and any in the input).
        return self._WHITESPACE.sub(' ', text).strip()

In [26]:
# Add a cleaned copy of every review next to the raw text.
cleaner = ReviewCleaner()
data_file['clean_reviews'] = data_file[TEXT_COL].apply(cleaner.clean_text)

# Encode the string sentiment labels as integers. LabelEncoder assigns ids in
# sorted class order; label_encoder.classes_ holds the mapping.
label_encoder = preprocessing.LabelEncoder()
data_file['labels'] = label_encoder.fit_transform(data_file[LABEL_COL])

# Hold out 25% of the rows for testing. The fixed seed makes the split (and
# therefore every reported metric) reproducible across runs, and stratifying
# on the label keeps the class balance identical in both partitions.
train_data_file, test_data_file = train_test_split(
    data_file,
    test_size=0.25,
    random_state=42,
    stratify=data_file['labels'],
)

The Transformers library ships several pretrained tokenizers for DistilBERT. We'll use the uncased (lowercase) one here and build two tokenized datasets, one for training and one for testing, from the split made earlier.

In [27]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_dataset = Dataset.from_pandas(train_data_file)
test_dataset = Dataset.from_pandas(test_data_file)


def tokenize_data(i):
    """Tokenize a batch of cleaned reviews, truncating over-long sequences.

    Args:
        i: A batch from ``Dataset.map(..., batched=True)``; its
            ``"clean_reviews"`` entry is the list of texts to tokenize.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(i["clean_reviews"], truncation=True)


def _non_model_columns(dataset):
    """Return every column of ``dataset`` except the integer ``labels``."""
    return [name for name in dataset.column_names if name != "labels"]


# Drop the raw string columns (and the pandas index column) while mapping.
# These datasets are fed straight to a DataLoader with DataCollatorWithPadding,
# which cannot turn strings into tensors, and the model would reject the extra
# keys anyway. Only the tokenizer outputs and `labels` are kept.
tokenized_train = train_dataset.map(
    tokenize_data,
    batched=True,
    remove_columns=_non_model_columns(train_dataset),
)
tokenized_test = test_dataset.map(
    tokenize_data,
    batched=True,
    remove_columns=_non_model_columns(test_dataset),
)

# Pads each batch to the length of its longest sequence.
collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/37500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

### **Model Fine-Tuning**

We continue to use the Transformers library for the pretrained model, the learning-rate scheduler, and the padding collator; the training loop itself is written in plain PyTorch.

In [28]:
# Hyperparameters and run configuration for the DistilBERT fine-tuning cells.
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 2
BATCH_SIZE = 16
LR = 2e-5
WEIGHT_DECAY = 0.01
EPOCHS = 3
MAX_GRAD_NORM = 1.0  # gradient-clipping threshold used in the training loop
WARMUP_STEPS = 0  # passed to the linear schedule as num_warmup_steps
OUTPUT_DIR = "./distilbert-finetuned"  # where the best checkpoint is saved
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batches are padded per batch by `collator`; only the training loader shuffles.
train_loader = DataLoader(tokenized_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)
test_loader = DataLoader(tokenized_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator)

In [None]:
# Load pretrained DistilBERT with a sequence-classification head sized for
# NUM_LABELS classes, and move it to the selected device.
config = DistilBertConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.to(DEVICE)

# Biases and layer-norm weights are excluded from weight decay. DistilBERT names
# its embedding norm "LayerNorm" but its per-block norms "sa_layer_norm" and
# "output_layer_norm", so both spellings are needed to catch all of them.
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
decay_params = []
no_decay_params = []
for n, p in model.named_parameters():
    if any(nd in n for nd in no_decay):
        no_decay_params.append(p)
    else:
        decay_params.append(p)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": WEIGHT_DECAY},
    {"params": no_decay_params, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR)

# The schedule must span the whole run: one scheduler step is taken per
# optimizer step, i.e. EPOCHS * len(train_loader) in total. A shorter horizon
# would drive the learning rate to zero long before training finishes.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=EPOCHS * len(train_loader),
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and score it.

    Args:
        model: Model whose output exposes ``.loss`` and ``.logits`` when called
            with a batch that includes ``"labels"``.
        dataloader: Loader yielding dict batches of tensors; its ``dataset``
            length is used as the denominator of the average loss.

    Returns:
        A dict with ``"loss"`` (per-example average), ``"accuracy"`` and
        ``"f1"`` (weighted when ``NUM_LABELS > 2``, binary otherwise).
    """
    model.eval()
    all_preds, all_labels = [], []
    weighted_loss_sum = 0.0

    with torch.no_grad():
        for batch in dataloader:
            batch = {key: tensor.to(DEVICE) for key, tensor in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            # The batch loss is a mean, so weight it by the batch size to keep
            # a smaller final batch from being over-represented.
            weighted_loss_sum += outputs.loss.item() * logits.size(0)
            all_preds.extend(logits.argmax(dim=-1).cpu().numpy().tolist())
            all_labels.extend(batch["labels"].cpu().numpy().tolist())

    f1_average = "weighted" if NUM_LABELS > 2 else "binary"
    return {
        "loss": weighted_loss_sum / len(dataloader.dataset),
        "accuracy": accuracy_score(all_labels, all_preds),
        "f1": f1_score(all_labels, all_preds, average=f1_average),
    }

In [None]:
# Score the model on the test set before any fine-tuning step has run, as a
# baseline for the numbers reported after training. This is not an epoch, so
# it is labelled as a baseline rather than with an epoch counter.
test_metrics = evaluate(model, test_loader)
print(f"Baseline (before fine-tuning) test_loss={test_metrics['loss']:.4f} acc={test_metrics['accuracy']:.4f} f1={test_metrics['f1']:.4f}")

In [None]:
# Fine-tune for EPOCHS passes over the training data, reporting metrics after
# each epoch and keeping the checkpoint with the best held-out F1.
best_val_f1 = -1.0
global_step = 0
total_steps = EPOCHS * len(train_loader)
print(f"Training on device {DEVICE}. Total steps = {total_steps}")

# A dedicated loop variable is used so the EPOCHS constant is not overwritten
# (which would corrupt the "epoch i/N" report and any later use of EPOCHS).
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Clip before stepping, then advance the LR schedule once per
        # optimizer step and clear the gradients for the next batch.
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        global_step += 1

        if global_step % 100 == 0:
            print(f"Step {global_step}, loss {loss.item():.4f}")

    # Mean of the per-batch losses observed while the weights were updating.
    avg_epoch_loss = epoch_loss / len(train_loader)
    train_metrics = evaluate(model, train_loader)
    val_metrics = evaluate(model, test_loader)
    print(
        f"Epoch {epoch+1}/{EPOCHS} running_loss={avg_epoch_loss:.4f} "
        f"train_loss={train_metrics['loss']:.4f} acc={train_metrics['accuracy']:.4f} f1={train_metrics['f1']:.4f}"
    )
    print(
        f"Epoch {epoch+1}/{EPOCHS} "
        f"val_loss={val_metrics['loss']:.4f} acc={val_metrics['accuracy']:.4f} f1={val_metrics['f1']:.4f}"
    )

    # Select the checkpoint on held-out data: training-set F1 keeps rising as
    # the model overfits, so it cannot tell a good epoch from a memorized one.
    # NOTE(review): test_loader is the only held-out split in this notebook;
    # carving a separate validation split out of the training data would keep
    # the final test score unbiased.
    if val_metrics["f1"] > best_val_f1:
        best_val_f1 = val_metrics["f1"]
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        print(f"Saved best model to {OUTPUT_DIR} (f1={best_val_f1:.4f})")

### **GPT Comparison**

In [None]:
# Configuration for the GPT comparison run. NUM_LABELS, OUTPUT_DIR, LR, EPOCHS
# and DEVICE rebind names already set for the DistilBERT run above.
GPT_MODEL_NAME = "distilgpt2"
NUM_LABELS = 2
OUTPUT_DIR = "./gpt2-finetuned"
BATCH = 8
LR = 2e-5
EPOCHS = 3
# Unlike the earlier cell, DEVICE here is a plain string, not a torch.device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# GPT-2 style tokenizers ship without a padding token; reuse end-of-sequence
# so that batches can be padded.
tokenizer = AutoTokenizer.from_pretrained(GPT_MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(GPT_MODEL_NAME, num_labels=NUM_LABELS)
model.resize_token_embeddings(len(tokenizer))
# The classification head reads the last non-padding token of each sequence,
# so the model config must know the pad id. Without it, GPT-2 sequence
# classification refuses any padded batch with more than one example.
model.config.pad_token_id = tokenizer.pad_token_id
collator = DataCollatorWithPadding(tokenizer=tokenizer)