### **Libraries used:**
* Pandas
* PyTorch
* NumPy
* re (regular expressions)
* scikit-learn
* Transformers (Hugging Face)
* Datasets (Hugging Face)


In [None]:
# Path of the CSV file read into the main DataFrame.
CSV_PATH = "IMDB Dataset.csv"
# Column holding the raw review text.
TEXT_COL = "review"
# Column holding the sentiment label that gets encoded for training.
LABEL_COL = "sentiment"
# Checkpoint identifier used to load both the tokenizer and the classifier.
MODEL_NAME = "distilbert-base-uncased"

import os
import pandas as pd
import re
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

import torch

# Read the full dataset into a DataFrame.
data_file = pd.read_csv(CSV_PATH)
# Print the column summary, then leave the first rows as the cell's output.
data_file.info()
data_file.head()

### **Preprocessing**
The text is stripped of HTML tags and embedded URLs so that it tokenizes cleanly. The sentiment labels are then encoded as integers for processing.
The data is split with scikit-learn's train_test_split function, holding out 25% of it as the test set.

In [None]:
class ReviewCleaner():
    """Normalise raw review text before it is tokenized.

    The cleaner is stateless: the compiled patterns live on the class and
    ``clean_text`` can be handed straight to ``Series.apply``.
    """

    # Compiled once instead of on every call; clean_text runs once per review.
    _HTML_TAG = re.compile(r'<.*?>')
    _URL = re.compile(r'http\S+')
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9\s]")
    _WHITESPACE = re.compile(r"\s+")

    def clean_text(self, text):
        """Return a lower-cased, tag-free, punctuation-free copy of ``text``.

        Args:
            text: The review to clean. Any object is accepted and is
                converted with ``str()`` first.

        Returns:
            The cleaned string: lower-cased, with HTML tags, ``http...``
            URLs and every character other than letters, digits and
            whitespace removed, and with whitespace runs collapsed to a
            single space and trimmed at both ends.
        """
        text = str(text).lower()
        # Replace tags with a space rather than nothing: markup such as
        # "<br />" usually separates words, and deleting it outright would
        # glue the neighbouring words together ("film.<br />loved" ->
        # "filmloved").
        text = self._HTML_TAG.sub(' ', text)
        text = self._URL.sub('', text)
        text = self._NON_ALNUM.sub('', text)
        # Collapse the extra spaces introduced by the removals above.
        return self._WHITESPACE.sub(' ', text).strip()

In [None]:
# Add a cleaned copy of every review next to the raw text.
cleaner = ReviewCleaner()
data_file['clean_reviews'] = data_file[TEXT_COL].apply(cleaner.clean_text)

# Encode the sentiment strings as integer class ids in a "labels" column.
label_encoder = preprocessing.LabelEncoder()
data_file["labels"] = label_encoder.fit_transform(data_file[LABEL_COL].tolist())

# A fixed seed makes the split (and every metric computed on it) reproducible
# across notebook runs; stratifying keeps the class balance identical in the
# training and test portions.
SPLIT_SEED = 42
train_data_file, test_data_file = train_test_split(
    data_file,
    test_size=0.25,
    random_state=SPLIT_SEED,
    stratify=data_file["labels"],
)

The Transformers library provides pretrained tokenizers for DistilBERT. We'll use the basic uncased (lowercasing) tokenizer here and build two tokenized datasets, one for training and one for testing, from the split made earlier.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_data(batch):
    """Tokenize the ``clean_reviews`` entries of one batch.

    Sequences are truncated to at most 256 tokens; no padding is applied
    here.
    """
    encoded = tokenizer(batch['clean_reviews'], truncation=True, max_length=256)
    return encoded


# Wrap both DataFrame splits as Hugging Face datasets (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data_file, test_data_file)
)

# Tokenize each split in batches.
tokenized_train, tokenized_test = (
    split.map(tokenize_data, batched=True) for split in (train_dataset, test_dataset)
)

# Collator handed to the Trainer to pad the tokenized examples.
collator = DataCollatorWithPadding(tokenizer=tokenizer)


### **Model Fine-Tuning**

We will continue to use the Transformers library for easier automatic functions and algorithms.

Weight decay is set to 0.01 (1%), and we will train for 3 epochs. The batch size for fine-tuning is limited to 16, and the finished model is written to its own output directory.

In [None]:
# Number of output classes requested from the classification model.
NUM_LABELS = 2
# Learning rate passed to TrainingArguments.
LR = 2e-5
# Weight-decay coefficient passed to TrainingArguments.
WEIGHT_DECAY = 0.01
# Number of training epochs.
EPOCHS = 3
# Presumably the gradient-clipping norm — confirm where it is meant to be used.
MAX_GRAD_NORM = 1.0
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 16
# Directory that receives the training output.
OUTPUT_DIR = "./distilbert-finetuned"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the output directory up front; an existing directory is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Load the pretrained checkpoint as a sequence classifier with NUM_LABELS
# output classes and move it to DEVICE.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS).to(DEVICE)

def compute_metrics(prediction):
    """Score one evaluation pass.

    Args:
        prediction: Object exposing ``predictions`` (per-class scores, one
            row per example) and ``label_ids`` (the reference class ids).

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``. The last three are weighted averages over the classes, with
        undefined ratios reported as 0.
    """
    y_true = prediction.label_ids
    # The predicted class is the column with the highest score in each row.
    y_pred = np.argmax(prediction.predictions, axis=1)

    weighted = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    # `weighted` is (precision, recall, f1, support); zip stops after the
    # three named entries, so the trailing support value is ignored.
    scores.update(zip(("precision", "recall", "f1"), weighted))
    return scores

Here we define how training will run: the batch sizes to use, and selecting the best model based on the F1 metric. Evaluation and checkpoint saving both happen after every epoch finishes training.

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the highest F1 when training finishes.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_strategy='epoch',
    eval_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # Pass the clipping threshold explicitly so that editing MAX_GRAD_NORM
    # actually changes training instead of being silently ignored.
    max_grad_norm=MAX_GRAD_NORM,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=100,
    # Mixed precision is only enabled when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# NOTE(review): the test split doubles as the evaluation set used to pick the
# best checkpoint, so the reported test metrics are slightly optimistic —
# consider carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Aggregate metrics on the evaluation split configured on the trainer.
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Per-class breakdown: run inference on the test split, take the
# highest-scoring class for every example and compare with the references.
predictions_output = trainer.predict(tokenized_test)
labels = predictions_output.label_ids
predictions = np.argmax(predictions_output.predictions, axis=1)

report = classification_report(labels, predictions, digits=4)
print(report)

### **GPT Comparison**

In [None]:
# Checkpoint identifier for the GPT-style comparison model.
GPT_MODEL_NAME = "distilgpt2"
# Number of output classes requested from the classification model.
NUM_LABELS = 2
# Output directory for this run; rebinds the OUTPUT_DIR used by the
# DistilBERT run above.
OUTPUT_DIR = "./gpt2-finetuned"
# Batch size intended for this comparison run.
BATCH = 8
# Learning rate and epoch count for this run.
LR = 2e-5
EPOCHS = 3
# Device name as a plain string ("cuda" or "cpu"); note the earlier cell
# stores a torch.device under the same name.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(GPT_MODEL_NAME)
# The GPT tokenizer may ship without a padding token; reuse the
# end-of-sequence token so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(GPT_MODEL_NAME, num_labels=NUM_LABELS)
model.resize_token_embeddings(len(tokenizer))
# The classification head locates the last non-padding token of each sequence
# through config.pad_token_id. Without it the model rejects any batch larger
# than one example, so mirror the tokenizer's padding token here.
model.config.pad_token_id = tokenizer.pad_token_id


def tokenize_gpt(batch):
    """Tokenize the ``clean_reviews`` entries of one batch for the GPT model.

    Sequences are truncated to at most 256 tokens and left unpadded; padding
    is applied per batch by the data collator.
    """
    return tokenizer(batch['clean_reviews'], truncation=True, padding=False, max_length=256)


gpt_train = train_dataset.map(tokenize_gpt, batched=True)
gpt_test  = test_dataset.map(tokenize_gpt, batched=True)

# Return torch tensors when the datasets are indexed.
gpt_train.set_format(type="torch")
gpt_test.set_format(type="torch")

In [None]:
# Collator that pads every batch with the GPT tokenizer's padding token.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Same schedule as the DistilBERT run: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the highest F1.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Same keyword as the DistilBERT TrainingArguments above; the older
    # `evaluation_strategy` spelling is not accepted by every library version.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    # Use the batch size configured for this section (BATCH), not the
    # BATCH_SIZE left over from the DistilBERT run.
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=200,
    # Mixed precision is only enabled when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    # remove_unused_columns is left at its default (True): the datasets still
    # carry the raw text columns, which the padding collator cannot turn into
    # tensors, so the Trainer must drop them before batching.
)


gpt_trainer = Trainer(
    # Pass the loaded model object; the checkpoint name string is not a model.
    model=model,
    args=training_args,
    train_dataset=gpt_train,
    eval_dataset=gpt_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

gpt_trainer.train()

# Aggregate metrics for the GPT model on its evaluation split.
gpt_eval = gpt_trainer.evaluate()
print("GPT eval:", gpt_eval)

# Per-class breakdown: predict on the test split and take the highest-scoring
# class for every example.
predictions_out = gpt_trainer.predict(gpt_test)
gpt_labels = predictions_out.label_ids
gpt_predictions = np.argmax(predictions_out.predictions, axis=1)

print("=== GPT2 classification report ===")
gpt_report = classification_report(gpt_labels, gpt_predictions, digits=4)
print(gpt_report)

# Print both evaluation dicts one after the other for a direct comparison.
for title, metrics in (("Distil eval dict:", eval_result), ("GPT eval dict:", gpt_eval)):
    print(title, metrics)

The fine-tuned DistilBERT model is marginally more accurate, by about 0.3%. The two models perform very nearly the same and train at very similar speeds.

### **Logistic Regression**

In [None]:
import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Encode the labels: fit on the training split only, then reuse the same
# mapping for the test split.
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(train_data_file[LABEL_COL].astype(str).tolist())
y_test  = label_encoder.transform(test_data_file[LABEL_COL].astype(str).tolist())

# The baseline is fed the raw review text; the vectorizer does its own
# lower-casing and accent stripping.
X_train_text = train_data_file[TEXT_COL].astype(str).tolist()
X_test_text  = test_data_file[TEXT_COL].astype(str).tolist()


# TF-IDF over unigrams and bigrams (capped at 50,000 features) feeding a
# logistic-regression classifier.
tfidf_clf = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=50_000, ngram_range=(1,2), strip_accents='unicode', lowercase=True)),
    # The saga solver shuffles the data, so pin the seed to make the reported
    # scores reproducible from run to run.
    ("clf", LogisticRegression(max_iter=2000, solver="saga", n_jobs=-1, class_weight=None, random_state=42))
])

t0 = time.time()
tfidf_clf.fit(X_train_text, y_train)
train_time = time.time() - t0
print(f"TFIDF+LogReg training time: {train_time:.1f}s")

# Predict & evaluate
t0 = time.time()
y_pred = tfidf_clf.predict(X_test_text)
infer_time = time.time() - t0
n_test = len(X_test_text)
# Guard the throughput figure: a coarse clock can report zero elapsed time.
samples_per_sec = n_test / infer_time if infer_time > 0 else float("inf")
print(f"Inference time on test set: {infer_time:.3f}s — samples/sec: {samples_per_sec:.2f}")

print("TFIDF logistic regression results:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, digits=4))
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=0)
print(f"Accuracy: {acc:.4f}  Precision: {prec:.4f}  Recall: {rec:.4f}  F1: {f1:.4f}")

Logistic regression scores within about 1% of the fine-tuned DistilBERT model, with a much faster calculation time than both the fine-tuned and base DistilBERT models.

In [None]:
# Hand-written probe sentences for spot-checking the sentiment models. Each
# entry carries an id, the input text, the expected label, descriptive
# metadata, complexity tags and a note on what the case is meant to exercise.
# NOTE(review): the word_count values do not appear to match the inputs (for
# example TC-001 contains six whitespace-separated words but lists 5) —
# confirm how the metadata was computed before relying on it.
[{
  "id": "TC-001",
  "input": "I did not enjoy the plot.",
  "expected_label": "negative",
  "metadata": {
    "word_count": 5,
    "sentence_count": 1,
    "avg_word_length": 4.0,
    "punctuation_count": 1
  },
  "complexity_tags": ["negation", "short", "syntactic_dependency"],
  "notes": "Tests negation handling: ‘not enjoy’ should flip polarity. Models that ignore syntactic scope may misclassify."
},
{
  "id": "TC-002",
  "input": "The cinematography was gorgeous and the performances were solid, but the pacing dragged in the middle and the ending felt rushed.",
  "expected_label": "negative",
  "metadata": {
    "word_count": 23,
    "sentence_count": 1,
    "avg_word_length": 5.0,
    "punctuation_count": 2
  },
  "complexity_tags": ["multi_clause", "contrast", "mixed_sentiment"],
  "notes": "Contains explicit positive and negative fragments joined by contrast. Good for testing aggregation strategies (how models combine clause-level sentiment)."
},
{
  "id": "TC-003",
  "input": "Although the concept held promise—laden with evocative moments and standout acting by the leads—the script never quite cohered; dialogue often slips into platitude and the third act collapses under its own ambitions.",
  "expected_label": "negative",
  "metadata": {
    "word_count": 36,
    "sentence_count": 1,
    "avg_word_length": 5.4444,
    "punctuation_count": 4
  },
  "complexity_tags": ["long_sentence", "parenthetical", "formal_vocabulary", "multi_clause"],
  "notes": "Long, punctuated sentence with high lexical complexity and mixed clause polarity. Tests long-range dependency handling and truncation effects."
}
]