### **Libraries used:**
* Pandas
* PyTorch
* NumPy
* re (regular expressions)
* scikit-learn
* Transformers (Hugging Face)
* Datasets (Hugging Face)


In [9]:
# Input data and model configuration read by the cells below.
CSV_PATH = "IMDB Dataset.csv"  # relative path handed to pd.read_csv
TEXT_COL = "review"  # column holding the raw review text
LABEL_COL = "sentiment"  # column holding the sentiment label strings
MODEL_NAME = "distilbert-base-uncased"  # checkpoint used for both the tokenizer and the classifier

import os
import pandas as pd
import re
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Read the whole CSV into a DataFrame.
data_file = pd.read_csv(CSV_PATH)
# Print the column dtypes and non-null counts.
data_file.info()
# Last expression of the cell: display the first rows as a sanity check.
data_file.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### **Preprocessing**
The review text is lowercased and stripped of HTML tags, embedded URLs, and punctuation so that it tokenizes cleanly. The sentiment labels are then encoded as integers.
The data is split with scikit-learn's `train_test_split`, holding out 25% of the reviews as the test set.

In [10]:
class ReviewCleaner():
    """Normalise raw review text before it is handed to a tokenizer.

    The cleaning steps, in order, are: lowercase, replace HTML tags with a
    space, drop URLs, drop every character that is not a letter, digit or
    whitespace, and collapse runs of whitespace into a single space.
    """

    # Compiled once at class creation; clean_text is applied to every row.
    _TAG_RE = re.compile(r'<.*?>')
    _URL_RE = re.compile(r'http\S+')
    _NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        pass

    def clean_text(self, text):
        """Return a cleaned, lowercased version of ``text``.

        Args:
            text: The raw review. Non-string values are converted with
                ``str``; ``None`` and float NaN (how pandas represents a
                missing cell) are treated as an empty review.

        Returns:
            The cleaned string, possibly empty.
        """
        # Missing values would otherwise become the literal words "none"/"nan".
        if text is None or (isinstance(text, float) and text != text):
            return ""

        text = str(text).lower()
        # Replace tags with a space, not the empty string: reviews contain
        # "word.<br /><br />Word", and deleting the tags outright would fuse
        # the neighbouring words once punctuation is stripped below.
        text = self._TAG_RE.sub(' ', text)
        text = self._URL_RE.sub('', text)
        text = self._NON_ALNUM_RE.sub("", text)
        # Collapse the extra spaces introduced above and trim the ends.
        return self._WHITESPACE_RE.sub(" ", text).strip()

In [11]:
# Fixed seed so the train/test partition is identical on every run.
SEED = 42

# Add a cleaned copy of the review text next to the raw column.
cleaner = ReviewCleaner()
data_file['clean_reviews'] = data_file[TEXT_COL].apply(cleaner.clean_text)

# Encode the sentiment strings as integer class ids in a "labels" column.
label_encoder = preprocessing.LabelEncoder()
data_file["labels"] = label_encoder.fit_transform(data_file[LABEL_COL].tolist())

# Hold out 25% for testing. The split is seeded for reproducibility and
# stratified so both partitions keep the same class balance.
train_data_file, test_data_file = train_test_split(
    data_file,
    test_size=0.25,
    random_state=SEED,
    stratify=data_file["labels"],
)

The Transformers library provides pretrained tokenizers for DistilBERT. We use the tokenizer that matches the uncased (lowercase) checkpoint and tokenize the training and test splits created above into two separate datasets.

In [12]:
# Maximum number of tokens kept per review; longer reviews are truncated.
MAX_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The split frames carry a shuffled, non-range index; preserve_index=False
# keeps it from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_data_file, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data_file, preserve_index=False)

def tokenize_data(batch):
    """Tokenize the ``clean_reviews`` field of a batch of examples.

    Args:
        batch: A batch from ``Dataset.map(..., batched=True)``; its
            ``'clean_reviews'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_LENGTH``
        tokens. No padding is applied here; the data collator defined
        below pads each batch.
    """
    return tokenizer(batch['clean_reviews'], truncation=True, max_length=MAX_LENGTH)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

# Pads each batch when it is collated.
collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 37500/37500 [00:05<00:00, 7281.82 examples/s]
Map: 100%|██████████| 12500/12500 [00:01<00:00, 6509.14 examples/s]


### **Model Fine-Tuning**

We continue to use the Transformers library: its `Trainer` API handles the training loop, evaluation, and checkpointing for us.

In [13]:
# Fine-tuning hyperparameters and output location for the DistilBERT run.
NUM_LABELS = 2  # passed as num_labels when the classification model is loaded
LR = 2e-5
WEIGHT_DECAY = 0.01
EPOCHS = 3
MAX_GRAD_NORM = 1.0  # presumably the gradient-clipping norm — verify where it is consumed
BATCH_SIZE = 16  # used for both the train and eval per-device batch sizes
OUTPUT_DIR = "./distilbert-finetuned"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [14]:
# Load the pretrained checkpoint with a classification head sized for
# NUM_LABELS classes, then move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
model = model.to(DEVICE)

def compute_metrics(prediction):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        prediction: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is
            taken as the predicted class).

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = prediction.label_ids
    y_pred = np.argmax(prediction.predictions, axis=1)

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    # The fourth element of the returned tuple is not needed here.
    prf = precision_recall_fscore_support(
        y_true,
        y_pred,
        average="weighted",
        zero_division=0,
    )
    scores.update(zip(("precision", "recall", "f1"), prf[:3]))
    return scores

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best F1 when training finishes.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_strategy='epoch',
    eval_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # Wire in the configured clipping norm instead of relying on the default.
    max_grad_norm=MAX_GRAD_NORM,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=100,
    # Mixed precision only makes sense when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# NOTE(review): eval_dataset is the held-out test split, and it also drives
# load_best_model_at_end, so the reported test metrics are selected on the
# test data. Consider carving a validation split out of the training data
# for checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run inference over the test split once. predict() returns the raw scores,
# the reference labels and the metrics from compute_metrics, so a separate
# trainer.evaluate() pass over the same data is not needed. The "eval"
# prefix keeps the metric keys named eval_*.
predictions_output = trainer.predict(tokenized_test, metric_key_prefix="eval")
eval_result = predictions_output.metrics
print("Evaluation results:", eval_result)

# Per-class breakdown; target_names maps the integer class ids back to the
# original sentiment strings so the report is readable.
predictions = np.argmax(predictions_output.predictions, axis=1)
labels = predictions_output.label_ids
print(classification_report(labels, predictions, target_names=label_encoder.classes_, digits=4))

### **GPT Comparison**

In [None]:
# Configuration for the GPT comparison run.
# NOTE(review): NUM_LABELS, OUTPUT_DIR, LR, EPOCHS and DEVICE rebind names that
# the DistilBERT configuration cell already set. Re-running earlier cells after
# this one will pick up these values instead.
GPT_MODEL_NAME = "distilgpt2"
NUM_LABELS = 2
OUTPUT_DIR = "./gpt2-finetuned"
BATCH = 8  # NOTE(review): the DistilBERT run uses a separate name, BATCH_SIZE
LR = 2e-5
EPOCHS = 3
# A plain string here, whereas the DistilBERT cell builds a torch.device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# NOTE: this cell rebinds `tokenizer`, `model` and `collator`, replacing the
# DistilBERT objects created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(GPT_MODEL_NAME)
# Reuse the end-of-sequence token for padding when the tokenizer defines no
# pad token, so that batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(GPT_MODEL_NAME, num_labels=NUM_LABELS)
model.resize_token_embeddings(len(tokenizer))
# The classification head needs the pad id on the model config to find the
# last non-padding token of each sequence. Setting it only on the tokenizer
# is not enough: without it, padded batches larger than 1 are rejected.
model.config.pad_token_id = tokenizer.pad_token_id
collator = DataCollatorWithPadding(tokenizer=tokenizer)