In [1]:

%%writefile requirements.txt

numpy==1.26.4
scikit-learn==1.4.2
torch==2.2.2
transformers==4.41.2
datasets==2.18.0
tqdm
pandas


Overwriting requirements.txt


In [2]:
pip install -r requirements.txt



In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
from transformers import AutoTokenizer

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score , accuracy_score
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm
import numpy as np


############################################
# CONFIG
############################################

MODEL_NAME = "distilbert-base-uncased"
BATCH_SIZE = 32
EPOCHS = 3
LR = 2e-4
MAX_LEN = 128

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

############################################
# DATASET
############################################

class SentimentDataset(Dataset):

    def __init__(self, split="train"):
        dataset = load_dataset("imdb")[split]

        self.texts = dataset["text"]
        self.labels = dataset["label"]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }


def get_loader(split):
    return DataLoader(
        SentimentDataset(split),
        batch_size=BATCH_SIZE,
        shuffle=(split == "train"),
        num_workers=2,
        pin_memory=True
    )


############################################
# MODELS
############################################

class CNNClassifier(nn.Module):

    def __init__(self, vocab=30522, embed=128):
        super().__init__()

        self.embedding = nn.Embedding(vocab, embed)

        self.conv = nn.Conv1d(embed, 256, kernel_size=5)
        self.pool = nn.AdaptiveMaxPool1d(1)

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256, 2)

    def forward(self, ids):

        x = self.embedding(ids)      # (B,L,E)
        x = x.permute(0, 2, 1)       # (B,E,L)

        x = torch.relu(self.conv(x))
        x = self.pool(x).squeeze(-1)

        x = self.dropout(x)

        return self.fc(x)


class RNNClassifier(nn.Module):

    def __init__(self, vocab=30522, embed=128, hidden=256):
        super().__init__()

        self.embedding = nn.Embedding(vocab, embed)

        self.lstm = nn.LSTM(
            embed,
            hidden,
            batch_first=True,
            bidirectional=True
        )

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden * 2, 2)

    def forward(self, ids):

        x = self.embedding(ids)

        _, (hidden, _) = self.lstm(x)

        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)

        hidden = self.dropout(hidden)

        return self.fc(hidden)


############################################
# METRICS
############################################

def compute_metrics(logits, labels):
    probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    preds = torch.argmax(logits, dim=1).cpu().numpy()
    labels = labels.cpu().numpy()

    return {
        "Precision": precision_score(labels, preds),
        "Recall": recall_score(labels, preds),
        "F1": f1_score(labels, preds),
        "ROC-AUC": roc_auc_score(labels, probs),
        "Accuracy": accuracy_score(labels, preds)
    }


############################################
# TRAINING
############################################

def evaluate(model, loader):

    model.eval()

    all_logits = []
    all_labels = []

    with torch.no_grad():

        for batch in loader:

            ids = batch["input_ids"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            logits = model(ids)

            all_logits.append(logits)
            all_labels.append(labels)

    metrics = compute_metrics(
        torch.cat(all_logits),
        torch.cat(all_labels)
    )

    print("\nEvaluation Metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")


def train(model_type="cnn"):

    train_loader = get_loader("train")
    test_loader = get_loader("test")

    if model_type == "cnn":
        model = CNNClassifier().to(DEVICE)
    else:
        model = RNNClassifier().to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):

        model.train()
        loop = tqdm(train_loader)

        for batch in loop:

            ids = batch["input_ids"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            logits = model(ids)

            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_description(f"Epoch {epoch+1}")
            loop.set_postfix(loss=loss.item())

        evaluate(model, test_loader)

    torch.save(model.state_dict(), f"{model_type}_sentiment.pt")

    print("\nModel saved ✔")


############################################
# KMEANS (UNSUPERVISED SENTIMENT)
############################################

def run_kmeans():

    print("\nRunning KMeans clustering...")

    dataset = load_dataset("imdb")["train"]

    texts = dataset["text"][:5000]

    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words="english"
    )

    X = vectorizer.fit_transform(texts)

    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(X)

    unique, counts = np.unique(kmeans.labels_, return_counts=True)

    print("Cluster Distribution:")
    print(dict(zip(unique, counts)))


############################################
# MAIN
############################################

if __name__ == "__main__":

    print("Device:", DEVICE)

    # Choose model: "cnn" or "rnn"
    train(model_type="cnn")

    print('*' * 100)

    train(model_type="rnn")

    print('*' * 100)

    # Run unsupervised clustering
    run_kmeans()


Device: cuda


Epoch 1: 100%|██████████| 782/782 [00:35<00:00, 22.10it/s, loss=0.473]



Evaluation Metrics:
Precision: 0.6950
Recall: 0.7518
F1: 0.7223
ROC-AUC: 0.7904
Accuracy: 0.7109


Epoch 2: 100%|██████████| 782/782 [00:36<00:00, 21.57it/s, loss=0.384]



Evaluation Metrics:
Precision: 0.7117
Recall: 0.8232
F1: 0.7634
ROC-AUC: 0.8334
Accuracy: 0.7448


Epoch 3: 100%|██████████| 782/782 [00:35<00:00, 21.80it/s, loss=0.6]



Evaluation Metrics:
Precision: 0.7409
Recall: 0.8182
F1: 0.7776
ROC-AUC: 0.8541
Accuracy: 0.7660

Model saved ✔
****************************************************************************************************


Epoch 1: 100%|██████████| 782/782 [00:40<00:00, 19.29it/s, loss=0.683]



Evaluation Metrics:
Precision: 0.6581
Recall: 0.7370
F1: 0.6953
ROC-AUC: 0.7405
Accuracy: 0.6771


Epoch 2: 100%|██████████| 782/782 [00:39<00:00, 19.65it/s, loss=0.335]



Evaluation Metrics:
Precision: 0.7546
Recall: 0.6897
F1: 0.7207
ROC-AUC: 0.8157
Accuracy: 0.7327


Epoch 3: 100%|██████████| 782/782 [00:41<00:00, 18.81it/s, loss=0.254]



Evaluation Metrics:
Precision: 0.7368
Recall: 0.8129
F1: 0.7730
ROC-AUC: 0.8439
Accuracy: 0.7612

Model saved ✔
****************************************************************************************************

Running KMeans clustering...
Cluster Distribution:
{0: 3044, 1: 1956}
