In [2]:
import sys; print(sys.version)  # sanity check you're on 3.12.x

%pip install --upgrade pip
%pip install numpy pandas scikit-learn tqdm transformers
%pip install torch torchvision torchaudio

3.13.7 (v3.13.7:bcee1c32211, Aug 14 2025, 19:10:51) [Clang 16.0.0 (clang-1600.0.26.6)]
Note: you may need to restart the kernel to use updated packages.
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting transformers
  Using cached transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.35.2-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.9.18-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers

In [3]:
!pip install -q torch transformers scikit-learn tqdm

import os, re, time, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# --- Configuration ---------------------------------------------------------
DATA_PATH = "data/mca_econsult_100k.csv"   # your training CSV
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN, BATCH_SIZE, EPOCHS, LR = 160, 16, 2, 2e-5
SEED = 42

# Seed torch's RNG so DataLoader shuffling and the freshly initialised
# classifier head are repeatable across Restart & Run All.
torch.manual_seed(SEED)

# Device preference: Apple MPS first, then CUDA, otherwise CPU.
device = torch.device("mps" if torch.backends.mps.is_available()
                      else ("cuda" if torch.cuda.is_available() else "cpu"))
print("Device:", device)

# --- Load data and build the label vocabulary -------------------------------
df = pd.read_csv(DATA_PATH)
df["comment_text"] = df["comment_text"].astype(str)
labels = sorted(df["gold_label"].unique().tolist())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}
df["label"] = df["gold_label"].map(label2id)

# --- Stratified 80/20 train/validation split --------------------------------
idx_train, idx_val = train_test_split(np.arange(len(df)), test_size=0.2,
                                      stratify=df["label"], random_state=SEED)
# idx_train / idx_val are row *positions* (from np.arange), so select with
# .iloc; .loc would only be equivalent while df keeps a default RangeIndex.
train_df = df.iloc[idx_train].reset_index(drop=True)
val_df = df.iloc[idx_val].reset_index(drop=True)
len(train_df), len(val_df), label2id


Device: mps


(80000, 20000, {'negative': 0, 'neutral': 1, 'positive': 2, 'suggestion': 3})

In [5]:
# Load the pretrained tokenizer for MODEL_NAME; CommentDS uses it to encode comments.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class CommentDS(Dataset):
    """Map-style dataset that tokenizes one comment per item, on access.

    Args:
        texts: Sequence of comment strings.
        labels: Sequence of integer class ids, aligned index-for-index
            with ``texts``.
        tok: Optional tokenizer callable. When omitted, the notebook-level
            ``tokenizer`` is looked up at item-access time.
        max_len: Optional fixed sequence length to pad/truncate to. When
            omitted, the notebook-level ``MAX_LEN`` is used.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tok=None, max_len=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts, self.labels = texts, labels
        self.tok, self.max_len = tok, max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, i):
        """Return a dict of encoded tensors for example ``i`` plus ``labels``."""
        # Fall back to the notebook globals only when nothing was injected,
        # so existing two-argument construction behaves as before.
        tok = tokenizer if self.tok is None else self.tok
        max_len = MAX_LEN if self.max_len is None else self.max_len
        enc = tok(self.texts[i], truncation=True, padding="max_length",
                  max_length=max_len, return_tensors="pt")
        # return_tensors="pt" adds a leading batch dim of 1; drop it so the
        # DataLoader can stack items itself.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return item

# Pull plain Python lists out of each split, then wrap them in CommentDS.
train_texts, train_labels = train_df["comment_text"].tolist(), train_df["label"].tolist()
val_texts, val_labels = val_df["comment_text"].tolist(), val_df["label"].tolist()
train_ds = CommentDS(train_texts, train_labels)
val_ds = CommentDS(val_texts, val_labels)

# Only the training loader shuffles; validation is iterated in order.
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [6]:
# Pretrained encoder plus a classification head sized to our label set,
# with the label maps attached to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)

# One scheduler step per optimizer step; the first 10% of steps are warmup.
num_steps = EPOCHS * len(train_loader)
warmup_steps = int(0.1 * num_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_steps,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Fine-tune for EPOCHS passes over train_loader, stepping the LR scheduler
# after every optimizer update.
model.train()
for epoch in range(EPOCHS):
    total = 0.0  # running sum of per-batch losses for this epoch
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for cpu_batch in progress:
        # Move every tensor in the batch (inputs and labels) to the device.
        inputs = {name: tensor.to(device) for name, tensor in cpu_batch.items()}
        # Passing `labels` makes the model return the loss directly.
        loss = model(**inputs).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total += loss.item()
        progress.set_postfix(loss=f"{loss.item():.4f}")
    print(f"Epoch {epoch+1} mean loss: {total/len(train_loader):.4f}")


Epoch 1/2: 100%|██████████| 5000/5000 [31:56<00:00,  2.61it/s, loss=0.0000]


Epoch 1 mean loss: 0.0492


Epoch 2/2: 100%|██████████| 5000/5000 [45:05<00:00,  1.85it/s, loss=0.0000]    

Epoch 2 mean loss: 0.0000





In [8]:
# Score the validation split without tracking gradients.
model.eval()
preds, trues = [], []
with torch.no_grad():
    for cpu_batch in val_loader:
        # Record gold ids before the batch is moved to the device.
        trues += cpu_batch["labels"].tolist()
        inputs = {name: tensor.to(device) for name, tensor in cpu_batch.items()}
        scores = model(**inputs).logits
        preds += scores.argmax(dim=-1).cpu().tolist()

# Map integer ids back to label names for a readable report.
y_true = list(map(id2label.__getitem__, trues))
y_pred = list(map(id2label.__getitem__, preds))
print("Accuracy:", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred, digits=3))


Accuracy: 1.0
              precision    recall  f1-score   support

    negative      1.000     1.000     1.000      5427
     neutral      1.000     1.000     1.000      4983
    positive      1.000     1.000     1.000      5601
  suggestion      1.000     1.000     1.000      3989

    accuracy                          1.000     20000
   macro avg      1.000     1.000     1.000     20000
weighted avg      1.000     1.000     1.000     20000



In [9]:
# Save model + tokenizer into a timestamped directory.
# The stamp has minute resolution and exist_ok=True is set, so two saves
# within the same minute write into the same directory.
stamp = time.strftime("%Y%m%d_%H%M")
save_dir = f"model_artifacts/sentiment_{stamp}"
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# also save labels (redundant but handy)
# Explicit UTF-8 so label names are written identically on every platform
# instead of depending on the locale's default encoding.
with open(os.path.join(save_dir, "labels.txt"), "w", encoding="utf-8") as f:
    for i, name in sorted(id2label.items()):
        f.write(f"{i},{name}\n")

print("Saved to:", save_dir)


Saved to: model_artifacts/sentiment_20250929_2033
