In [1]:
!pip install -q transformers scikit-learn torch


In [2]:
pip install -U "transformers>=4.44.0" "accelerate>=0.26.0" torch scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch, numpy as np, pandas as pd, re
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW



In [6]:
# Read the labelled comments CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="sih_mixedtypes_english_100k.csv")

def clean_text(s):
    """Normalise a raw comment before tokenization.

    Non-string input (e.g. NaN from pandas) yields an empty string. Otherwise
    the text is lower-cased and trimmed, then URLs, @mentions and #hashtags are
    replaced by the placeholder tokens ``<URL>``, ``<USER>`` and ``<HASHTAG>``,
    and runs of whitespace are collapsed to a single space.

    Args:
        s: The raw comment; any type is accepted.

    Returns:
        The cleaned string, or ``""`` when ``s`` is not a ``str``.
    """
    if not isinstance(s, str):
        return ""

    text = s.lower().strip()

    # Applied in order; each placeholder is space-padded so it never fuses
    # with neighbouring characters. The padding is collapsed afterwards.
    substitutions = (
        (r"http\S+|www\S+", " <URL> "),
        (r"@\w+", " <USER> "),
        (r"#\w+", " <HASHTAG> "),
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

# Add the normalised text column used for tokenization.
df["clean_text"] = df["comment_text"].apply(clean_text)

# Build a deterministic label <-> integer id mapping (ids follow sorted label order).
labels = sorted(df["gold_label"].unique().tolist())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}
df["label"] = df["gold_label"].map(label2id)

# Stratified 80/20 split over row *positions*.
# NOTE(review): the preview below shows identical comments on consecutive rows;
# if duplicates are common they can land in both train and test and inflate
# the test metrics. Consider de-duplicating before splitting.
idx = np.arange(len(df))
idx_train, idx_test = train_test_split(idx, test_size=0.2, stratify=df["label"], random_state=42)
# idx_* come from np.arange, i.e. they are positions, so select with .iloc.
# Label-based .loc only gives the same rows while the index is the default RangeIndex.
train_df = df.iloc[idx_train].reset_index(drop=True)
test_df  = df.iloc[idx_test].reset_index(drop=True)

df[["comment_text","clean_text","gold_label","label"]].head()


Unnamed: 0,comment_text,clean_text,gold_label,label
0,who asked for this? lol nah 👎 #redtape,who asked for this? lol nah 👎 <HASHTAG>,negative,0
1,Love this update 🔥 less red tape #progress,love this update 🔥 less red tape <HASHTAG>,positive,2
2,This amendment simplifies compliance procedure...,this amendment simplifies compliance procedure...,positive,2
3,bruh these fines are wild 🤦 #overkill,bruh these fines are wild 🤦 <HASHTAG>,negative,0
4,bruh these fines are wild 🤦 #overkill,bruh these fines are wild 🤦 <HASHTAG>,negative,0


In [7]:
# Tunable settings for this run, grouped ahead of the tokenizer load.
MODEL_NAME = "distilbert-base-uncased"  # lighter checkpoint (quicker on a Mac); "bert-base-uncased" is a drop-in alternative
MAX_LEN = 160      # sequence length every example is padded/truncated to
BATCH_SIZE = 16    # examples per DataLoader batch

# Tokenizer matching the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class CommentDS(Dataset):
    """Map-style dataset that tokenizes one comment per lookup.

    Args:
        texts: Indexable sequence of comment strings.
        labels: Indexable sequence of integer class ids, same length as ``texts``.
        tokenizer: Optional callable used to encode a text. When omitted, the
            module-level ``tokenizer`` is looked up at item-access time
            (the original behaviour).
        max_len: Optional length every example is padded/truncated to. When
            omitted, the module-level ``MAX_LEN`` is looked up at item-access time.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_len=None):
        # Fail fast: a mismatch would otherwise surface later as an IndexError
        # or as silently ignored trailing texts.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Encode example ``idx`` and return a dict of tensors plus ``labels``."""
        # Fall back to the notebook-level globals when nothing was injected.
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        max_length = self._max_len if self._max_len is not None else MAX_LEN
        enc = encode(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        # squeeze(0) drops the leading size-1 axis produced for a single text,
        # so the DataLoader can stack items into a batch.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split's cleaned text and integer labels in a dataset.
train_ds = CommentDS(
    texts=train_df["clean_text"].tolist(),
    labels=train_df["label"].tolist(),
)
test_ds = CommentDS(
    texts=test_df["clean_text"].tolist(),
    labels=test_df["label"].tolist(),
)

# Only the training loader shuffles; the test loader keeps row order.
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False)


In [8]:
# Seed torch's global RNG before the model is built: the classification head is
# randomly initialised, and a shuffling DataLoader without its own generator
# also draws from this RNG. 42 matches the random_state used for the split.
# (Bit-exact reproducibility across devices/backends is still not guaranteed.)
torch.manual_seed(42)

# Prefer Apple-silicon MPS, then CUDA, then fall back to CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
print("Device:", device)

# Load the pretrained encoder with a new classification head sized to the
# label set, and record the label mappings in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
).to(device)


Device: mps


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training schedule: total optimizer steps = batches per epoch x epochs.
EPOCHS = 2
num_training_steps = len(train_loader) * EPOCHS

optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Linear schedule with warmup over the first 10% of steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps,
)
