In [None]:

# 1) Dependencies
!pip -q install -U "transformers>=4.31" "datasets>=2.12" "accelerate>=0.21" scikit-learn
import os
os.environ["WANDB_DISABLED"] = "true"

import json, inspect, sys
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, set_seed
)



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_6

In [None]:
# 2) Normalizer --------------------------------------------------------------
class TextNormalizer:
    def __init__(self, map_path):
              # Load mapping file (JSON) that contains emojis and slang replacements

        with open(map_path, "r", encoding="utf-8") as f:
            mp = json.load(f)
             #  Create dictionary: emoji → label
        # Example: 😀 → joy, 😢 → sadness
        self.emoji_map = {emo: lab for lab, emos in mp.get("emoji_by_label", {}).items() for emo in emos}
        self.slang_map = mp.get("slang", {})         # Load slang dictionary (maps slang → formal word)
                                                          # Example: "u" → "you", "gr8" → "great"
    def normalize(self, text: str) -> str:
      # Lowercase everything for consistency
        text = text.lower()
          #  Replace slang words using slang_map
        text = " ".join(self.slang_map.get(w, w) for w in text.split())
          # Replace emojis with their label
        for emo, rep in self.emoji_map.items():
            if emo in text:
                text = text.replace(emo, f" {rep} ")
                        # Remove extra spaces and return clean text

        return " ".join(text.split())


In [None]:
# 3) Load data ----------------------------------------------------------------
def load_split(path):
    return pd.read_csv(path, sep=";", header=None, names=["text","label"])

train_df = load_split("train.txt")
val_df   = load_split("val.txt")
test_df  = load_split("test.txt")
noisy_df = load_split("noisy_test.txt")

normalizer = TextNormalizer("emoji_slang_map.json")
for df in (train_df, val_df, test_df, noisy_df):
    df["text"] = df["text"].apply(normalizer.normalize)

label_list = sorted(train_df["label"].unique())
label2id = {l:i for i,l in enumerate(label_list)}
for df in (train_df, val_df, test_df, noisy_df):
    df["label_id"] = df["label"].map(label2id)


In [None]:
# 4) Settings -----------------------------------------------------------------
model_name = "distilbert-base-uncased"  # <-- switch to: "microsoft/deberta-v3-base", "roberta-base",
                                        #     "google/electra-base-discriminator", "xlnet-base-cased",
                                        #     or "logistic_regression"
NUM_EPOCHS = 3
BATCH_SIZE = 16
SEED = 42
set_seed(SEED)

pretty = {
    "distilbert-base-uncased": "DistilBERT",
    "microsoft/deberta-v3-base": "DeBERTa-v3-base",
    "roberta-base": "RoBERTa-base",
    "google/electra-base-discriminator": "ELECTRA-base",
    "xlnet-base-cased": "XLNet-base",
    "logistic_regression": "Logistic Regression"
}
model_pretty = pretty.get(model_name, model_name)


In [None]:
# 5) Metrics ------------------------------------------------------------------
def compute_metrics(pred):
    labels = pred.label_ids
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    yhat = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, yhat)
    p,r,f1,_ = precision_recall_fscore_support(labels, yhat, average="macro", zero_division=0)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

def print_eval(title, d):
    print(f"{title} - Acc: {100*d['eval_accuracy']:.2f}% | "
          f"P: {100*d['eval_precision']:.2f}% | R: {100*d['eval_recall']:.2f}% | F1: {100*d['eval_f1']:.2f}%")


In [None]:
# 6) Version-safe TrainingArguments builder -----------------------------------
def build_training_args(**kwargs):
    """Filter unknown kwargs AND avoid 'load_best_model_at_end' traps when eval/save strategy unsupported."""
    sig = inspect.signature(TrainingArguments.__init__).parameters
    supports = lambda k: (k in sig)

    # If eval/save strategy not supported, force-disable load_best_model_at_end & related keys
    has_eval = supports("evaluation_strategy")
    has_save = supports("save_strategy")
    if not (has_eval and has_save):
        kwargs.pop("evaluation_strategy", None)
        kwargs.pop("save_strategy", None)
        # turning off best-model logic to prevent mismatch error
        kwargs["load_best_model_at_end"] = False
        kwargs.pop("metric_for_best_model", None)
        kwargs.pop("greater_is_better", None)

    # Keep only supported keys
    filtered = {k:v for k,v in kwargs.items() if supports(k)}
    return TrainingArguments(**filtered)


In [None]:
# 7) Train/Evaluate -----------------------------------------------------------
if model_name == "logistic_regression":
    print(f"Training {model_pretty} ...")
    # Vectorize text data into TF-IDF features
    vec = TfidfVectorizer()
    Xtr = vec.fit_transform(train_df.text)
    Xte = vec.transform(test_df.text)
    Xno = vec.transform(noisy_df.text)

    clf = LogisticRegression(max_iter=1000, solver="liblinear")
    clf.fit(Xtr, train_df.label_id)

    def eval_split(X, y, title):
        pred = clf.predict(X)
        acc = accuracy_score(y, pred)
        p,r,f1,_ = precision_recall_fscore_support(y, pred, average="macro", zero_division=0)
        print(f"{title} - Acc: {100*acc:.2f}% | P: {100*p:.2f}% | R: {100*r:.2f}% | F1: {100*f1:.2f}%")

    print("\nBaseline performance (trained on clean train):")
    eval_split(Xte, test_df.label_id, "Clean Test")
    eval_split(Xno, noisy_df.label_id, "Noisy Test")

else:
    print(f"Training {model_pretty} ...")
    # HF datasets
    to_hfds = lambda df: Dataset.from_pandas(df[["text","label_id"]].rename(columns={"label_id":"labels"}))
    ds_tr, ds_va, ds_te, ds_no = map(to_hfds, (train_df, val_df, test_df, noisy_df))
 #Tokenize text for transformer models
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tok(batch): return tokenizer(batch["text"], truncation=True)
    ds_tr, ds_va, ds_te, ds_no = [ds.map(tok, batched=True).remove_columns("text")
                                  for ds in (ds_tr, ds_va, ds_te, ds_no)]
    #  Load pre-trained transformer model for classification

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list))
    collator = DataCollatorWithPadding(tokenizer)
 # Define training arguments
    args = build_training_args(
        output_dir=f"./{model_pretty}_model",
        evaluation_strategy="epoch",     # auto-removed if unsupported
        save_strategy="epoch",           # auto-removed if unsupported
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,     # auto-disabled if eval/save not supported
        metric_for_best_model="eval_f1",
        greater_is_better=True,
        save_total_limit=1,
        seed=SEED,
        logging_dir="./logs",
        logging_steps=50
    )
  #  HuggingFace Trainer setup
    trainer = Trainer(
        model=model, args=args,
        train_dataset=ds_tr, eval_dataset=ds_va,
        tokenizer=tokenizer, data_collator=collator,
        compute_metrics=compute_metrics
    )
    trainer.train()
#  Evaluate on clean & noisy test sets
    met_te  = trainer.evaluate(ds_te)
    met_no  = trainer.evaluate(ds_no)
    print(f"\n{model_pretty} Performance (trained on clean data):")
    print_eval("Clean Test", met_te)
    print_eval("Noisy Test", met_no)

     # Fine-tune (domain adaptation) on noisy dataset for robustness
    print(f"\nFine-tuning {model_pretty} on noisy data ...")
    noisy_args = build_training_args(
        output_dir=f"./{model_pretty}_noisy_adapt",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        num_train_epochs=1,
        seed=SEED,
        logging_steps=10
    )
    trainer_noisy = Trainer(
        model=trainer.model, args=noisy_args,
        train_dataset=ds_no, tokenizer=tokenizer,
        data_collator=collator, compute_metrics=compute_metrics
    )
    trainer_noisy.train()
# Evaluate again after fine-tuning on noisy data
    print(f"\n{model_pretty} Performance (after fine-tuning on noisy data):")
    print_eval("Clean Test", trainer_noisy.evaluate(ds_te))
    print_eval("Noisy Test", trainer_noisy.evaluate(ds_no))

Training DistilBERT ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
50,1.6347
100,1.3143
150,1.0077
200,0.7543
250,0.6017
300,0.4427
350,0.4292
400,0.3791
450,0.3088
500,0.3046


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



DistilBERT Performance (trained on clean data):
Clean Test - Acc: 92.90% | P: 88.96% | R: 87.11% | F1: 87.91%
Noisy Test - Acc: 82.25% | P: 80.96% | R: 75.51% | F1: 77.50%

Fine-tuning DistilBERT on noisy data ...


  trainer_noisy = Trainer(


Step,Training Loss
10,0.7307
20,0.7284
30,0.6528
40,0.6016
50,0.663
60,0.4655
70,0.4479
80,0.3282
90,0.5263
100,0.3156



DistilBERT Performance (after fine-tuning on noisy data):


Clean Test - Acc: 93.40% | P: 89.74% | R: 88.56% | F1: 89.01%
Noisy Test - Acc: 89.30% | P: 89.39% | R: 83.28% | F1: 85.68%


In [None]:
#test your own data
import torch, numpy as np

inv_label = {v: k for k, v in label2id.items()}

def predict_one(sentence: str, max_len=512):
    model.eval()
    # tokenize on CPU, then move each tensor to the model's device
    enc = tokenizer([sentence], return_tensors="pt", truncation=True, max_length=max_len)
    enc = {k: v.to(model.device) for k, v in enc.items()}  # << key fix
    with torch.no_grad():
        logits = model(**enc).logits[0].detach().cpu().numpy()  # bring back to CPU for numpy
    probs = np.exp(logits - logits.max()); probs = probs / probs.sum()
    pred_id = int(probs.argmax())
    return inv_label[pred_id], float(probs[pred_id])

print(predict_one("I am sad."))
print(predict_one("This is the best day ever!"))
print(predict_one("I feel anxious and worried."))


('sadness', 0.9791167378425598)
('joy', 0.7019320130348206)
('fear', 0.9959421753883362)


In [None]:
print(predict_one("This is the best day ever!"))


('joy', 0.7019320130348206)


In [None]:
print(predict_one("I feel anxious and worried."))


('fear', 0.9959421753883362)


In [None]:
print(predict_one("THIS IS EXACTLY WHAT I WANTED , I AM SOOOOOOOOO EXCITED"))


('joy', 0.9943960905075073)


In [None]:
print(predict_one("THIS IS EXACTLY WHAT I WANTED , I AM SOOOOOOOOO EXCITED"))


('joy', 0.9943960905075073)


In [None]:
print(predict_one("i feel sooooooooooooo disappointed wheeeeeeeeen my friiiiiiiind doesn t call me baaaack"))


('sadness', 0.9897728562355042)


In [None]:
print(predict_one("iii gaaaarb it frooom ttttthe aair its smooth fraaaaame feels cold to the toouch 🥴"))


('anger', 0.9855018854141235)


In [None]:
print(predict_one("I aM So MaD!!!!!!"))


('anger', 0.994768500328064)


In [None]:
print(predict_one("Wooooww I loved it.!!!!"))


('love', 0.7670127749443054)


In [None]:
print(predict_one("good??? yes!!!"))


('joy', 0.4398828446865082)


In [None]:
print(predict_one("THIS IS EXACTLY WHAT I WANTED , I AM SOOOOOOOOOOO EXCITED"))


('joy', 0.9936413168907166)


In [None]:
#Model Evaluation on Clean & Noisy Data
from sklearn.metrics import classification_report
import numpy as np, pandas as pd

def eval_split(hf_dataset, raw_df, title):
    # trainer handles device correctly internally
    pred = trainer.predict(hf_dataset)
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    inv = {v:k for k,v in label2id.items()}
    names = [inv[i] for i in range(len(inv))]
    print(f"\n{title} — classification report")
    print(classification_report(y_true, y_pred, target_names=names, digits=3))
    pd.DataFrame({
        "text": raw_df["text"],
        "true": [inv[i] for i in y_true],
        "pred": [inv[i] for i in y_pred]
    }).to_csv(f"{title.replace(' ','_').lower()}_preds.csv", index=False)

eval_split(ds_te, test_df,  "Clean Test")
eval_split(ds_no, noisy_df, "Noisy Test")


Clean Test — classification report
              precision    recall  f1-score   support

       anger      0.905     0.935     0.919       275
        fear      0.893     0.929     0.910       224
         joy      0.965     0.947     0.956       695
        love      0.841     0.868     0.854       159
     sadness      0.966     0.969     0.967       581
    surprise      0.815     0.667     0.733        66

    accuracy                          0.934      2000
   macro avg      0.897     0.886     0.890      2000
weighted avg      0.934     0.934     0.934      2000




Noisy Test — classification report
              precision    recall  f1-score   support

       anger      0.900     0.880     0.890       275
        fear      0.864     0.848     0.856       224
         joy      0.903     0.940     0.921       695
        love      0.879     0.824     0.851       159
     sadness      0.889     0.914     0.902       581
    surprise      0.929     0.591     0.722        66

    accuracy                          0.893      2000
   macro avg      0.894     0.833     0.857      2000
weighted avg      0.893     0.893     0.892      2000



In [None]:
#Model Checkpoint Saving with Labels
import os, json
SAVE_DIR = "./DistilBERT_model_ckpt"   # new, explicit folder
os.makedirs(SAVE_DIR, exist_ok=True)

# save model + tokenizer
trainer.save_model(SAVE_DIR)           # writes config.json, pytorch_model.bin, etc.
tokenizer.save_pretrained(SAVE_DIR)    # writes tokenizer files

# save labels for inference UI
with open(os.path.join(SAVE_DIR, "labels.json"), "w") as f:
    json.dump(label_list, f)

# sanity check
import glob
print("Saved files:", sorted(os.path.basename(p) for p in glob.glob(SAVE_DIR+"/*")))


Saved files: ['config.json', 'labels.json', 'model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.txt']


In [None]:
#DistilBERT Prediction Function after finetuning
import json, torch, numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_DIR = "./DistilBERT_model_ckpt"   # use the folder you just saved
MAX_LEN = 512

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to("cuda" if torch.cuda.is_available() else "cpu").eval()
labels = json.load(open(f"{MODEL_DIR}/labels.json", "r"))

def predict(sentence: str):
    enc = tokenizer([sentence], return_tensors="pt", truncation=True, max_length=MAX_LEN)
    enc = {k: v.to(model.device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits[0].detach().cpu().numpy()
    probs = np.exp(logits - logits.max()); probs = probs / probs.sum()
    idx = int(probs.argmax())
    return labels[idx], float(probs[idx])

print(predict("I am happy."))
print(predict("This makes me angry!"))


('joy', 0.9905523061752319)
('anger', 0.9961214661598206)


In [None]:
print(predict("i told my fiance how i am feeling so angry and upset;"))


('anger', 0.9973813891410828)


In [None]:
print(predict("i dont blame it all to them and im not angry at them infact i feel fairly sympathetic for them"))


('love', 0.9858572483062744)


In [None]:
print(predict("i blunder through my life ignoring the pain when at all possible and feeling only that dull ache like hearing only the slightest echo of a scream far away"))


('sadness', 0.9969690442085266)


In [None]:
print(predict("i feel the pressure to be funny all the time"))


('surprise', 0.9740992784500122)


In [None]:
print(predict("THIS IS EXACTLY WHAT I WANTED , I AM SOOOOOOOOO EXCITED 😍"))


('joy', 0.994825005531311)


In [None]:
print(predict("THIS IS EXACTLY WHAT I WANTED , I AM SOOOOOOOOO EXCITED"))


('joy', 0.9943960905075073)


In [None]:
print(predict("i feel sooooooooooooo disappointed wheeeeeeeeen my friiiiiiiind doesn t call me baaaack 💔"))


('sadness', 0.9891345500946045)


In [None]:
print(predict("iii gaaaarb it frooom ttttthe aair its smooth fraaaaame feels cold to the toouch 🥴"))


('anger', 0.9855018854141235)
