In [None]:
pip install pandas scikit-learn torch transformers datasets emoji




In [None]:
import re
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight

from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)


In [None]:
df = pd.read_csv("/content/drive/MyDrive/PS_train.csv")

print(df.head())
print(df['labels'].value_counts())

                                             content         labels
0  ‡Æ§‡ØÜ‡Æ©‡Øç‡Æï‡Ææ‡Æö‡Æø ‡Æ§‡Øä‡Æï‡ØÅ‡Æ§‡Æø ‡Æ™‡ØÅ‡Æ§‡Æø‡ÆØ ‡Æ§‡ÆÆ‡Æø‡Æ¥‡Æï‡ÆÆ‡Øç ‡Æï‡Æü‡Øç‡Æö‡Æø ‡Æµ‡Øá‡Æü‡Øç‡Æ™‡Ææ‡Æ≥‡Æ∞‡Øç ...        Neutral
1  ‡ÆÖ‡Æ£‡Øç‡Æ£‡Æ©‡Øç ‡Æá‡Æ§‡Æ©‡Øà ‡Æö‡ØÇ‡Æö‡Æï‡ÆÆ‡Ææ‡Æï 11 ‡ÆÆ‡Ææ‡Æ§‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡ÆÆ‡ØÅ‡Æ©‡Øç‡Æ™‡Øá ‡Æ™‡Øá‡Æü‡Øç‡Æü‡Æø‡ÆØ...  Substantiated
2  ‡Æí‡Æ∞‡ØÅ ‡Æµ‡Æ∞‡ØÅ‡Æü‡ÆÆ‡Øç ‡ÆÜ‡Æï‡Æø ‡Æµ‡Æø‡Æü‡Øç‡Æü‡Æ§‡ØÅ ‡Æá‡Æ®‡Øç‡Æ§ ‡Æ§‡ØÅ‡ÆØ‡Æ∞‡ÆÆ‡Øç ‡Æ®‡Øá‡Æ∞‡Øç‡Æ®‡Øç‡Æ§‡ØÅ......    Opinionated
3  ‡Æé‡Æü‡Æ™‡Øç‡Æ™‡Ææ‡Æü‡Æø‡ÆØ‡Øà ‡Æï‡Æ£‡Øç‡Æü‡ØÅ‡Æï‡Øä‡Æ≥‡Øç‡Æ≥‡Ææ‡Æ§ "‡Æé‡Æü‡Æ™‡Øç‡Æ™‡Ææ‡Æü‡Æø"ü´¢\n ---\n‡ÆÜ‡Æ§‡Æ∞...       Positive
4  ‡Æé‡Æô‡Øç‡Æï‡Æ≥‡Æø‡Æ©‡Øç ‡ÆÖ‡Æ∞‡Æö‡Æø‡ÆØ‡Æ≤‡Øç ‡ÆÖ‡Æü‡ØÅ‡Æ§‡Øç‡Æ§ ‡Æ§‡Æ≤‡Øà‡ÆÆ‡ØÅ‡Æ±‡Øà‡Æï‡Øç‡Æï‡ØÅ‡ÆÆ‡Ææ‡Æ©‡Æ§‡ØÅ \n#‡ÆÆ‡Æï...    Opinionated
labels
Opinionated          1361
Sarcastic             790
Neutral               637
Positive              575
Substantiated         412
Negative              406
None of the above     171
Name: count, dtype: int64


In [None]:
def preprocess_text(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["clean_text"] = df["content"].apply(preprocess_text)


In [None]:
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["labels"])


In [None]:
subst_df = df[df["labels"] == "Substantiated"]
op_df    = df[df["labels"] == "Opinionated"]

df = pd.concat([
    df,
    subst_df, subst_df,
    op_df
]).sample(frac=1, random_state=42).reset_index(drop=True)

print("After oversampling:")
print(df["labels"].value_counts())


After oversampling:
labels
Opinionated          2722
Substantiated        1236
Sarcastic             790
Neutral               637
Positive              575
Negative              406
None of the above     171
Name: count, dtype: int64


In [None]:
label2id = {str(label): int(i) for i, label in enumerate(label_encoder.classes_)}
id2label = {int(i): str(label) for i, label in enumerate(label_encoder.classes_)}

print(label2id)


{'Negative': 0, 'Neutral': 1, 'None of the above': 2, 'Opinionated': 3, 'Positive': 4, 'Sarcastic': 5, 'Substantiated': 6}


In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["clean_text"].tolist(),
    df["label_id"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label_id"]
)


In [None]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)

class_weights = torch.tensor(class_weights, dtype=torch.float)


In [None]:
MODEL_NAME = "google/muril-base-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
class TamilDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=192):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [None]:
train_dataset = TamilPoliticalDataset(train_texts, train_labels, tokenizer)
val_dataset   = TamilPoliticalDataset(val_texts, val_labels, tokenizer)


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=7,
    label2id=label2id,
    id2label=id2label
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {"macro_f1": f1_score(labels, preds, average="macro")}


In [None]:
class FocalLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits

        ce_loss = F.cross_entropy(
            logits,
            labels,
            weight=class_weights.to(logits.device),
            reduction="none"
        )

        gamma = 2.0
        pt = torch.exp(-ce_loss)
        loss = ((1 - pt) ** gamma * ce_loss).mean()

        return (loss, outputs) if return_outputs else loss


In [None]:
training_args = TrainingArguments(
    output_dir="./muril_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    logging_steps=50
)


In [None]:
trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss,Macro F1
1,1.4678,1.397587,0.156246
2,1.285,1.296185,0.189571
3,1.3095,1.213615,0.212975
4,1.0679,1.169306,0.264907
5,1.0566,1.137073,0.264083
6,1.0876,1.124863,0.279639
7,1.0898,1.128349,0.287419
8,0.8399,1.1161,0.29085
9,0.8408,1.110779,0.299643
10,0.9076,1.114458,0.299216


TrainOutput(global_step=6540, training_loss=1.0817967977727954, metrics={'train_runtime': 2723.3407, 'train_samples_per_second': 19.201, 'train_steps_per_second': 2.401, 'total_flos': 5159510522208000.0, 'train_loss': 1.0817967977727954, 'epoch': 10.0})

In [None]:
preds = trainer.predict(val_dataset)
probs = torch.softmax(torch.tensor(preds.predictions), dim=1).numpy()

y_pred = []
for p in probs:
    top1 = np.argmax(p)
    top2 = np.argsort(p)[-2]

    if abs(p[top1] - p[top2]) < 0.05:
        y_pred.append(label2id["Opinionated"])
    else:
        y_pred.append(top1)

y_true = preds.label_ids


In [None]:
print(classification_report(
    y_true,
    y_pred,
    target_names=label_encoder.classes_
))


                   precision    recall  f1-score   support

         Negative       0.00      0.00      0.00        81
          Neutral       0.13      0.07      0.09       128
None of the above       0.88      0.88      0.88        34
      Opinionated       0.43      0.81      0.56       545
         Positive       0.00      0.00      0.00       115
        Sarcastic       0.34      0.37      0.35       158
    Substantiated       0.00      0.00      0.00       247

         accuracy                           0.41      1308
        macro avg       0.25      0.30      0.27      1308
     weighted avg       0.26      0.41      0.31      1308



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print("Predicted labels:",
      [id2label[i] for i in np.unique(y_pred)])



Predicted labels: ['Neutral', 'None of the above', 'Opinionated', 'Sarcastic']
