# Telugu Sentiment Analysis Using XLM-RoBERTa

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

## 1. XLM-RoBERTa Embeddings with In-Built Classifier on Translated English Comments

In [None]:


# ——— Project-specific hyperparameters ———
MODEL_NAME    = "xlm-roberta-base"  # MODIFIED
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 15
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}

# ——— Load & preprocess data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Translated_English', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— Train/test split ———
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# ——— Dataset class ———
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ——— Tokenizer & DataLoaders ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = SentimentDataset(train_df["Translated_English"].tolist(), train_df["Label"].tolist(), tokenizer, MAX_LEN)
test_ds  = SentimentDataset(test_df["Translated_English"].tolist(),  test_df["Label"].tolist(),  tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ——— Model with classifier head ———
class XLMRobertaClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.base.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # XLM-Roberta has no pooler_output, so use CLS token representation
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.drop(cls_output)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = XLMRobertaClassifier(MODEL_NAME, n_classes=3).to(device)

# ——— Optimizer, Scheduler, Loss ———
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# ——— Training loop ———
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

# ——— Evaluation ———
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ——— Metrics ———
print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
print("\nConfusion Matrix:\n", cm)


2025-06-11 12:42:44.692790: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 12:42:44.703368: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749625964.713916  716160 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749625964.716823  716160 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749625964.726796  716160 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/15 — Loss: 1.1008, Accuracy: 0.3868


Epoch 2: 100%|██████████| 33/33 [00:08<00:00,  4.12it/s]


Epoch 2/15 — Loss: 0.8248, Accuracy: 0.6589


Epoch 3: 100%|██████████| 33/33 [00:08<00:00,  4.08it/s]


Epoch 3/15 — Loss: 0.6573, Accuracy: 0.7736


Epoch 4: 100%|██████████| 33/33 [00:08<00:00,  4.06it/s]


Epoch 4/15 — Loss: 0.4827, Accuracy: 0.8367


Epoch 5: 100%|██████████| 33/33 [00:08<00:00,  4.04it/s]


Epoch 5/15 — Loss: 0.3937, Accuracy: 0.8717


Epoch 6: 100%|██████████| 33/33 [00:08<00:00,  4.01it/s]


Epoch 6/15 — Loss: 0.2935, Accuracy: 0.9067


Epoch 7: 100%|██████████| 33/33 [00:08<00:00,  3.99it/s]


Epoch 7/15 — Loss: 0.2109, Accuracy: 0.9349


Epoch 8: 100%|██████████| 33/33 [00:08<00:00,  3.99it/s]


Epoch 8/15 — Loss: 0.1788, Accuracy: 0.9436


Epoch 9: 100%|██████████| 33/33 [00:08<00:00,  3.99it/s]


Epoch 9/15 — Loss: 0.1119, Accuracy: 0.9640


Epoch 10: 100%|██████████| 33/33 [00:08<00:00,  3.97it/s]


Epoch 10/15 — Loss: 0.0520, Accuracy: 0.9883


Epoch 11: 100%|██████████| 33/33 [00:08<00:00,  3.97it/s]


Epoch 11/15 — Loss: 0.0353, Accuracy: 0.9874


Epoch 12: 100%|██████████| 33/33 [00:08<00:00,  3.98it/s]


Epoch 12/15 — Loss: 0.0519, Accuracy: 0.9883


Epoch 13: 100%|██████████| 33/33 [00:08<00:00,  3.98it/s]


Epoch 13/15 — Loss: 0.0505, Accuracy: 0.9845


Epoch 14: 100%|██████████| 33/33 [00:08<00:00,  3.97it/s]


Epoch 14/15 — Loss: 0.0205, Accuracy: 0.9932


Epoch 15: 100%|██████████| 33/33 [00:08<00:00,  3.98it/s]

Epoch 15/15 — Loss: 0.0321, Accuracy: 0.9883






Test Accuracy: 0.7674418604651163

Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.56      0.65        85
     Neutral       0.70      0.87      0.78        85
    Positive       0.84      0.86      0.85        88

    accuracy                           0.77       258
   macro avg       0.77      0.77      0.76       258
weighted avg       0.77      0.77      0.76       258


Confusion Matrix:
 [[48 27 10]
 [ 6 74  5]
 [ 8  4 76]]


## 2. XLM-RoBERTa Embeddings with In-Built Classifier on Original Telugu Comments

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [2]:


# ——— Project-specific hyperparameters ———
MODEL_NAME    = "xlm-roberta-base"  # MODIFIED
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}

# ——— Load & preprocess data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— Train/test split ———
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# ——— Dataset class ———
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ——— Tokenizer & DataLoaders ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = SentimentDataset(train_df["Statement"].tolist(), train_df["Label"].tolist(), tokenizer, MAX_LEN)
test_ds  = SentimentDataset(test_df["Statement"].tolist(),  test_df["Label"].tolist(),  tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ——— Model with classifier head ———
class XLMRobertaClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.base.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # XLM-Roberta has no pooler_output, so use CLS token representation
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.drop(cls_output)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = XLMRobertaClassifier(MODEL_NAME, n_classes=3).to(device)

# ——— Optimizer, Scheduler, Loss ———
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# ——— Training loop ———
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

# ——— Evaluation ———
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ——— Metrics ———
print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
print("\nConfusion Matrix:\n", cm)


Epoch 1: 100%|██████████| 33/33 [00:07<00:00,  4.19it/s]


Epoch 1/10 — Loss: 1.1325, Accuracy: 0.3304


Epoch 2: 100%|██████████| 33/33 [00:07<00:00,  4.19it/s]


Epoch 2/10 — Loss: 0.9549, Accuracy: 0.5685


Epoch 3: 100%|██████████| 33/33 [00:07<00:00,  4.15it/s]


Epoch 3/10 — Loss: 0.8024, Accuracy: 0.6987


Epoch 4: 100%|██████████| 33/33 [00:07<00:00,  4.15it/s]


Epoch 4/10 — Loss: 0.7081, Accuracy: 0.7512


Epoch 5: 100%|██████████| 33/33 [00:07<00:00,  4.14it/s]


Epoch 5/10 — Loss: 0.6535, Accuracy: 0.7464


Epoch 6: 100%|██████████| 33/33 [00:08<00:00,  4.11it/s]


Epoch 6/10 — Loss: 0.5160, Accuracy: 0.8154


Epoch 7: 100%|██████████| 33/33 [00:08<00:00,  4.07it/s]


Epoch 7/10 — Loss: 0.4661, Accuracy: 0.8367


Epoch 8: 100%|██████████| 33/33 [00:08<00:00,  4.06it/s]


Epoch 8/10 — Loss: 0.3186, Accuracy: 0.8950


Epoch 9: 100%|██████████| 33/33 [00:08<00:00,  4.04it/s]


Epoch 9/10 — Loss: 0.3205, Accuracy: 0.8912


Epoch 10: 100%|██████████| 33/33 [00:08<00:00,  4.03it/s]

Epoch 10/10 — Loss: 0.1945, Accuracy: 0.9407






Test Accuracy: 0.751937984496124

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.73      0.70        85
     Neutral       0.85      0.75      0.80        85
    Positive       0.76      0.77      0.76        88

    accuracy                           0.75       258
   macro avg       0.76      0.75      0.75       258
weighted avg       0.76      0.75      0.75       258


Confusion Matrix:
 [[62  8 15]
 [14 64  7]
 [17  3 68]]


## 3. Generate and Save XLM-RoBERTa Embeddings to a CSV File

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:

# Parameters
MODEL_NAME = "xlm-roberta-base"
MAX_LEN = 128
LABEL_MAP = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Load Data
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]
df = df.dropna()
df = df[df["Label"].isin(LABEL_MAP)]
df["Label"] = df["Label"].map(LABEL_MAP).astype(int)
df = df.reset_index(drop=True)

# Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Embedding extraction
def get_embeddings(texts):
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting XLM-Roberta embeddings"):
            encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=MAX_LEN)
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]  # CLS token
            embeddings.append(cls_embedding)
    return np.array(embeddings)

# Extract and save
embeddings = get_embeddings(df["Statement"].tolist())
emb_df = pd.DataFrame(embeddings)
emb_df["Label"] = df["Label"].values

save_path = "/home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_xlmroberta_embeddings.csv"
emb_df.to_csv(save_path, index=False)
print(f"✅ Embeddings saved to {save_path}")


Extracting XLM-Roberta embeddings: 100%|██████████| 1287/1287 [00:05<00:00, 215.70it/s]


✅ Embeddings saved to /home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_xlmroberta_embeddings.csv


## 4. Load Saved Embeddings and Apply Traditional ML Classifiers on Translated English Comment

In [8]:
save_path = "/home/santhosh/Data Scraping/scource code/TSA codes/TL_english_xlmroberta_embeddings.csv"

# Load saved embeddings
df = pd.read_csv(save_path)
X = df.drop(columns=["Label"]).values
y = df["Label"].values

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Classifier dictionary with GridSearch params
classifiers = {
    "SVM": (SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [None, 10]}),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {"C": [0.1, 1]}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {"n_estimators": [100, 200], "max_depth": [3, 6]})
}

# Train and evaluate
for name, (clf, params) in classifiers.items():
    print(f"\n🔍 Classifier: {name}")
    grid = GridSearchCV(clf, params, scoring="f1_weighted", cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f"Best Parameters: {grid.best_params_}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=LABEL_MAP.keys()))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



🔍 Classifier: SVM
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy: 0.7286821705426356
Classification Report:
               precision    recall  f1-score   support

    Negative       0.61      0.80      0.69        85
     Neutral       0.80      0.72      0.76        85
    Positive       0.83      0.67      0.74        88

    accuracy                           0.73       258
   macro avg       0.75      0.73      0.73       258
weighted avg       0.75      0.73      0.73       258

Confusion Matrix:
 [[68 10  7]
 [19 61  5]
 [24  5 59]]

🔍 Classifier: RandomForest
Best Parameters: {'max_depth': 10, 'n_estimators': 200}
Accuracy: 0.6937984496124031
Classification Report:
               precision    recall  f1-score   support

    Negative       0.60      0.68      0.64        85
     Neutral       0.78      0.71      0.74        85
    Positive       0.73      0.69      0.71        88

    accuracy                           0.69       258
   macro avg       0.70      0.69   

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'max_depth': 3, 'n_estimators': 200}
Accuracy: 0.7403100775193798
Classification Report:
               precision    recall  f1-score   support

    Negative       0.62      0.76      0.68        85
     Neutral       0.81      0.75      0.78        85
    Positive       0.84      0.70      0.77        88

    accuracy                           0.74       258
   macro avg       0.76      0.74      0.74       258
weighted avg       0.76      0.74      0.74       258

Confusion Matrix:
 [[65 12  8]
 [17 64  4]
 [23  3 62]]


## 5. Load Saved Embeddings and Apply Traditional ML Classifiers on Original Telugu Comments

In [10]:
save_path = "/home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_xlmroberta_embeddings.csv"

# Load saved embeddings
df = pd.read_csv(save_path)
X = df.drop(columns=["Label"]).values
y = df["Label"].values

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Classifier dictionary with GridSearch params
classifiers = {
    "SVM": (SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [None, 10]}),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {"C": [0.1, 1]}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {"n_estimators": [100, 200], "max_depth": [3, 6]})
}

# Train and evaluate
for name, (clf, params) in classifiers.items():
    print(f"\n🔍 Classifier: {name}")
    grid = GridSearchCV(clf, params, scoring="f1_weighted", cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f"Best Parameters: {grid.best_params_}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=LABEL_MAP.keys()))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



🔍 Classifier: SVM
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy: 0.689922480620155
Classification Report:
               precision    recall  f1-score   support

    Negative       0.57      0.76      0.65        85
     Neutral       0.80      0.69      0.74        85
    Positive       0.77      0.61      0.68        88

    accuracy                           0.69       258
   macro avg       0.71      0.69      0.69       258
weighted avg       0.71      0.69      0.69       258

Confusion Matrix:
 [[65 10 10]
 [20 59  6]
 [29  5 54]]

🔍 Classifier: RandomForest
Best Parameters: {'max_depth': None, 'n_estimators': 200}
Accuracy: 0.6589147286821705
Classification Report:
               precision    recall  f1-score   support

    Negative       0.56      0.64      0.60        85
     Neutral       0.73      0.73      0.73        85
    Positive       0.70      0.61      0.65        88

    accuracy                           0.66       258
   macro avg       0.66      0.66  

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'max_depth': 3, 'n_estimators': 200}
Accuracy: 0.7248062015503876
Classification Report:
               precision    recall  f1-score   support

    Negative       0.64      0.69      0.67        85
     Neutral       0.78      0.81      0.79        85
    Positive       0.77      0.67      0.72        88

    accuracy                           0.72       258
   macro avg       0.73      0.73      0.72       258
weighted avg       0.73      0.72      0.72       258

Confusion Matrix:
 [[59 13 13]
 [11 69  5]
 [22  7 59]]
