# Telugu Sentiment Analysis Using mBERT

# Telugu Sentiment Analysis Using mBERT

## 1. mBERT Embeddings with In-Built Classifier on Translated English Comments

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [None]:

# ——— Project-specific hyperparameters ———
MODEL_NAME    = "bert-base-multilingual-cased"  # MODIFIED
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}

# ——— Load & preprocess data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Translated_English', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— Train/test split ———
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# ——— Dataset ———
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ——— Tokenizer & DataLoaders ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = SentimentDataset(train_df["Translated_English"].tolist(), train_df["Label"].tolist(), tokenizer, MAX_LEN)
test_ds  = SentimentDataset(test_df["Translated_English"].tolist(),  test_df["Label"].tolist(),  tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ——— mBERT with inbuilt classifier head ———
class MBertClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.base.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # mBERT has pooler_output
        x = self.drop(pooled_output)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MBertClassifier(MODEL_NAME, n_classes=3).to(device)

# ——— Optimizer, Scheduler, Loss ———
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# ——— Training loop ———
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

# ——— Evaluation ———
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ——— Metrics ———
print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
print("\nConfusion Matrix:\n", cm)


2025-06-11 12:47:56.210843: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 12:47:56.221438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749626276.232232  716584 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749626276.235159  716584 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749626276.245169  716584 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/10 — Loss: 0.9707, Accuracy: 0.4995


Epoch 2: 100%|██████████| 33/33 [00:07<00:00,  4.48it/s]


Epoch 2/10 — Loss: 0.7510, Accuracy: 0.6812


Epoch 3: 100%|██████████| 33/33 [00:07<00:00,  4.47it/s]


Epoch 3/10 — Loss: 0.5544, Accuracy: 0.7833


Epoch 4: 100%|██████████| 33/33 [00:07<00:00,  4.45it/s]


Epoch 4/10 — Loss: 0.3958, Accuracy: 0.8659


Epoch 5: 100%|██████████| 33/33 [00:07<00:00,  4.42it/s]


Epoch 5/10 — Loss: 0.2967, Accuracy: 0.8980


Epoch 6: 100%|██████████| 33/33 [00:07<00:00,  4.39it/s]


Epoch 6/10 — Loss: 0.2077, Accuracy: 0.9232


Epoch 7: 100%|██████████| 33/33 [00:07<00:00,  4.35it/s]


Epoch 7/10 — Loss: 0.1021, Accuracy: 0.9689


Epoch 8: 100%|██████████| 33/33 [00:07<00:00,  4.34it/s]


Epoch 8/10 — Loss: 0.0484, Accuracy: 0.9854


Epoch 9: 100%|██████████| 33/33 [00:07<00:00,  4.32it/s]


Epoch 9/10 — Loss: 0.0622, Accuracy: 0.9796


Epoch 10: 100%|██████████| 33/33 [00:07<00:00,  4.29it/s]

Epoch 10/10 — Loss: 0.0439, Accuracy: 0.9883






Test Accuracy: 0.6976744186046512

Classification Report:
              precision    recall  f1-score   support

    Negative       0.62      0.62      0.62        85
     Neutral       0.72      0.75      0.74        85
    Positive       0.75      0.72      0.73        88

    accuracy                           0.70       258
   macro avg       0.70      0.70      0.70       258
weighted avg       0.70      0.70      0.70       258


Confusion Matrix:
 [[53 19 13]
 [13 64  8]
 [19  6 63]]


## 2. mBERT Embeddings with In-Built Classifier on Original Telugu Comments

In [2]:

# ——— Project-specific hyperparameters ———
MODEL_NAME    = "bert-base-multilingual-cased"  # MODIFIED
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}

# ——— Load & preprocess data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— Train/test split ———
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# ——— Dataset ———
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ——— Tokenizer & DataLoaders ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = SentimentDataset(train_df["Statement"].tolist(), train_df["Label"].tolist(), tokenizer, MAX_LEN)
test_ds  = SentimentDataset(test_df["Statement"].tolist(),  test_df["Label"].tolist(),  tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ——— mBERT with inbuilt classifier head ———
class MBertClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.base.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # mBERT has pooler_output
        x = self.drop(pooled_output)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MBertClassifier(MODEL_NAME, n_classes=3).to(device)

# ——— Optimizer, Scheduler, Loss ———
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# ——— Training loop ———
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

# ——— Evaluation ———
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ——— Metrics ———
print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
print("\nConfusion Matrix:\n", cm)


Epoch 1: 100%|██████████| 33/33 [00:07<00:00,  4.44it/s]


Epoch 1/10 — Loss: 0.9916, Accuracy: 0.4704


Epoch 2: 100%|██████████| 33/33 [00:07<00:00,  4.41it/s]


Epoch 2/10 — Loss: 0.8320, Accuracy: 0.5792


Epoch 3: 100%|██████████| 33/33 [00:07<00:00,  4.39it/s]


Epoch 3/10 — Loss: 0.7095, Accuracy: 0.6968


Epoch 4: 100%|██████████| 33/33 [00:07<00:00,  4.36it/s]


Epoch 4/10 — Loss: 0.6162, Accuracy: 0.7279


Epoch 5: 100%|██████████| 33/33 [00:07<00:00,  4.32it/s]


Epoch 5/10 — Loss: 0.4051, Accuracy: 0.8523


Epoch 6: 100%|██████████| 33/33 [00:07<00:00,  4.30it/s]


Epoch 6/10 — Loss: 0.2663, Accuracy: 0.9038


Epoch 7: 100%|██████████| 33/33 [00:07<00:00,  4.27it/s]


Epoch 7/10 — Loss: 0.1739, Accuracy: 0.9456


Epoch 8: 100%|██████████| 33/33 [00:07<00:00,  4.26it/s]


Epoch 8/10 — Loss: 0.1449, Accuracy: 0.9543


Epoch 9: 100%|██████████| 33/33 [00:07<00:00,  4.23it/s]


Epoch 9/10 — Loss: 0.0971, Accuracy: 0.9747


Epoch 10: 100%|██████████| 33/33 [00:07<00:00,  4.24it/s]

Epoch 10/10 — Loss: 0.2307, Accuracy: 0.9300






Test Accuracy: 0.6550387596899225

Classification Report:
              precision    recall  f1-score   support

    Negative       0.65      0.54      0.59        85
     Neutral       0.66      0.75      0.70        85
    Positive       0.66      0.67      0.66        88

    accuracy                           0.66       258
   macro avg       0.65      0.65      0.65       258
weighted avg       0.65      0.66      0.65       258


Confusion Matrix:
 [[46 18 21]
 [11 64 10]
 [14 15 59]]


## 3. Generate and Save mBERT Embeddings to a CSV File

In [2]:
import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [5]:

# Parameters
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LEN = 128
LABEL_MAP = {"Negative": 0, "Neutral": 1, "Positive": 2}


# Load and preprocess data
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]
df = df.dropna()
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to extract pooled embeddings
def get_embeddings(texts):
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting mBERT embeddings"):
            encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=MAX_LEN)
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            pooled = output.pooler_output.cpu().numpy()[0]
            embeddings.append(pooled)
    return np.array(embeddings)

SAVE_PATH = "/home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_mbert_embeddings.csv"
# Extract and save embeddings
embeddings = get_embeddings(df["Statement"].tolist())
emb_df = pd.DataFrame(embeddings)
emb_df["Label"] = df["Label"].values
emb_df.to_csv(SAVE_PATH, index=False)
print(f"✅ Embeddings saved to {SAVE_PATH}")



Extracting mBERT embeddings: 100%|██████████| 1287/1287 [00:05<00:00, 222.31it/s]


✅ Embeddings saved to /home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_mbert_embeddings.csv



## 4. Load Saved Embeddings and Apply Traditional ML Classifiers on Translated English Comments

In [4]:
SAVE_PATH = "/home/santhosh/Data Scraping/scource code/TSA codes/TL_english_mbert_embeddings.csv"

# Classification using traditional ML models
df = pd.read_csv(SAVE_PATH)
X = df.drop(columns=["Label"]).values
y = df["Label"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define classifiers and their grid parameters
classifiers = {
    "SVM": (SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [None, 10]}),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {"C": [0.1, 1]}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {"n_estimators": [100, 200], "max_depth": [3, 6]})
}

# Train and evaluate classifiers
for name, (clf, params) in classifiers.items():
    print(f"\n🔍 Classifier: {name}")
    grid = GridSearchCV(clf, params, scoring="f1_weighted", cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f"Best Parameters: {grid.best_params_}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=LABEL_MAP.keys()))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



🔍 Classifier: SVM


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.689922480620155
Classification Report:
               precision    recall  f1-score   support

    Negative       0.59      0.65      0.61        85
     Neutral       0.80      0.71      0.75        85
    Positive       0.71      0.72      0.71        88

    accuracy                           0.69       258
   macro avg       0.70      0.69      0.69       258
weighted avg       0.70      0.69      0.69       258

Confusion Matrix:
 [[55 13 17]
 [16 60  9]
 [23  2 63]]

🔍 Classifier: RandomForest
Best Parameters: {'max_depth': 10, 'n_estimators': 200}
Accuracy: 0.6434108527131783
Classification Report:
               precision    recall  f1-score   support

    Negative       0.54      0.53      0.53        85
     Neutral       0.79      0.72      0.75        85
    Positive       0.62      0.68      0.65        88

    accuracy                           0.64       258
   macro avg       0.65      0.64      0.64       258


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'max_depth': 3, 'n_estimators': 200}
Accuracy: 0.6434108527131783
Classification Report:
               precision    recall  f1-score   support

    Negative       0.55      0.54      0.54        85
     Neutral       0.76      0.71      0.73        85
    Positive       0.63      0.68      0.66        88

    accuracy                           0.64       258
   macro avg       0.65      0.64      0.64       258
weighted avg       0.65      0.64      0.64       258

Confusion Matrix:
 [[46 15 24]
 [14 60 11]
 [24  4 60]]


## 5. Load Saved Embeddings and Apply Traditional ML Classifiers on Original Telugu Comments

In [6]:
SAVE_PATH = "/home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_mbert_embeddings.csv"

# Classification using traditional ML models
df = pd.read_csv(SAVE_PATH)
X = df.drop(columns=["Label"]).values
y = df["Label"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define classifiers and their grid parameters
classifiers = {
    "SVM": (SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [None, 10]}),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {"C": [0.1, 1]}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {"n_estimators": [100, 200], "max_depth": [3, 6]})
}

# Train and evaluate classifiers
for name, (clf, params) in classifiers.items():
    print(f"\n🔍 Classifier: {name}")
    grid = GridSearchCV(clf, params, scoring="f1_weighted", cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f"Best Parameters: {grid.best_params_}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=LABEL_MAP.keys()))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



🔍 Classifier: SVM
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.6356589147286822
Classification Report:
               precision    recall  f1-score   support

    Negative       0.53      0.64      0.58        85
     Neutral       0.82      0.66      0.73        85
    Positive       0.61      0.61      0.61        88

    accuracy                           0.64       258
   macro avg       0.66      0.64      0.64       258
weighted avg       0.66      0.64      0.64       258

Confusion Matrix:
 [[54  8 23]
 [18 56 11]
 [30  4 54]]

🔍 Classifier: RandomForest
Best Parameters: {'max_depth': None, 'n_estimators': 100}
Accuracy: 0.5813953488372093
Classification Report:
               precision    recall  f1-score   support

    Negative       0.47      0.55      0.51        85
     Neutral       0.83      0.65      0.73        85
    Positive       0.52      0.55      0.53        88

    accuracy                           0.58       258
   macro avg       0.61      0.5

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'max_depth': 3, 'n_estimators': 200}
Accuracy: 0.5736434108527132
Classification Report:
               precision    recall  f1-score   support

    Negative       0.46      0.53      0.49        85
     Neutral       0.79      0.65      0.71        85
    Positive       0.53      0.55      0.54        88

    accuracy                           0.57       258
   macro avg       0.59      0.57      0.58       258
weighted avg       0.59      0.57      0.58       258

Confusion Matrix:
 [[45 11 29]
 [17 55 13]
 [36  4 48]]
