# Telugu Sentiment Analysis Using IndicBERT

## 1. IndicBERT Embeddings with In-Built Classifier on Translated English Comments

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


In [None]:

# ——— Project Hyperparameters ———
MODEL_NAME    = "ai4bharat/indic-bert"  # ✔ IndicBERT
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}

# ——— 1. Load & preprocess data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Translated_English', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— 2. Train/test split ———
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# ——— 3. Dataset ———
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts     = texts
        self.labels    = labels
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ——— 4. Tokenizer & DataLoaders ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = SentimentDataset(train_df["Translated_English"].tolist(), train_df["Label"].tolist(), tokenizer, MAX_LEN)
test_ds  = SentimentDataset(test_df["Translated_English"].tolist(),  test_df["Label"].tolist(),  tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ——— 5. IndicBERT Model + Classifier ———
class IndicBERTClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.base.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [CLS] token representation
        x = self.drop(pooled_output)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = IndicBERTClassifier(MODEL_NAME, n_classes=3).to(device)

# ——— 6. Optimizer, Scheduler, Loss ———
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# ——— 7. Training Loop ———
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

# ——— 8. Evaluation ———
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ——— 9. Report ———
print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))

cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
print("\nConfusion Matrix:\n", cm)


2025-06-11 13:39:53.160022: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 13:39:53.170537: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749629393.181739  718242 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749629393.184856  718242 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749629393.194153  718242 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/10 — Loss: 1.0888, Accuracy: 0.3868


Epoch 2: 100%|██████████| 33/33 [00:06<00:00,  5.05it/s]


Epoch 2/10 — Loss: 1.0437, Accuracy: 0.5355


Epoch 3: 100%|██████████| 33/33 [00:06<00:00,  5.04it/s]


Epoch 3/10 — Loss: 1.0177, Accuracy: 0.5413


Epoch 4: 100%|██████████| 33/33 [00:06<00:00,  5.01it/s]


Epoch 4/10 — Loss: 0.9404, Accuracy: 0.5879


Epoch 5: 100%|██████████| 33/33 [00:06<00:00,  4.98it/s]


Epoch 5/10 — Loss: 0.8600, Accuracy: 0.6414


Epoch 6: 100%|██████████| 33/33 [00:06<00:00,  4.96it/s]


Epoch 6/10 — Loss: 0.7731, Accuracy: 0.7153


Epoch 7: 100%|██████████| 33/33 [00:06<00:00,  4.93it/s]


Epoch 7/10 — Loss: 0.6264, Accuracy: 0.7775


Epoch 8: 100%|██████████| 33/33 [00:06<00:00,  4.89it/s]


Epoch 8/10 — Loss: 0.5311, Accuracy: 0.8192


Epoch 9: 100%|██████████| 33/33 [00:06<00:00,  4.86it/s]


Epoch 9/10 — Loss: 0.3916, Accuracy: 0.8921


Epoch 10: 100%|██████████| 33/33 [00:06<00:00,  4.86it/s]

Epoch 10/10 — Loss: 0.3720, Accuracy: 0.8834






Test Accuracy: 0.6782945736434108

Classification Report:
              precision    recall  f1-score   support

    Negative       0.56      0.68      0.62        85
     Neutral       0.71      0.73      0.72        85
    Positive       0.81      0.62      0.71        88

    accuracy                           0.68       258
   macro avg       0.69      0.68      0.68       258
weighted avg       0.70      0.68      0.68       258


Confusion Matrix:
 [[58 19  8]
 [18 62  5]
 [27  6 55]]


## 2. IndicBERT Embeddings with In-Built Classifier on Original Telugu Comments

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [2]:


# ——— Project Hyperparameters ———
MODEL_NAME    = "ai4bharat/indic-bert"  # ✔ IndicBERT
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}

# ——— 1. Load & preprocess data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— 2. Train/test split ———
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# ——— 3. Dataset ———
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts     = texts
        self.labels    = labels
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# ——— 4. Tokenizer & DataLoaders ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = SentimentDataset(train_df["Statement"].tolist(), train_df["Label"].tolist(), tokenizer, MAX_LEN)
test_ds  = SentimentDataset(test_df["Statement"].tolist(),  test_df["Label"].tolist(),  tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ——— 5. IndicBERT Model + Classifier ———
class IndicBERTClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        hidden_size = self.base.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [CLS] token representation
        x = self.drop(pooled_output)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = IndicBERTClassifier(MODEL_NAME, n_classes=3).to(device)

# ——— 6. Optimizer, Scheduler, Loss ———
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# ——— 7. Training Loop ———
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

# ——— 8. Evaluation ———
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ——— 9. Report ———
print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))

cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
print("\nConfusion Matrix:\n", cm)


Epoch 1: 100%|██████████| 33/33 [00:06<00:00,  5.01it/s]


Epoch 1/10 — Loss: 1.0882, Accuracy: 0.3460


Epoch 2: 100%|██████████| 33/33 [00:06<00:00,  5.01it/s]


Epoch 2/10 — Loss: 1.0325, Accuracy: 0.4888


Epoch 3: 100%|██████████| 33/33 [00:06<00:00,  4.97it/s]


Epoch 3/10 — Loss: 1.0006, Accuracy: 0.5384


Epoch 4: 100%|██████████| 33/33 [00:06<00:00,  4.95it/s]


Epoch 4/10 — Loss: 0.9462, Accuracy: 0.5539


Epoch 5: 100%|██████████| 33/33 [00:06<00:00,  4.89it/s]


Epoch 5/10 — Loss: 0.9063, Accuracy: 0.5734


Epoch 6: 100%|██████████| 33/33 [00:06<00:00,  4.88it/s]


Epoch 6/10 — Loss: 0.8310, Accuracy: 0.6016


Epoch 7: 100%|██████████| 33/33 [00:06<00:00,  4.82it/s]


Epoch 7/10 — Loss: 0.7507, Accuracy: 0.6531


Epoch 8: 100%|██████████| 33/33 [00:06<00:00,  4.81it/s]


Epoch 8/10 — Loss: 0.6346, Accuracy: 0.7123


Epoch 9: 100%|██████████| 33/33 [00:06<00:00,  4.79it/s]


Epoch 9/10 — Loss: 0.5585, Accuracy: 0.7765


Epoch 10: 100%|██████████| 33/33 [00:06<00:00,  4.77it/s]

Epoch 10/10 — Loss: 0.3674, Accuracy: 0.8795






Test Accuracy: 0.6162790697674418

Classification Report:
              precision    recall  f1-score   support

    Negative       0.55      0.40      0.46        85
     Neutral       0.81      0.66      0.73        85
    Positive       0.54      0.78      0.64        88

    accuracy                           0.62       258
   macro avg       0.63      0.61      0.61       258
weighted avg       0.63      0.62      0.61       258


Confusion Matrix:
 [[34 11 40]
 [11 56 18]
 [17  2 69]]


## 3. Generate and Save IndicBERT Embeddings to a CSV File

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from tqdm import tqdm

## 4. Load Saved Embeddings and Apply Traditional ML Classifiers on Original Telugu Comments


In [None]:


# ——— Config ———
MODEL_NAME = "ai4bharat/indic-bert"  # Changed to IndicBERT
MAX_LEN = 128
BATCH_SIZE = 32
LABEL_MAP = {"Negative": 0, "Neutral": 1, "Positive": 2}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ——— Load Data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— Dataset Class ———
class SentimentDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

# ——— Load Tokenizer & Model ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ——— Embedding Extraction ———
def extract_embeddings(texts):
    dataset = SentimentDataset(texts, tokenizer, MAX_LEN)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE)

    embeddings = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Extracting Embeddings"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            embeddings.append(cls_embeddings.cpu())
    return torch.cat(embeddings).numpy()

# ——— Generate and Save All Embeddings + Labels ———
all_embeddings = extract_embeddings(df["Statement"].tolist())
df_embeddings = pd.DataFrame(all_embeddings)
df_embeddings["label"] = df["Label"].values
df_embeddings.to_csv("TL_telugu_indicbert_embeddings.csv", index=False)

# ——— Train-Test Split ———
X = df_embeddings.drop("label", axis=1).values
y = df_embeddings["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ——— Feature Scaling ———
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ——— Define Models and Parameter Grids ———
models_params = {
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {"C": [0.1, 1, 10]}
    ),
    "SVM": (
        SVC(),
        {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    ),
    "RandomForest": (
        RandomForestClassifier(),
        {"n_estimators": [100, 200], "max_depth": [None, 10, 20]}
    ),
    "KNN": (
        KNeighborsClassifier(),
        {"n_neighbors": [3, 5, 7]}
    ),
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
        {"n_estimators": [100, 200], "max_depth": [3, 5], "learning_rate": [0.05, 0.1]}
    )
}

# ——— Train + Evaluate Each Model ———
for name, (model, param_grid) in models_params.items():
    print(f"\n🔍 Running GridSearch for {name}...")
    grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    print(f"\n✅ Best Parameters for {name}: {grid.best_params_}")
    print(f"📊 Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("📑 Classification Report:\n", classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))
    print("🔢 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


2025-06-12 12:05:44.130920: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-12 12:05:44.139959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749710144.150274  859670 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749710144.153122  859670 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749710144.161182  859670 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 


🔍 Running GridSearch for LogisticRegression...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


✅ Best Parameters for LogisticRegression: {'C': 0.1}
📊 Accuracy: 0.5853
📑 Classification Report:
               precision    recall  f1-score   support

    Negative       0.50      0.52      0.51        85
     Neutral       0.67      0.75      0.71        85
    Positive       0.58      0.49      0.53        88

    accuracy                           0.59       258
   macro avg       0.58      0.59      0.58       258
weighted avg       0.58      0.59      0.58       258

🔢 Confusion Matrix:
 [[44 19 22]
 [12 64  9]
 [32 13 43]]

🔍 Running GridSearch for SVM...

✅ Best Parameters for SVM: {'C': 1, 'kernel': 'rbf'}
📊 Accuracy: 0.5891
📑 Classification Report:
               precision    recall  f1-score   support

    Negative       0.49      0.52      0.51        85
     Neutral       0.74      0.72      0.73        85
    Positive       0.54      0.53      0.54        88

    accuracy                           0.59       258
   macro avg       0.59      0.59      0.59       258
weig

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


✅ Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
📊 Accuracy: 0.5698
📑 Classification Report:
               precision    recall  f1-score   support

    Negative       0.50      0.52      0.51        85
     Neutral       0.73      0.72      0.73        85
    Positive       0.48      0.48      0.48        88

    accuracy                           0.57       258
   macro avg       0.57      0.57      0.57       258
weighted avg       0.57      0.57      0.57       258

🔢 Confusion Matrix:
 [[44 12 29]
 [ 8 61 16]
 [36 10 42]]


## 5. Load Saved Embeddings and Apply Traditional ML Classifiers on Translated English Comments

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from tqdm import tqdm

In [2]:


# ——— Config ———
MODEL_NAME = "ai4bharat/indic-bert"  # Changed to IndicBERT
MAX_LEN = 128
BATCH_SIZE = 32
LABEL_MAP = {"Negative": 0, "Neutral": 1, "Positive": 2}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ——— Load Data ———
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Translated_English', 'Label']]
df = df.dropna().reset_index(drop=True)
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)
df["Label"] = df["Label"].map(LABEL_MAP)

# ——— Dataset Class ———
class SentimentDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

# ——— Load Tokenizer & Model ———
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ——— Embedding Extraction ———
def extract_embeddings(texts):
    dataset = SentimentDataset(texts, tokenizer, MAX_LEN)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE)

    embeddings = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Extracting Embeddings"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            embeddings.append(cls_embeddings.cpu())
    return torch.cat(embeddings).numpy()

# ——— Generate and Save All Embeddings + Labels ———
all_embeddings = extract_embeddings(df["Translated_English"].tolist())
df_embeddings = pd.DataFrame(all_embeddings)
df_embeddings["label"] = df["Label"].values
df_embeddings.to_csv("TL_english_indicbert_embeddings.csv", index=False)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Extracting Embeddings: 100%|██████████| 41/41 [00:02<00:00, 15.34it/s]


In [3]:
# ——— Train-Test Split ———
X = df_embeddings.drop("label", axis=1).values
y = df_embeddings["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ——— Feature Scaling ———
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ——— Define Models and Parameter Grids ———
models_params = {
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {"C": [0.1, 1, 10]}
    ),
    "SVM": (
        SVC(),
        {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    ),
    "RandomForest": (
        RandomForestClassifier(),
        {"n_estimators": [100, 200], "max_depth": [None, 10, 20]}
    ),
    "KNN": (
        KNeighborsClassifier(),
        {"n_neighbors": [3, 5, 7]}
    ),
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
        {"n_estimators": [100, 200], "max_depth": [3, 5], "learning_rate": [0.05, 0.1]}
    )
}

# ——— Train + Evaluate Each Model ———
for name, (model, param_grid) in models_params.items():
    print(f"\n🔍 Running GridSearch for {name}...")
    grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    print(f"\n✅ Best Parameters for {name}: {grid.best_params_}")
    print(f"📊 Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("📑 Classification Report:\n", classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))
    print("🔢 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



🔍 Running GridSearch for LogisticRegression...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


✅ Best Parameters for LogisticRegression: {'C': 0.1}
📊 Accuracy: 0.6047
📑 Classification Report:
               precision    recall  f1-score   support

    Negative       0.50      0.59      0.54        85
     Neutral       0.74      0.65      0.69        85
    Positive       0.61      0.58      0.60        88

    accuracy                           0.60       258
   macro avg       0.62      0.60      0.61       258
weighted avg       0.62      0.60      0.61       258

🔢 Confusion Matrix:
 [[50 12 23]
 [21 55  9]
 [30  7 51]]

🔍 Running GridSearch for SVM...

✅ Best Parameters for SVM: {'C': 1, 'kernel': 'rbf'}
📊 Accuracy: 0.6279
📑 Classification Report:
               precision    recall  f1-score   support

    Negative       0.52      0.60      0.55        85
     Neutral       0.75      0.71      0.73        85
    Positive       0.65      0.58      0.61        88

    accuracy                           0.63       258
   macro avg       0.64      0.63      0.63       258
weig

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


✅ Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
📊 Accuracy: 0.5814
📑 Classification Report:
               precision    recall  f1-score   support

    Negative       0.48      0.48      0.48        85
     Neutral       0.70      0.75      0.72        85
    Positive       0.56      0.51      0.53        88

    accuracy                           0.58       258
   macro avg       0.58      0.58      0.58       258
weighted avg       0.58      0.58      0.58       258

🔢 Confusion Matrix:
 [[41 15 29]
 [14 64  7]
 [30 13 45]]
