In [None]:
# Install required packages (run in Colab)
!pip install -q transformers datasets sentencepiece torch torchvision torchaudio scikit-learn openpyxl emoji

# (optional) Only needed if you plan to use the Indic NLP library tokenizer (sacremoses is a separate package and is not installed by this line):
!pip install -q indic-nlp-library

# Report the installed PyTorch build and how many CUDA devices it can see
import torch
cuda_ok = torch.cuda.is_available()
cuda_devices = torch.cuda.device_count()
print("PyTorch version:", torch.__version__)
print("GPU available:", cuda_ok, cuda_devices)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hPyTorch version: 2.9.0+cu126
GPU available: True 1


In [None]:
import os, random, json, math, re
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

Device: cuda


In [None]:
# Folder holding one Excel workbook per language; change it to match your own Drive layout
DATA_DIR = Path("/content/drive/MyDrive/Projects/sentimental_analysis/Datasets")  # or "/content/drive/MyDrive/your-folder"

# Read the five workbooks in a fixed order: English, Telugu, Hindi, Urdu, Tamil
EN, TE, HI, UR, TA = (
    pd.read_excel(DATA_DIR / workbook)
    for workbook in ("English.xlsx", "Telugu.xlsx", "Hindi.xlsx", "Urdu.xlsx", "Tamil.xlsx")
)

# Normalize column names: ensure 'id' and 'text' exist
def normalize_df(df):
    """Return a copy of ``df`` that is guaranteed to have 'id' and 'text' columns.

    The text column is located by header name. The exact spellings the
    notebook always accepted are tried first, in priority order. If none of
    them is present, the lookup falls back to a case- and
    whitespace-insensitive comparison, so headers such as 'Comment',
    'SENTENCE' or ' Tweet ' are recognised as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as loaded from disk. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy whose 'text' column has been cast with ``astype(str)`` and which
        has an 'id' column; when 'id' is absent a 1-based running number is
        inserted as the first column.

    Raises
    ------
    KeyError
        If no column can be identified as the text column.
    """
    df = df.copy()

    if 'text' not in df.columns:
        source_col = None

        # 1) Exact spellings, same priority order as before.
        for k in ['text', 'TEXT', 'Text', 'comment', 'sentence', 'tweet']:
            if k in df.columns:
                source_col = k
                break

        # 2) Fallback: compare lower-cased, stripped headers. setdefault keeps
        #    the left-most column when several headers normalise to one key.
        if source_col is None:
            cols_lower = {}
            for c in df.columns:
                cols_lower.setdefault(str(c).strip().lower(), c)
            for k in ['text', 'comment', 'sentence', 'tweet']:
                if k in cols_lower:
                    source_col = cols_lower[k]
                    break

        if source_col is None:
            # Same exception type the old `df['text']` lookup produced, but
            # with a message that says what was expected and what was found.
            raise KeyError(
                "No text column found; expected one of "
                "'text', 'comment', 'sentence', 'tweet' (case-insensitive), "
                f"got columns: {list(df.columns)}"
            )

        df = df.rename(columns={source_col: 'text'})

    if 'id' not in df.columns:
        df.insert(0, 'id', range(1, len(df) + 1))

    df['text'] = df['text'].astype(str)
    return df

# Run every language frame through the same column normalisation, then report the row counts
EN, TE, HI, UR, TA = map(normalize_df, (EN, TE, HI, UR, TA))
print("Rows:", *(len(frame) for frame in (EN, TE, HI, UR, TA)))


Rows: 15000 15000 15000 15000


In [None]:
# Example harmonization for each language df:
def harmonize(df, language_code):
    """Bring one language's frame onto the shared annotation schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame for a single language. It is not modified.
    language_code : str
        Value written to every row of the new 'language' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which

        * 'sentiment', 'emotion', 'aspect' and 'reaction' exist (created
          empty when absent),
        * 'language' is set to ``language_code``,
        * 'sentiment' is stripped of surrounding whitespace and lower-cased,
          and missing values become 'neutral',
        * missing 'emotion' becomes 'neutral' and missing 'aspect' becomes
          'general'. 'reaction' is left as-is.
    """
    df = df.copy()

    # Make sure every annotation column exists before touching its values.
    for col in ('sentiment', 'emotion', 'aspect', 'reaction'):
        if col not in df.columns:
            df[col] = None

    df['language'] = language_code

    # Strip before lower-casing: a label such as ' Positive' would otherwise
    # stay distinct from 'positive' and be dropped by later label filters.
    sentiment = df['sentiment'].astype(str).str.strip().str.lower()
    # astype(str) turns missing values into the literal strings below.
    sentiment = sentiment.replace({'none': 'neutral', 'nan': 'neutral', 'nan.0': 'neutral'})
    # Covers values that are still missing after the string cast.
    df['sentiment'] = sentiment.fillna('neutral')

    df['emotion'] = df['emotion'].fillna('neutral')
    df['aspect'] = df['aspect'].fillna('general')
    return df

# Tag each frame with its language code and align it to the shared schema
EN, TE, HI, UR, TA = (
    harmonize(frame, code)
    for frame, code in ((EN, 'en'), (TE, 'te'), (HI, 'hi'), (UR, 'ur'), (TA, 'ta'))
)

# Stack all languages into a single multilingual dataframe (for transformer fine-tuning)
language_frames = [EN, TE, HI, UR, TA]
ALL = pd.concat(language_frames, ignore_index=True)
print("Combined rows:", len(ALL))
ALL.head()


Combined rows: 60000


Unnamed: 0,id,text,sentiment,reaction,sentiment_score,emotion,aspect,aspect_sentiment,language,provided_emotion
0,1,I am impressed with the delivery — overall exp...,positive,positive,1.0,neutral,delivery,positive,en,
1,2,It's okay overall; the sound doesn't stand out...,neutral,positive,0.0,neutral,sound,neutral,en,
2,3,Great design quality; this exceeded my expecta...,positive,negative,1.0,joy,"price, design","positive, positive",en,
3,4,The battery is unreliable and caused many prob...,negative,negative,-0.9,anger,battery,negative,en,
4,5,Moderate experience with the display — accepta...,neutral,negative,0.0,neutral,display,neutral,en,


In [None]:
import emoji
def clean_text(s):
    """Return ``s`` as a string with outer whitespace removed and every run of whitespace collapsed to one space."""
    trimmed = str(s).strip()
    return re.sub(r'\s+', ' ', trimmed)

# Clean the text of every row in the combined frame
ALL['text'] = ALL['text'].map(clean_text)

In [None]:
# Build English baseline - filter english rows
en_df = ALL[ALL['language']=='en'].copy()
en_df = en_df[en_df['sentiment'].notnull()]
X = en_df['text'].values; y = en_df['sentiment'].values

# Train/val split, stratified on the label so both parts keep the same class mix
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.15, random_state=42, stratify=y)

# Fit the TF-IDF vocabulary on the training part only; the validation part is just transformed
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
Xtr = tfidf.fit_transform(X_train)
Xv = tfidf.transform(X_val)

# probability=True was dropped: nothing here calls predict_proba, and the flag
# makes fit() run an extra internal cross-validated calibration pass.
# predict() does not depend on it.
svm = SVC(kernel='linear', class_weight='balanced')
svm.fit(Xtr, y_train)
pred = svm.predict(Xv)
print("SVM baseline (EN) acc:", accuracy_score(y_val, pred))
# Accuracy is dominated by the large classes; macro-F1 weighs every class equally.
print("SVM baseline (EN) macro-F1:", f1_score(y_val, pred, average='macro'))
print(classification_report(y_val, pred))

SVM baseline (EN) acc: 0.896
              precision    recall  f1-score   support

       mixed       0.77      0.67      0.72       184
    negative       0.74      0.65      0.70       168
     neutral       0.92      0.96      0.94       941
    positive       0.91      0.92      0.92       957

    accuracy                           0.90      2250
   macro avg       0.84      0.80      0.82      2250
weighted avg       0.89      0.90      0.89      2250



In [None]:
from transformers import Trainer
import torch.nn.functional as F
import torch

class WeightedTrainer(Trainer):
    """``Trainer`` whose loss is a class-weighted cross-entropy over the logits.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to ``Trainer.__init__``.
    class_weights : sequence of float, optional
        One weight per class, indexed by label id. Defaults to
        ``DEFAULT_CLASS_WEIGHTS``, the values this notebook always used.
    """

    # Index-aligned with label ids: positive, negative, neutral, mixed
    DEFAULT_CLASS_WEIGHTS = (1.0, 1.3, 0.9, 1.2)

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        chosen = self.DEFAULT_CLASS_WEIGHTS if class_weights is None else class_weights
        # Stored as plain floats; the tensor is created on the logits' device at loss time.
        self.class_weights = tuple(float(w) for w in chosen)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy for one batch.

        Parameters
        ----------
        model
            Model to run; called with every entry of ``inputs`` except "labels".
        inputs : mapping
            Batch holding the model inputs and a "labels" entry. Not mutated.
        return_outputs : bool
            When true, return ``(loss, outputs)`` instead of just ``loss``.
        **kwargs
            Accepted and ignored, so extra keyword arguments passed by the
            caller do not raise.

        Raises
        ------
        ValueError
            If the number of configured weights differs from the number of
            logit columns.
        """
        labels = inputs.get("labels")
        # Withhold the labels from the forward pass: the model would otherwise
        # compute its own unweighted loss, which is never used here.
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        if logits.shape[-1] != len(self.class_weights):
            raise ValueError(
                f"Got {len(self.class_weights)} class weights for "
                f"{logits.shape[-1]} output classes"
            )

        # Do the loss in float32 so weights and logits share a dtype even when
        # the model emits half-precision logits.
        class_weights = torch.tensor(
            self.class_weights,
            dtype=torch.float32,
            device=logits.device
        )

        loss = F.cross_entropy(logits.float(), labels, weight=class_weights)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Choose model
MODEL = "google/muril-base-cased"   # good for Indic languages + English

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Prepare a label mapping for sentiment (list position == label id)
labels = ['positive','negative','neutral','mixed']
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}

# Create a dataset for sentiment training
# Keep only rows with a valid sentiment label
df_train = ALL[ALL['sentiment'].isin(labels)].copy()
df_train = df_train[['text','sentiment','language']].reset_index(drop=True)
df_train['label'] = df_train['sentiment'].map(label2id)

# Train/val split, stratified on the label id
train_df, val_df = train_test_split(df_train, test_size=0.12, stratify=df_train['label'], random_state=42)

# Convert to Hugging Face Dataset.
# preserve_index=False: after the shuffled split the frames no longer have a
# plain 0..n-1 index, and from_pandas would otherwise carry that index along
# as an extra '__index_level_0__' column.
train_ds = Dataset.from_pandas(train_df[['text','label']], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[['text','label']], preserve_index=False)

# Tokenize
def tokenize_fn(ex):
    """Tokenise the 'text' field of a batch, truncating and padding every sequence to 192 tokens."""
    texts = ex['text']
    return tokenizer(texts, truncation=True, padding='max_length', max_length=192)

train_ds = train_ds.map(tokenize_fn, batched=True)
val_ds = val_ds.map(tokenize_fn, batched=True)

# Set format: drop the raw text and hand tensors to the trainer
train_ds = train_ds.remove_columns(['text']).with_format("torch")
val_ds = val_ds.remove_columns(['text']).with_format("torch")

# Model
num_labels = len(labels)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels, id2label=id2label, label2id=label2id)
model.to(device)

# Experiment tracking. "none" keeps a Restart & Run All non-interactive: with
# report_to left unset, a previous run of this cell stopped at a Weights &
# Biases login prompt. Set to "wandb" to log there on purpose.
REPORT_TO = "none"

# Training arguments
training_args = TrainingArguments(
    output_dir="muril-sentiment",
    eval_strategy="epoch",
    save_strategy="epoch",  # kept identical to eval_strategy
    per_device_train_batch_size=16 if device=='cuda' else 8,
    per_device_eval_batch_size=32 if device=='cuda' else 16,
    num_train_epochs=5,
    save_total_limit=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    logging_steps=100,
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to=REPORT_TO,
)

# Metric
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` pair.

    The predicted class of each row is the arg-max over the last axis of
    the logits; accuracy is the fraction of rows where it equals the label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {"accuracy": (predicted == gold).mean()}

import inspect

# `tokenizer=` has been deprecated on Trainer in favour of `processing_class`
# in recent transformers releases, while older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer accepts so the cell runs
# without a deprecation warning (or a TypeError) on either side.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Start training (GPU will accelerate)
trainer.train()
trainer.save_model("muril-sentiment-best")

Map:   0%|          | 0/52800 [00:00<?, ? examples/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc 

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2806,0.29313,0.884722


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2806,0.29313,0.884722
2,0.249,0.265076,0.890833
3,0.2134,0.276235,0.896667


In [None]:
# Write the trained model and the tokenizer into the same output folder
FINAL_MODEL_DIR = "final_model"
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

In [None]:
# Shell escape: recursively pack the final_model directory into final_model.zip
!zip -r final_model.zip final_model

In [None]:
# Pass final_model.zip to google.colab's files.download helper.
# NOTE(review): the google.colab import presumably only resolves inside a Colab runtime - confirm before running elsewhere.
from google.colab import files
files.download("final_model.zip")