In [1]:
!pip install transformers datasets torch sentencepiece accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [2]:
# -- coding: utf-8 --
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the data
df = pd.read_csv("/content/test1.csv")

# Normalize the labels
df[["happy", "sad", "anger", "fear"]] /= 10  # scaling 0-10 to 0-1

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_data(paragraph, cot):
    paragraph = "" if pd.isna(paragraph) else str(paragraph)
    cot = "" if pd.isna(cot) else str(cot)
    combined_text = paragraph + " " + tokenizer.sep_token + " " + cot
    return tokenizer(combined_text, padding="max_length", truncation=True, max_length=256, return_tensors="pt")

# Apply tokenization
df["tokens"] = df.apply(lambda row: tokenize_data(row["Paragraph"], row["COT"]), axis=1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [4]:
# Define the dataset
class TeluguEmotionDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.labels = torch.tensor(self.data[["happy", "sad", "anger", "fear"]].values, dtype=torch.float)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data.iloc[idx]["tokens"]
        return {
            "input_ids": tokens["input_ids"].squeeze(),
            "attention_mask": tokens["attention_mask"].squeeze(),
            "labels": self.labels[idx]
        }

# Split into train and test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = TeluguEmotionDataset(train_df)
test_dataset = TeluguEmotionDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Define the model
class EnhancedEmotionClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.dropout = nn.Dropout(0.3)
        self.shared_dense = nn.Linear(self.bert.config.hidden_size, 256)
        self.classifier_happy = nn.Linear(256, 1)
        self.classifier_sad = nn.Linear(256, 1)
        self.classifier_anger = nn.Linear(256, 1)
        self.classifier_fear = nn.Linear(256, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        features = F.relu(self.shared_dense(pooled_output))

        happy_logit = self.classifier_happy(features)
        sad_logit = self.classifier_sad(features)
        anger_logit = self.classifier_anger(features)
        fear_logit = self.classifier_fear(features)

        logits = torch.cat([happy_logit, sad_logit, anger_logit, fear_logit], dim=1)
        return logits

# Model, optimizer, loss
model = EnhancedEmotionClassifier().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_fn = nn.BCEWithLogitsLoss()



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [9]:
# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        all_preds.append(outputs.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())

    avg_loss = total_loss / len(train_loader)
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    mse = mean_squared_error(all_labels, all_preds)
    mae = mean_absolute_error(all_labels, all_preds)
    r2 = r2_score(all_labels, all_preds)

    print(f"Epoch {epoch+1}: Train Loss={avg_loss:.4f}, MSE={mse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

print("Training complete!")



Epoch 1: Train Loss=0.4720, MSE=2.9094, MAE=1.5275, R2=-1592.7773
Epoch 2: Train Loss=0.4710, MSE=3.0834, MAE=1.5522, R2=-1719.8911
Epoch 3: Train Loss=0.4703, MSE=2.9915, MAE=1.5441, R2=-1594.0266
Epoch 4: Train Loss=0.4712, MSE=2.9855, MAE=1.5350, R2=-1604.7538
Epoch 5: Train Loss=0.4697, MSE=3.0448, MAE=1.5547, R2=-1690.9121
Epoch 6: Train Loss=0.4713, MSE=2.9810, MAE=1.5396, R2=-1610.7142
Epoch 7: Train Loss=0.4686, MSE=3.0403, MAE=1.5537, R2=-1680.1218
Epoch 8: Train Loss=0.4683, MSE=3.0717, MAE=1.5492, R2=-1697.1807
Epoch 9: Train Loss=0.4687, MSE=3.0986, MAE=1.5652, R2=-1673.1415
Epoch 10: Train Loss=0.4648, MSE=3.0305, MAE=1.5428, R2=-1616.3566
Epoch 11: Train Loss=0.4648, MSE=3.0954, MAE=1.5527, R2=-1662.5496
Epoch 12: Train Loss=0.4607, MSE=3.0843, MAE=1.5535, R2=-1655.2634
Epoch 13: Train Loss=0.4599, MSE=3.2359, MAE=1.5783, R2=-1665.6794
Epoch 14: Train Loss=0.4575, MSE=3.0914, MAE=1.5568, R2=-1642.6913
Epoch 15: Train Loss=0.4584, MSE=3.1934, MAE=1.5689, R2=-1643.9564
Epoc

In [10]:
# Evaluation on Test Data
model.eval()
test_preds = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        test_preds.append(outputs.cpu().numpy())
        test_labels.append(labels.cpu().numpy())

test_preds = np.vstack(test_preds)
test_labels = np.vstack(test_labels)

mse_test = mean_squared_error(test_labels, test_preds)
mae_test = mean_absolute_error(test_labels, test_preds)
r2_test = r2_score(test_labels, test_preds)

print("\nTest Set Evaluation:")
print(f"Test MSE: {mse_test:.4f}")
print(f"Test MAE: {mae_test:.4f}")
print(f"Test R2 Score: {r2_test:.4f}")



Test Set Evaluation:
Test MSE: 4.1612
Test MAE: 1.7736
Test R2 Score: -534.1198


In [11]:

# ------------------- Prediction Function -------------------
def predict_emotions(model, tokenizer, paragraph, cot=None):
    model.eval()
    if cot:
        combined_text = paragraph + " " + tokenizer.sep_token + " " + cot
        inputs = tokenizer(combined_text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
    else:
        inputs = tokenizer(paragraph, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(inputs["input_ids"], inputs["attention_mask"])

    probabilities = F.softmax(outputs, dim=1).cpu().numpy().flatten()

    total_intensity = sum(probabilities)
    scaled_probabilities = [p / total_intensity * 10 for p in probabilities]

    intensities = [int(round(p)) for p in scaled_probabilities]

    while sum(intensities) != 10:
        if sum(intensities) < 10:
            for i in range(len(intensities)):
                if intensities[i] < 10:
                    intensities[i] += 1
                    break
        elif sum(intensities) > 10:
            for i in range(len(intensities)):
                if intensities[i] > 1:
                    intensities[i] -= 1
                    break

    emotions = ["Happy", "Sad", "Anger", "Fear"]
    return {emotions[i]: intensities[i] for i in range(len(emotions))}

# Example Prediction
new_paragraph = "ఇలా నాలో నేను మాట్లాడుకుంటుంటే... ఏయ్...నువ్వు ఇప్పట్లో అడిగేలా లేవ్ కాని నేనో విషయం అడగనా ?. ఆయనతో నా చాట్ కట్ చేసి, హరి తో ఏంటో అడగండి మేడం. ఉన్నట్టుండి ఏదో ఆలోచనలోకి వెళ్లి పోతావ్, పిలుస్తున్నా పలకావ్ నీకేమైనా హెల్త్ ప్రాబ్లం ఉందా !?. హ హా హ హా...ఎందుకు నవ్వుతున్నావ్, ఇదోటి...ఏదైనా అడిగితె ఇలా అసహ్యంగా పల్లికిలిస్తావ్. వెంటనే నవ్వటం ఆపేసి,సమస్య అంటూ ఏం లేదమ్మా, అప్పుడప్పుడు నాలో నేనే మాట్లాడుకుంటా అంతే, దాన్నే అంతరంగిక శోధన అంటారు ఆధ్యాత్మిక బాషలో."
predicted_emotions = predict_emotions(model, tokenizer, new_paragraph)

print("\nPredicted Emotion Percentages:")
for emotion, percentage in predicted_emotions.items():
    print(f"{emotion}: {percentage}")



Predicted Emotion Percentages:
Happy: 1
Sad: 6
Anger: 2
Fear: 1
