In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import joblib




In [2]:
# Read the labelled corpus (CSV, path relative to the notebook) into a DataFrame.
file_path = 'sen_dataset.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Clean dataset: discard rows whose label cell merely repeats the column
# header text, then renumber the surviving rows from zero.
dataset = (
    dataset
    .loc[dataset['Sentiment Label'].ne('Sentiment Label')]
    .reset_index(drop=True)
)

In [4]:
# Encode labels: learn the label vocabulary first, then store each row's
# integer class id in a new column next to the original string label.
label_encoder = LabelEncoder()
label_encoder.fit(dataset['Sentiment Label'])
dataset['Encoded Label'] = label_encoder.transform(dataset['Sentiment Label'])

In [5]:
# Split dataset: 80/20 train/test, stratified on the encoded label so both
# parts keep the same class proportions; the seed is fixed for repeatability.
X, y = dataset['Tamil Transcript'], dataset['Encoded Label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Define dataset class for PyTorch
class TamilDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: The input sentences. Either a pandas Series (read by position
            through ``.iloc``, so the index labels do not matter) or any
            sequence supporting ``len()`` and integer indexing.
        labels: Integer class ids aligned with ``texts``; same container
            rules as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` that
            returns a mapping holding ``"input_ids"`` and ``"attention_mask"``
            tensors with a leading batch dimension of 1.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(container, idx):
        """Return the idx-th element by position for Series and plain sequences."""
        positional = getattr(container, "iloc", None)
        return container[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx`` as a dict of tensors."""
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [7]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Record the class names in the model config so a saved checkpoint can map
# predicted ids back to label names on its own (via model.config.id2label).
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Wrap the train and test splits in TamilDataset objects that share one
# tokenizer and one fixed sequence length.
max_length = 128
train_dataset, test_dataset = (
    TamilDataset(split_texts, split_labels, tokenizer, max_length)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

In [9]:
# Define a compute_metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            one row of class scores per example (or a tuple whose first
            element is that array); ``labels`` holds the true class ids.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    # Local import: numpy is not imported at notebook level.
    import numpy as np

    logits, labels = eval_pred
    # Some models return several outputs; the class scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # The inputs are already array-like, so no torch round trip is needed.
    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }

In [10]:
# Define training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written, and how often to log.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and restore the one with the best eval accuracy when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)



In [11]:
# Initialize the Trainer
# NOTE(review): the test split doubles as the evaluation set here, so the
# "best" checkpoint is selected on the same data that is reported later.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

  trainer = Trainer(


In [12]:
# Train the model
# Runs the fine-tuning loop with the arguments and datasets given to the
# Trainer; as the cell's last expression, the returned TrainOutput is shown.
trainer.train()

  0%|          | 0/560 [00:00<?, ?it/s]

{'loss': 2.0662, 'grad_norm': 3.549680233001709, 'learning_rate': 1.9642857142857145e-05, 'epoch': 0.18}
{'loss': 2.1014, 'grad_norm': 3.544799327850342, 'learning_rate': 1.928571428571429e-05, 'epoch': 0.36}
{'loss': 2.0974, 'grad_norm': 3.2842206954956055, 'learning_rate': 1.892857142857143e-05, 'epoch': 0.54}
{'loss': 1.9807, 'grad_norm': 8.861156463623047, 'learning_rate': 1.8571428571428575e-05, 'epoch': 0.71}
{'loss': 1.9166, 'grad_norm': 14.201337814331055, 'learning_rate': 1.8214285714285715e-05, 'epoch': 0.89}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 1.7222609519958496, 'eval_accuracy': 0.5315315315315315, 'eval_f1': 0.45737646878998944, 'eval_runtime': 2.9973, 'eval_samples_per_second': 74.068, 'eval_steps_per_second': 4.671, 'epoch': 1.0}
{'loss': 1.8175, 'grad_norm': 8.134785652160645, 'learning_rate': 1.785714285714286e-05, 'epoch': 1.07}
{'loss': 1.8045, 'grad_norm': 9.662507057189941, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.25}
{'loss': 1.7506, 'grad_norm': 11.8667631149292, 'learning_rate': 1.7142857142857142e-05, 'epoch': 1.43}
{'loss': 1.5715, 'grad_norm': 13.430989265441895, 'learning_rate': 1.678571428571429e-05, 'epoch': 1.61}
{'loss': 1.3971, 'grad_norm': 11.309673309326172, 'learning_rate': 1.642857142857143e-05, 'epoch': 1.79}
{'loss': 1.1807, 'grad_norm': 13.833097457885742, 'learning_rate': 1.6071428571428572e-05, 'epoch': 1.96}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 1.0140422582626343, 'eval_accuracy': 0.7477477477477478, 'eval_f1': 0.726659509697238, 'eval_runtime': 2.8546, 'eval_samples_per_second': 77.768, 'eval_steps_per_second': 4.904, 'epoch': 2.0}
{'loss': 1.1072, 'grad_norm': 5.860641956329346, 'learning_rate': 1.5714285714285715e-05, 'epoch': 2.14}
{'loss': 1.0449, 'grad_norm': 12.97945499420166, 'learning_rate': 1.535714285714286e-05, 'epoch': 2.32}
{'loss': 0.9684, 'grad_norm': 25.847293853759766, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.5}
{'loss': 0.8526, 'grad_norm': 18.596006393432617, 'learning_rate': 1.4642857142857144e-05, 'epoch': 2.68}
{'loss': 0.7108, 'grad_norm': 7.087488174438477, 'learning_rate': 1.4285714285714287e-05, 'epoch': 2.86}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.5697405338287354, 'eval_accuracy': 0.7927927927927928, 'eval_f1': 0.7580934582814814, 'eval_runtime': 2.8698, 'eval_samples_per_second': 77.358, 'eval_steps_per_second': 4.878, 'epoch': 3.0}
{'loss': 0.7705, 'grad_norm': 8.790425300598145, 'learning_rate': 1.3928571428571429e-05, 'epoch': 3.04}
{'loss': 0.5895, 'grad_norm': 30.962587356567383, 'learning_rate': 1.3571428571428574e-05, 'epoch': 3.21}
{'loss': 0.5916, 'grad_norm': 10.077142715454102, 'learning_rate': 1.3214285714285716e-05, 'epoch': 3.39}
{'loss': 0.5561, 'grad_norm': 4.258518218994141, 'learning_rate': 1.2857142857142859e-05, 'epoch': 3.57}
{'loss': 0.4217, 'grad_norm': 6.547934055328369, 'learning_rate': 1.25e-05, 'epoch': 3.75}
{'loss': 0.4141, 'grad_norm': 9.776383399963379, 'learning_rate': 1.2142857142857142e-05, 'epoch': 3.93}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.3377887010574341, 'eval_accuracy': 0.9054054054054054, 'eval_f1': 0.9049547156935406, 'eval_runtime': 2.8669, 'eval_samples_per_second': 77.436, 'eval_steps_per_second': 4.883, 'epoch': 4.0}
{'loss': 0.3519, 'grad_norm': 50.84397888183594, 'learning_rate': 1.1785714285714287e-05, 'epoch': 4.11}
{'loss': 0.3405, 'grad_norm': 46.98176193237305, 'learning_rate': 1.1428571428571429e-05, 'epoch': 4.29}
{'loss': 0.3416, 'grad_norm': 34.71142578125, 'learning_rate': 1.1071428571428572e-05, 'epoch': 4.46}
{'loss': 0.3511, 'grad_norm': 7.940845489501953, 'learning_rate': 1.0714285714285714e-05, 'epoch': 4.64}
{'loss': 0.3365, 'grad_norm': 3.5514538288116455, 'learning_rate': 1.0357142857142859e-05, 'epoch': 4.82}
{'loss': 0.3165, 'grad_norm': 5.209074974060059, 'learning_rate': 1e-05, 'epoch': 5.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2447182536125183, 'eval_accuracy': 0.9279279279279279, 'eval_f1': 0.9286617580925853, 'eval_runtime': 1.3142, 'eval_samples_per_second': 168.929, 'eval_steps_per_second': 10.653, 'epoch': 5.0}
{'loss': 0.194, 'grad_norm': 3.4705047607421875, 'learning_rate': 9.642857142857144e-06, 'epoch': 5.18}
{'loss': 0.1609, 'grad_norm': 6.336822032928467, 'learning_rate': 9.285714285714288e-06, 'epoch': 5.36}
{'loss': 0.2363, 'grad_norm': 9.156965255737305, 'learning_rate': 8.92857142857143e-06, 'epoch': 5.54}
{'loss': 0.1947, 'grad_norm': 82.60816955566406, 'learning_rate': 8.571428571428571e-06, 'epoch': 5.71}
{'loss': 0.2728, 'grad_norm': 15.034436225891113, 'learning_rate': 8.214285714285714e-06, 'epoch': 5.89}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.25204381346702576, 'eval_accuracy': 0.9234234234234234, 'eval_f1': 0.923922939259063, 'eval_runtime': 2.8665, 'eval_samples_per_second': 77.447, 'eval_steps_per_second': 4.884, 'epoch': 6.0}
{'loss': 0.2214, 'grad_norm': 7.526901721954346, 'learning_rate': 7.857142857142858e-06, 'epoch': 6.07}
{'loss': 0.1412, 'grad_norm': 5.101956367492676, 'learning_rate': 7.500000000000001e-06, 'epoch': 6.25}
{'loss': 0.1275, 'grad_norm': 1.3173834085464478, 'learning_rate': 7.1428571428571436e-06, 'epoch': 6.43}
{'loss': 0.1055, 'grad_norm': 20.31029510498047, 'learning_rate': 6.785714285714287e-06, 'epoch': 6.61}
{'loss': 0.2187, 'grad_norm': 35.20564651489258, 'learning_rate': 6.4285714285714295e-06, 'epoch': 6.79}
{'loss': 0.1159, 'grad_norm': 1.2105159759521484, 'learning_rate': 6.071428571428571e-06, 'epoch': 6.96}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2783849835395813, 'eval_accuracy': 0.9279279279279279, 'eval_f1': 0.928281950880821, 'eval_runtime': 2.8799, 'eval_samples_per_second': 77.087, 'eval_steps_per_second': 4.861, 'epoch': 7.0}
{'loss': 0.0891, 'grad_norm': 9.126173973083496, 'learning_rate': 5.7142857142857145e-06, 'epoch': 7.14}
{'loss': 0.1089, 'grad_norm': 4.319641590118408, 'learning_rate': 5.357142857142857e-06, 'epoch': 7.32}
{'loss': 0.1396, 'grad_norm': 12.745698928833008, 'learning_rate': 5e-06, 'epoch': 7.5}
{'loss': 0.1208, 'grad_norm': 27.966938018798828, 'learning_rate': 4.642857142857144e-06, 'epoch': 7.68}
{'loss': 0.1074, 'grad_norm': 13.827014923095703, 'learning_rate': 4.2857142857142855e-06, 'epoch': 7.86}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2477552741765976, 'eval_accuracy': 0.9324324324324325, 'eval_f1': 0.9326275165910913, 'eval_runtime': 2.8637, 'eval_samples_per_second': 77.522, 'eval_steps_per_second': 4.889, 'epoch': 8.0}
{'loss': 0.0617, 'grad_norm': 1.3675810098648071, 'learning_rate': 3.928571428571429e-06, 'epoch': 8.04}
{'loss': 0.0725, 'grad_norm': 1.8335506916046143, 'learning_rate': 3.5714285714285718e-06, 'epoch': 8.21}
{'loss': 0.1063, 'grad_norm': 1.872449278831482, 'learning_rate': 3.2142857142857147e-06, 'epoch': 8.39}
{'loss': 0.0842, 'grad_norm': 0.826352596282959, 'learning_rate': 2.8571428571428573e-06, 'epoch': 8.57}
{'loss': 0.0445, 'grad_norm': 11.768636703491211, 'learning_rate': 2.5e-06, 'epoch': 8.75}
{'loss': 0.0507, 'grad_norm': 7.975843906402588, 'learning_rate': 2.1428571428571427e-06, 'epoch': 8.93}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.26919621229171753, 'eval_accuracy': 0.9279279279279279, 'eval_f1': 0.9284708794155863, 'eval_runtime': 2.8794, 'eval_samples_per_second': 77.101, 'eval_steps_per_second': 4.862, 'epoch': 9.0}
{'loss': 0.0568, 'grad_norm': 1.933126449584961, 'learning_rate': 1.7857142857142859e-06, 'epoch': 9.11}
{'loss': 0.0573, 'grad_norm': 1.282114863395691, 'learning_rate': 1.4285714285714286e-06, 'epoch': 9.29}
{'loss': 0.0704, 'grad_norm': 2.0959694385528564, 'learning_rate': 1.0714285714285714e-06, 'epoch': 9.46}
{'loss': 0.0398, 'grad_norm': 0.7674557566642761, 'learning_rate': 7.142857142857143e-07, 'epoch': 9.64}
{'loss': 0.1048, 'grad_norm': 9.13603687286377, 'learning_rate': 3.5714285714285716e-07, 'epoch': 9.82}
{'loss': 0.0683, 'grad_norm': 1.1718531847000122, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2801665961742401, 'eval_accuracy': 0.9279279279279279, 'eval_f1': 0.92844058922524, 'eval_runtime': 1.5171, 'eval_samples_per_second': 146.334, 'eval_steps_per_second': 9.228, 'epoch': 10.0}
{'train_runtime': 1765.3519, 'train_samples_per_second': 5.019, 'train_steps_per_second': 0.317, 'train_loss': 0.5914577308510031, 'epoch': 10.0}


TrainOutput(global_step=560, training_loss=0.5914577308510031, metrics={'train_runtime': 1765.3519, 'train_samples_per_second': 5.019, 'train_steps_per_second': 0.317, 'total_flos': 582822383493120.0, 'train_loss': 0.5914577308510031, 'epoch': 10.0})

In [13]:
# Evaluate the model
# With no arguments, evaluate() scores the eval dataset the Trainer was built
# with and returns a dict of metrics (loss plus the compute_metrics values).
results = trainer.evaluate()
print("Evaluation Results:", results)

  0%|          | 0/14 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.2477552741765976, 'eval_accuracy': 0.9324324324324325, 'eval_f1': 0.9326275165910913, 'eval_runtime': 1.4965, 'eval_samples_per_second': 148.348, 'eval_steps_per_second': 9.355, 'epoch': 10.0}


In [None]:
# Generate classification report
test_logits = trainer.predict(test_dataset).predictions
y_pred = torch.argmax(torch.tensor(test_logits), axis=1).numpy()
# Pass the full id list explicitly: without `labels`, the report infers the
# classes from the data and raises if any class in target_names is absent
# from both y_test and y_pred.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
    )
)

  0%|          | 0/14 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       Anger       0.88      0.85      0.87        27
    Calmness       0.97      1.00      0.98        30
  Excitement       0.96      0.93      0.95        28
 Frustration       0.81      0.88      0.85        25
   Gratitude       0.96      0.96      0.96        26
Indifference       0.93      0.93      0.93        28
         Joy       0.96      0.96      0.96        28
     Sadness       0.97      0.93      0.95        30

    accuracy                           0.93       222
   macro avg       0.93      0.93      0.93       222
weighted avg       0.93      0.93      0.93       222



In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory
# so both can later be reloaded from a single path.
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

('./sentiment_model\\tokenizer_config.json',
 './sentiment_model\\special_tokens_map.json',
 './sentiment_model\\sentencepiece.bpe.model',
 './sentiment_model\\added_tokens.json',
 './sentiment_model\\tokenizer.json')

In [None]:
# Persist the fitted LabelEncoder so predicted class ids can be mapped back
# to label names at inference time.
joblib.dump(label_encoder, "./label_encoder.pkl")

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import joblib

In [2]:
# Reload the fine-tuned model and its tokenizer from the export directory.
saved_model_path = "./sentiment_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_path)

# Restore the id -> label-name mapping.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# that come from a trusted source.
label_encoder = joblib.load("./label_encoder.pkl")

In [3]:
def predict_sentiment(texts, max_length=128, model=None, tokenizer=None, encoder=None):
    """Predict a sentiment label for each input text.

    Args:
        texts: A single string or a sequence of strings to classify.
        max_length: Token length at which inputs are truncated.
        model: Classifier to call with the tokenizer output; defaults to the
            notebook-level ``loaded_model``.
        tokenizer: Tokenizer callable; defaults to the notebook-level
            ``loaded_tokenizer``.
        encoder: Object whose ``inverse_transform`` maps class ids to label
            names; defaults to the notebook-level ``label_encoder``.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the predicted ids:
        one label per input text, in order.
    """
    model = loaded_model if model is None else model
    tokenizer = loaded_tokenizer if tokenizer is None else tokenizer
    encoder = label_encoder if encoder is None else encoder

    # An empty batch has nothing to tokenize; return an empty result instead
    # of handing the tokenizer an input it cannot turn into tensors.
    if not isinstance(texts, str) and len(texts) == 0:
        return encoder.inverse_transform([])

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encodings)
    predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
    return encoder.inverse_transform(predictions)
# Sample Tamil sentences used as a quick check of the reloaded model.
new_texts = [
    "எனக்கு இன்று மிகவும் மகிழ்ச்சி.",
    "நான் மிகவும் கோபமாக உள்ளேன்.",
    "நான் இன்று சோர்வாக இறுக்கிறேன்."
]
# One predicted label per sentence, in the same order as new_texts.
predictions = predict_sentiment(new_texts)
for text, sentiment in zip(new_texts, predictions):
    print(f"Text: {text} => Predicted Sentiment: {sentiment}")

Text: எனக்கு இன்று மிகவும் மகிழ்ச்சி. => Predicted Sentiment: Joy
Text: நான் மிகவும் கோபமாக உள்ளேன். => Predicted Sentiment: Anger
Text: நான் இன்று சோர்வாக இறுக்கிறேன். => Predicted Sentiment: Sadness


In [2]:
#Hugging face
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib  
import os


# Location of the pickled LabelEncoder. The default is the original local
# path; set TAMIL_LABEL_ENCODER_PATH to use the file from another machine
# or directory without editing this cell.
label_encoder_path = os.environ.get(
    "TAMIL_LABEL_ENCODER_PATH",
    "C:/Users/lenovo/Desktop/model/Tamil/Tamilsent/label_encoder.pkl",
)
if not os.path.isfile(label_encoder_path):
    raise FileNotFoundError(
        f"Label encoder not found at {label_encoder_path!r}; "
        "set TAMIL_LABEL_ENCODER_PATH to the location of label_encoder.pkl"
    )

# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# that come from a trusted source.
label_encoder = joblib.load(label_encoder_path)

# Fetch the published tokenizer and classifier from the Hugging Face Hub.
model_name = "Rajkumar57/tamilsentiment-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

text = "இந்த ஒரு உதாரணம்."

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predicted_class_idx = torch.argmax(logits, dim=-1).item()

# Map the predicted class id back to its label name.
predicted_class_label = label_encoder.inverse_transform([predicted_class_idx])[0]

print(f"Predicted sentiment label: {predicted_class_label}")

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Predicted sentiment label: Indifference
