# Konkani Emotion Detection Training - Google Colab

Train an emotion classifier for Konkani text while the ASR model trains on Kaggle!

**Model:** DistilBERT fine-tuned for Konkani emotion classification  
**Time:** ~1-2 hours  
**GPU:** T4 (free on Colab)

## Step 1: Check GPU

In [None]:
!nvidia-smi

import torch
print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("‚ö†Ô∏è No GPU! Go to Runtime ‚Üí Change runtime type ‚Üí GPU")

## Step 2: Mount Google Drive

In [None]:
import os

from google.colab import drive

# Make Google Drive visible inside the Colab filesystem.
drive.mount('/content/drive')

# Switch into the project folder on Drive so the relative paths used in later
# cells resolve against it. Edit this path to match your own Drive layout.
os.chdir('/content/drive/MyDrive/konkani')
print(f"Working directory: {os.getcwd()}")

## Step 3: Install Dependencies

In [None]:
!pip install -q transformers datasets accelerate
print("‚úÖ Dependencies installed!")

## Step 4: Load Emotion Dataset

In [None]:
import pandas as pd
import json

# Load your Konkani emotion dataset
# Update path to your emotion data
df = pd.read_csv('data/generated/konkani_sentiment.csv')

# Later cells read the 'text' and 'emotion' columns, so fail early with a
# clear message if the CSV does not provide them.
required_columns = ['text', 'emotion']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# Rows with no text or no label cannot be tokenized or mapped to a class id
# (and NaN labels break sorting/stratification later), so drop them up front
# and report how many were removed.
rows_before = len(df)
df = df.dropna(subset=required_columns)
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing text/emotion")

# Quick overview: size, schema, class balance and a few example rows.
print(f"Dataset size: {len(df)}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nEmotion distribution:")
print(df['emotion'].value_counts())
print("\nSample:")
print(df.head())

## Step 5: Prepare Data for Training

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Split data
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# emotion proportions the same in both splits; the fixed seed makes the split
# reproducible across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['emotion'])

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

# Create label mapping
# Sorting the distinct labels gives every emotion a stable integer id.
emotions = sorted(df['emotion'].unique())
label2id = {label: i for i, label in enumerate(emotions)}
id2label = dict(enumerate(emotions))

print(f"\nEmotions: {emotions}")
print(f"Label mapping: {label2id}")

# Load tokenizer
# The tokenizer must come from the same checkpoint as the model loaded later.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"\n✅ Using model: {model_name}")

## Step 6: Create Dataset

In [None]:
from datasets import Dataset

def prepare_dataset(df):
    """Convert a DataFrame split into a ``datasets.Dataset``.

    Reads the ``text`` and ``emotion`` columns of ``df`` and returns a dataset
    with a ``text`` column and an integer ``label`` column, where each emotion
    name is translated through the module-level ``label2id`` mapping.

    Raises:
        KeyError: if an emotion value is not a key of ``label2id``.
    """
    texts = df['text'].tolist()
    label_ids = [label2id[emotion] for emotion in df['emotion'].tolist()]
    return Dataset.from_dict({'text': texts, 'label': label_ids})

# Wrap both DataFrame splits as Dataset objects with integer labels.
train_dataset, val_dataset = prepare_dataset(train_df), prepare_dataset(val_df)

# Tokenize
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the shared ``tokenizer``.

    Sequences are truncated to, and padded up to, a fixed length of 128 tokens,
    so every example comes out the same length.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding='max_length',
    )

# Run the tokenizer over both splits; batched=True passes many rows per call
# to tokenize_function instead of one row at a time.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

print("✅ Datasets prepared and tokenized!")

## Step 7: Load Model and Train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load model
# Start from the pretrained checkpoint named by `model_name` and size the
# classification head to one output per emotion. Both label maps are handed
# to the model so predictions can be reported by emotion name.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(emotions),
)

# Metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(model outputs, true labels)``. The predicted
    class for each row is the arg-max of the outputs along axis 1.

    Returns:
        dict with ``accuracy`` and the support-weighted ``f1`` score.
    """
    raw_outputs, labels = eval_pred
    predicted_ids = np.argmax(raw_outputs, axis=1)
    accuracy = accuracy_score(labels, predicted_ids)
    weighted_f1 = f1_score(labels, predicted_ids, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

# Training arguments
import torch  # imported here so this cell does not rely on an earlier cell having run

training_args = TrainingArguments(
    output_dir='./emotion_model',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # NOTE(review): with a small dataset, 500 warmup steps may exceed the total
    # number of optimizer steps (3 epochs at batch size 16) - confirm.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',  # the 'f1' key returned by compute_metrics
    # Mixed precision requires a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
)

# Trainer
# Wire the model, the training configuration, both tokenized splits and the
# metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("🚀 Starting training...\n")
trainer.train()

print("\n✅ Training complete!")

## Step 8: Evaluate and Save

In [None]:
# Evaluate
# Run evaluation on the validation split; the metrics from compute_metrics
# are read back under 'eval_'-prefixed keys.
results = trainer.evaluate()
print("\nValidation Results:")
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")

# Save model
# Weights and tokenizer go into the same folder so it can be reloaded by path.
save_dir = './konkani_emotion_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"\n✅ Model saved to: {save_dir}")

## Step 9: Test the Model

In [None]:
from transformers import pipeline

# Load pipeline
# Reload the saved model from disk to confirm the exported folder is usable.
classifier = pipeline('text-classification', model='./konkani_emotion_model', tokenizer=tokenizer)

# Test samples
# Konkani sentences in Devanagari script, one per expected emotion.
test_texts = [
    "हांव खूश आसा",  # I am happy
    "हांव दुखी आसा",  # I am sad
    "हांव रागीत आसा",  # I am angry
]

print("Testing emotion detection:\n")
for text in test_texts:
    # The pipeline returns a list of predictions per input; take the first.
    result = classifier(text)[0]
    print(f"Text: {text}")
    print(f"Emotion: {result['label']} (confidence: {result['score']:.4f})\n")