# Konkani Emotion Detection - Kaggle Training

**Model:** DistilBERT fine-tuned for Konkani emotions  
**Time:** ~1 hour on P100  
**Parallel:** Runs while ASR trains on main account!

## Step 1: Check GPU

In [None]:
!nvidia-smi

import torch
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## Step 2: Upload Dataset

**Before running:**
1. Upload your emotion dataset as a Kaggle dataset (a CSV file with `text` and `emotion` columns)
2. Add it to this notebook
3. Update DATASET_PATH below

In [None]:
import os

# Mount point of the Kaggle dataset attached to this notebook.
DATASET_PATH = "/kaggle/input/konkani-emotion-data"  # UPDATE THIS

print(f"Dataset path: {DATASET_PATH}")
if os.path.isdir(DATASET_PATH):
    print("✅ Dataset found!")
    # List the contents in pure Python instead of `!ls -la {DATASET_PATH}` so a
    # path containing spaces or shell metacharacters cannot break the command.
    with os.scandir(DATASET_PATH) as entries:
        for entry in sorted(entries, key=lambda e: e.name):
            if entry.is_dir():
                print(f"  {entry.name}/")
            else:
                print(f"  {entry.name}  ({entry.stat().st_size / 1e6:.2f} MB)")
else:
    print("❌ Dataset not found. Please add it to this notebook.")

## Step 3: Install Dependencies

In [None]:
!pip install -q transformers datasets accelerate
print("‚úÖ Dependencies installed!")

## Step 4: Load Data

In [None]:
import pandas as pd
from pathlib import Path

# Columns the rest of the notebook reads from the CSV.
REQUIRED_COLUMNS = ['text', 'emotion']

# Find CSV file (sorted so the pick is deterministic when several are present)
csv_files = sorted(Path(DATASET_PATH).glob('*.csv'))
if not csv_files:
    # Stop here with a clear message; otherwise the cell would continue and
    # fail with a confusing NameError on `df` a few lines further down.
    raise FileNotFoundError(f"❌ No CSV file found in {DATASET_PATH}")

df = pd.read_csv(csv_files[0])
print(f"✅ Loaded: {csv_files[0].name}")

# Validate the schema before any downstream cell indexes these columns.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"CSV is missing required column(s) {missing_columns}; found {df.columns.tolist()}"
    )

# Rows without text or label cannot be tokenized or stratified; drop them now
# and report how many were removed so the loss is visible.
rows_before = len(df)
df = df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} row(s) with missing text/emotion")

print(f"\nDataset size: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nEmotion distribution:")
print(df['emotion'].value_counts())
print(f"\nSample:")
print(df.head(3))

## Step 5: Prepare Training Data

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

# Tunables for this cell
VAL_FRACTION = 0.2   # share of rows held out for validation
SPLIT_SEED = 42      # fixed seed so the split is reproducible
MAX_LENGTH = 128     # tokens per example; longer texts are truncated

# Split (stratified so both parts keep the same emotion proportions)
train_df, val_df = train_test_split(
    df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=df['emotion'],
)

print(f"Training: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")

# Label mapping
# Labels are normalised to plain `str`: a numeric emotion column would
# otherwise yield numpy scalars as mapping keys/values, which cannot be
# JSON-serialised when the model config and label mapping are saved.
emotions = sorted({str(label) for label in df['emotion'].unique()})
label2id = {label: i for i, label in enumerate(emotions)}
id2label = {i: label for label, i in label2id.items()}

print(f"\nEmotions ({len(emotions)}): {emotions}")

# Tokenizer
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create datasets
def prepare_dataset(df):
    """Convert a frame with `text` and `emotion` columns into a `Dataset`.

    The result has a `text` column (copied as-is) and an integer `label`
    column obtained by looking each emotion up in `label2id`.
    """
    return Dataset.from_dict({
        'text': df['text'].tolist(),
        'label': [label2id[str(e)] for e in df['emotion'].tolist()]
    })

train_dataset = prepare_dataset(train_df)
val_dataset = prepare_dataset(val_df)

# Tokenize
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating to MAX_LENGTH tokens.

    Fixed-length padding is used so every example has the same length and
    batches can be stacked without a padding collator.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

print("\n✅ Data prepared!")

## Step 6: Train Model

In [None]:
import math

import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Hyper-parameters a reader is likely to tune
NUM_EPOCHS = 3
TRAIN_BATCH_SIZE = 32  # Larger for P100
EVAL_BATCH_SIZE = 64
MAX_WARMUP_STEPS = 500

# Model: pretrained encoder with a fresh classification head sized to the
# number of emotions; the label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotions),
    id2label=id2label,
    label2id=label2id
)

# Metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1_score(labels, predicted, average='weighted')
    }

# A fixed 500-step warm-up can be longer than the whole run on a small
# dataset, in which case the learning rate never reaches its peak. Cap the
# warm-up at ~10% of the planned optimisation steps instead.
steps_per_epoch = math.ceil(len(train_dataset) / TRAIN_BATCH_SIZE)
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = min(MAX_WARMUP_STEPS, total_steps // 10)

# Training config
training_args = TrainingArguments(
    output_dir='/kaggle/working/emotion_model',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Mixed precision needs a CUDA device; fall back to fp32 without one.
    fp16=torch.cuda.is_available(),
    # Disable external experiment trackers: an installed tracker such as W&B
    # would otherwise ask for credentials and stall a non-interactive run.
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("="*70)
print("🚀 STARTING EMOTION DETECTION TRAINING")
print("="*70)
print(f"Model: {model_name}")
print(f"Emotions: {len(emotions)}")
print(f"Training samples: {len(train_dataset)}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Warm-up steps: {warmup_steps} of ~{total_steps}")
print("="*70)
print()

trainer.train()

print("\n✅ Training complete!")

## Step 7: Evaluate

In [None]:
# Evaluate
# A single prediction pass yields both the aggregate metrics and the raw
# logits; the 'eval' prefix keeps the metric keys as `eval_accuracy` / `eval_f1`.
predictions = trainer.predict(val_dataset, metric_key_prefix='eval')
results = predictions.metrics

print("="*70)
print("FINAL RESULTS")
print("="*70)
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")
print("="*70)

# Detailed report
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print("\nClassification Report:")
# Pass the full label id list explicitly: without it, a class that is absent
# from both the true and predicted labels makes `target_names` mismatch the
# inferred classes and raises. zero_division=0 reports 0 for such classes.
print(classification_report(
    true_labels,
    pred_labels,
    labels=list(range(len(emotions))),
    target_names=emotions,
    zero_division=0,
))

## Step 8: Save Model

In [None]:
# Save
model.save_pretrained('/kaggle/working/konkani_emotion_model')
tokenizer.save_pretrained('/kaggle/working/konkani_emotion_model')

# Save label mapping
import json
with open('/kaggle/working/konkani_emotion_model/label_mapping.json', 'w') as f:
    json.dump({'label2id': label2id, 'id2label': id2label}, f, indent=2)

print("‚úÖ Model saved to: /kaggle/working/konkani_emotion_model")
print("\nDownload from Output tab after notebook finishes!")

!ls -lh /kaggle/working/konkani_emotion_model/

## Step 9: Test Model

In [None]:
import torch
from transformers import pipeline

# Load pipeline from the exported directory to verify the saved artefacts work.
# device=0 selects the first GPU; -1 falls back to CPU when none is attached.
classifier = pipeline(
    'text-classification',
    model='/kaggle/working/konkani_emotion_model',
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test
# Konkani (Devanagari) smoke-test sentences. They were stored as mis-decoded
# bytes, which would have probed the model with meaningless input.
test_texts = [
    "हांव खूश आसा",
    "हांव दुखी आसा",
    "हांव रागीत आसा",
    "हांव भयभीत आसा",
]

print("="*70)
print("TESTING EMOTION DETECTION")
print("="*70)

for text in test_texts:
    # The pipeline returns a list with one {'label', 'score'} dict per input.
    result = classifier(text)[0]
    print(f"\nText: {text}")
    print(f"Emotion: {result['label']}")
    print(f"Confidence: {result['score']:.4f}")

print("\n" + "="*70)
print("✅ EMOTION DETECTION MODEL READY!")
print("="*70)