# Generate Emotion Labels for Konkani Text

**Strategy:** Use a pre-trained multilingual emotion model to label Konkani text  
**Then:** Train a custom Konkani-specific emotion model on the labeled data

## Step 1: Install Dependencies

In [None]:
!pip install -q transformers torch pandas tqdm
print("✅ Dependencies installed!")

## Step 2: Load Pre-trained Emotion Model

In [None]:
from transformers import pipeline
import torch

# Build a text-classification pipeline that returns a score for every label
# (top_k=None) instead of only the best one.
# NOTE(review): the checkpoint name contains "emotion-english", so it is
# presumably English-only — confirm its labels are meaningful for Konkani.
has_gpu = torch.cuda.is_available()
device_index = 0 if has_gpu else -1  # first CUDA device when available, else CPU

emotion_classifier = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device=device_index,
)

device_name = 'GPU' if has_gpu else 'CPU'
print("✅ Pre-trained emotion model loaded!")
print(f"Device: {device_name}")

## Step 3: Load Your Konkani Text Data

In [None]:
import pandas as pd
import json

# Load the Konkani text to label (from ASR transcripts, NER data, or other sources).
# Option 1: From ASR transcripts — a JSON-lines manifest, one object per line,
# each carrying a 'text' field.
# encoding='utf-8' is explicit because the platform default encoding (for
# example cp1252 on Windows) may fail to decode non-ASCII text.
with open('data/konkani-asr-v0/splits/manifests/train.json', 'r', encoding='utf-8') as f:
    # Skip blank lines (such as a trailing newline) that json.loads would reject.
    asr_data = [json.loads(line) for line in f if line.strip()]
konkani_texts = [item['text'] for item in asr_data]

# Option 2: From generated data
# df = pd.read_json('data/generated/konkani_large_dataset.json')
# konkani_texts = df['text'].tolist()

# Option 3: From custom corpus (one text per line, newlines stripped)
# with open('your_konkani_corpus.txt', 'r', encoding='utf-8') as f:
#     konkani_texts = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(konkani_texts)} Konkani texts")
print("\nSamples:")
# Preview at most the first three texts, each clipped to 100 characters.
for i, text in enumerate(konkani_texts[:3], start=1):
    print(f"  {i}. {text[:100]}...")

## Step 4: Generate Emotion Labels

In [None]:
from tqdm import tqdm

# Label the corpus in slices of `batch_size` texts so tqdm can report progress.
batch_size = 32
labeled_data = []

print("Generating emotion labels...\n")

for start in tqdm(range(0, len(konkani_texts), batch_size)):
    batch = konkani_texts[start:start + batch_size]

    # truncation=True clips any input that tokenizes to more than the model's
    # maximum sequence length; without it a single over-long text raises
    # inside the model and aborts the whole labeling run.
    results = emotion_classifier(batch, truncation=True)

    # Each `result` is the list of {'label', 'score'} dicts for one text
    # (the pipeline was created with top_k=None, so every label is present).
    for text, result in zip(batch, results):
        # Pick the label with the highest score as the predicted emotion.
        top_emotion = max(result, key=lambda x: x['score'])

        labeled_data.append({
            'text': text,
            'emotion': top_emotion['label'],
            'confidence': top_emotion['score'],
            'all_scores': {r['label']: r['score'] for r in result},
        })

print(f"\n✅ Generated {len(labeled_data)} labeled samples!")

## Step 5: Filter High-Confidence Samples

In [None]:
# Keep only predictions whose top-label score reaches the threshold.
confidence_threshold = 0.7

high_conf_data = [d for d in labeled_data if d['confidence'] >= confidence_threshold]

print(f"Total samples: {len(labeled_data)}")
print(f"High confidence (≥{confidence_threshold}): {len(high_conf_data)}")
print(f"Filtered out: {len(labeled_data) - len(high_conf_data)}")

# Naming the columns explicitly keeps 'emotion' and 'confidence' present even
# when no sample passes the threshold; a DataFrame built from an empty list has
# no columns, so the column lookups would otherwise raise KeyError.
df = pd.DataFrame(high_conf_data, columns=['text', 'emotion', 'confidence', 'all_scores'])

if df.empty:
    # Nothing to summarise; tell the user how to recover instead of crashing.
    print(f"\n⚠️ No samples reached confidence ≥{confidence_threshold}; "
          "lower confidence_threshold before continuing.")
else:
    # Show the per-emotion counts and the mean confidence of the kept samples.
    print("\nEmotion distribution:")
    print(df['emotion'].value_counts())

    print(f"\nAverage confidence: {df['confidence'].mean():.4f}")

## Step 6: Balance Dataset (Optional)

In [None]:
# Balance classes by undersampling majority or oversampling minority
from sklearn.utils import resample

# Balance the classes so every emotion ends up with exactly `target_samples`
# rows. The target is twice the size of the smallest class, capped at 500.
min_samples = df['emotion'].value_counts().min()
target_samples = min(min_samples * 2, 500)  # At most 500 per class

balanced_dfs = []
for emotion in df['emotion'].unique():
    emotion_df = df[df['emotion'] == emotion]

    if len(emotion_df) > target_samples:
        # Undersample WITHOUT replacement: resample() defaults to replace=True,
        # which would duplicate some rows while discarding other unique ones.
        emotion_df = resample(
            emotion_df, replace=False, n_samples=target_samples, random_state=42
        )
    elif len(emotion_df) < target_samples:
        # Oversample with replacement, since more rows are needed than exist.
        emotion_df = resample(
            emotion_df, replace=True, n_samples=target_samples, random_state=42
        )

    balanced_dfs.append(emotion_df)

# Concatenate the per-class frames, then shuffle deterministically so the
# classes are interleaved, and renumber the index.
balanced_df = pd.concat(balanced_dfs).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Balanced dataset size: {len(balanced_df)}")
print("\nBalanced distribution:")
print(balanced_df['emotion'].value_counts())

## Step 7: Save Training Data

In [None]:
import os

# Write two artefacts: a CSV of the balanced training set and a JSON dump of
# every labeled sample, including its per-label scores.
output_dir = 'data/generated'
csv_path = f'{output_dir}/konkani_emotion_training_data.csv'
json_path = f'{output_dir}/konkani_emotion_full_data.json'

os.makedirs(output_dir, exist_ok=True)

# CSV for training: only the columns the trainer needs, without the index.
training_columns = ['text', 'emotion', 'confidence']
balanced_df[training_columns].to_csv(csv_path, index=False)

# Full JSON: ensure_ascii=False keeps non-ASCII text readable in the file.
with open(json_path, 'w', encoding='utf-8') as out_file:
    json.dump(labeled_data, out_file, ensure_ascii=False, indent=2)

print("✅ Saved training data:")
print(f"   - {csv_path}")
print(f"   - {json_path}")

print(f"\nReady for training with {len(balanced_df)} samples!")

## Step 8: Verify Data Quality

In [None]:
# Print a banner, then up to three example rows for every emotion label so the
# generated labels can be inspected by eye.
banner = "=" * 70
print(banner)
print("SAMPLE LABELED DATA")
print(banner)

for emotion in balanced_df['emotion'].unique():
    print(f"\n{emotion.upper()}:")
    preview = balanced_df.loc[balanced_df['emotion'] == emotion].head(3)
    # Show each text clipped to 80 characters alongside its confidence.
    for text, confidence in zip(preview['text'], preview['confidence']):
        print(f"  • {text[:80]}... (conf: {confidence:.3f})")

print("\n" + banner)
print("✅ DATA GENERATION COMPLETE!")
print(banner)
print("\nNext step: Upload this data to Kaggle and train custom model!")