# Konkani Emotion + Translation - Dual GPU Pipeline

**Strategy:** Train both models simultaneously on 2 GPUs!  
**GPU 0:** Emotion model  
**GPU 1:** Translation model  
**Time:** ~2 hours with both GPUs training in parallel (longer on a single GPU, where the models train sequentially)

## Step 1: Check Dual GPU Setup

In [None]:
!nvidia-smi

import torch
# Detect the visible CUDA devices and decide which device each model uses.
# Defines three notebook-level names read by later cells:
#   num_gpus           - number of CUDA devices reported by torch
#   emotion_device     - torch.device for the emotion model
#   translation_device - torch.device for the translation model
num_gpus = torch.cuda.device_count()

print(f"\n{'='*70}")
print("DUAL GPU SETUP")
print(f"{'='*70}")
print(f"Available GPUs: {num_gpus}")

for i in range(num_gpus):
    print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

if num_gpus >= 2:
    # One model per GPU so both can be trained at the same time.
    print("\n✅ Dual GPU detected!")
    print("GPU 0: Emotion model training")
    print("GPU 1: Translation model training")
    emotion_device = torch.device('cuda:0')
    translation_device = torch.device('cuda:1')
elif num_gpus == 1:
    # Both models share the only GPU, so they have to be trained one after the other.
    print("\n⚠️ Single GPU - will train sequentially")
    emotion_device = torch.device('cuda:0')
    translation_device = torch.device('cuda:0')
else:
    # No CUDA device at all: pointing at 'cuda:0' here would only defer the
    # failure to the first .to(device) call, so fall back to the CPU explicitly.
    print("\n⚠️ No GPU detected - falling back to CPU (training will be very slow)")
    emotion_device = torch.device('cpu')
    translation_device = torch.device('cpu')

print(f"{'='*70}")

## Step 2: Install Dependencies

In [None]:
# Notebook shell escape: install the third-party packages this notebook relies on
# (-q keeps pip's output short).
!pip install -q transformers torch pandas tqdm scikit-learn sentencepiece
# NOTE(review): this message is printed unconditionally after the shell command -
# presumably even when pip failed; check pip's output above before trusting it.
print("✅ Dependencies installed!")

## Step 3: Load Konkani Text

In [None]:
import json
import pandas as pd

DATASET_PATH = "/kaggle/input/konkani-text-data"  # UPDATE THIS

# Load the raw Konkani sentences into `konkani_texts` (a list of unique strings).
#
# Preferred source: `train.json`, read as JSON Lines (one object per line, each
# with a 'text' field). If that file is missing or malformed, fall back to
# `konkani_corpus.txt`, read as one sentence per line.
konkani_texts = []

try:
    # Explicit UTF-8: the corpus is non-ASCII and must not depend on the
    # platform's default encoding.
    with open(f'{DATASET_PATH}/train.json', 'r', encoding='utf-8') as f:
        # Skip blank lines so a trailing newline does not abort the whole parse.
        data = [json.loads(line) for line in f if line.strip()]
        konkani_texts = [item['text'] for item in data]
except (OSError, ValueError, KeyError, TypeError) as exc:
    # OSError: file missing/unreadable; ValueError: invalid JSON or bad encoding;
    # KeyError/TypeError: a record without a usable 'text' field.
    # Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
    print(f"Could not load train.json ({type(exc).__name__}: {exc}); "
          f"falling back to konkani_corpus.txt")
    with open(f'{DATASET_PATH}/konkani_corpus.txt', 'r', encoding='utf-8') as f:
        konkani_texts = [line.strip() for line in f if line.strip()]

# De-duplicate while keeping first-occurrence order. A set would reorder the
# texts differently on every run (string hashing is randomised), which makes
# downstream batching and any train/validation split irreproducible.
konkani_texts = list(dict.fromkeys(konkani_texts))
print(f"Loaded {len(konkani_texts)} unique Konkani texts")

## Step 4: Generate Data (Both Models)

In [None]:
from transformers import pipeline
from tqdm import tqdm

print("="*70)
print("GENERATING TRAINING DATA")
print("="*70)

# Emotion labels
# Pseudo-label every text with a pretrained classifier and keep only confident
# predictions. Produces `emotion_data` (list of dicts) and `emotion_df`
# (DataFrame with columns text / emotion / confidence).
# NOTE(review): the model name indicates an English-language classifier, so its
# labels on Konkani input are presumably noisy - confirm label quality manually.
print("\n1. Generating emotion labels...")
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", device=0, top_k=None)

EMOTION_BATCH_SIZE = 32        # texts sent to the classifier per call
EMOTION_MIN_CONFIDENCE = 0.7   # discard predictions whose top score is below this

emotion_data = []
for i in tqdm(range(0, len(konkani_texts), EMOTION_BATCH_SIZE)):
    batch = konkani_texts[i:i + EMOTION_BATCH_SIZE]
    # batch_size: without it the pipeline still runs the chunk one text at a time.
    # truncation: texts longer than the model's maximum input length would
    # otherwise raise inside the model instead of being classified.
    results = emotion_classifier(batch, batch_size=EMOTION_BATCH_SIZE, truncation=True)
    for text, result in zip(batch, results):
        # top_k=None returns the score of every label; keep the best one.
        top = max(result, key=lambda x: x['score'])
        if top['score'] >= EMOTION_MIN_CONFIDENCE:
            emotion_data.append({'text': text, 'emotion': top['label'], 'confidence': top['score']})

# Explicit columns keep the schema stable even when nothing passed the threshold
# (a DataFrame built from an empty list would otherwise have no columns at all).
emotion_df = pd.DataFrame(emotion_data, columns=['text', 'emotion', 'confidence'])
print(f"✅ Generated {len(emotion_df)} emotion labels")

# Translation pairs
# Machine-translate every Konkani text to English and keep the plausible results.
# Produces `translation_pairs`, a list of {'konkani': ..., 'english': ...} dicts.
print("\n2. Generating translations...")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", device=0)

TRANSLATION_BATCH_SIZE = 16  # texts sent to the translator per call

translation_pairs = []
failed_batches = 0
for i in tqdm(range(0, len(konkani_texts), TRANSLATION_BATCH_SIZE)):
    batch = konkani_texts[i:i + TRANSLATION_BATCH_SIZE]
    try:
        # max_length caps the generated English output; truncation caps the
        # input, so one over-long text no longer discards its whole batch.
        results = translator(
            batch,
            max_length=128,
            truncation=True,
            batch_size=TRANSLATION_BATCH_SIZE,
        )
    except Exception as exc:
        # Best effort: a failing batch is skipped, but it is counted and reported
        # rather than silently dropped. KeyboardInterrupt is not an Exception
        # subclass, so the loop can still be interrupted by the user.
        failed_batches += 1
        print(f"⚠️ Skipping batch starting at index {i}: {type(exc).__name__}: {exc}")
        continue

    for konkani, result in zip(batch, results):
        english = result['translation_text']
        # Drop one-word outputs and texts the model just echoed back unchanged.
        if len(english.split()) >= 2 and english != konkani:
            translation_pairs.append({'konkani': konkani, 'english': english})

if failed_batches:
    print(f"⚠️ {failed_batches} batch(es) failed and were skipped")
print(f"✅ Generated {len(translation_pairs)} translation pairs")
print("\n" + "="*70)