# Generate Translation Pairs for Konkani-English

**Strategy:** Use a pre-trained translation model to create Konkani-English pairs  
**Then:** Train a custom Konkani-specific translation model on this data

## Step 1: Install Dependencies

In [None]:
# Install the third-party packages used by the cells below. The leading "!"
# is a notebook shell escape (not Python); "-q" asks pip for quiet output.
# NOTE(review): the confirmation is printed unconditionally; nothing here
# checks whether the pip command actually succeeded.
!pip install -q transformers torch pandas tqdm sentencepiece
print("✅ Dependencies installed!")

## Step 2: Load Pre-trained Translation Model

In [None]:
from transformers import pipeline
import torch

# Forward direction: a many-languages → English checkpoint produces the
# English half of every pair.
# NOTE(review): the notebook relies on this checkpoint covering Konkani —
# confirm against the model card before trusting the output.
use_gpu = torch.cuda.is_available()
device_label = 'GPU' if use_gpu else 'CPU'

translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-mul-en",
    device=0 if use_gpu else -1,  # 0 = first CUDA device, -1 = CPU
)

print("✅ Pre-trained translation model loaded!")
print(f"Device: {device_label}")

## Step 3: Load Konkani Text Data

In [None]:
import json
import pandas as pd

# Load your Konkani text from various sources
def load_text_field(path, *, json_lines):
    """Return the ``'text'`` value of every record stored at *path*.

    Args:
        path: File to read. It is always decoded as UTF-8 so that Konkani
            text loads the same way regardless of the platform's default
            encoding.
        json_lines: True when the file holds one JSON object per line
            (blank lines are skipped); False when the whole file is a
            single JSON array of objects.

    Returns:
        A list with one ``record['text']`` entry per record, in file order.

    Raises:
        FileNotFoundError: *path* does not exist.
        ValueError: The content is not valid UTF-8 / JSON.
        KeyError, TypeError: A record is not a mapping with a 'text' key.
    """
    with open(path, 'r', encoding='utf-8') as f:
        if json_lines:
            records = [json.loads(line) for line in f if line.strip()]
        else:
            records = json.load(f)
    return [record['text'] for record in records]


konkani_texts = []

# (path, is JSON-lines, label used in the success message, label used in warnings)
text_sources = [
    # Option 1: From ASR transcripts
    ('data/konkani-asr-v0/splits/manifests/train.json', True, 'ASR', 'ASR'),
    # Option 2: From NER data
    ('data/generated/konkani_large_dataset.json', False, 'NER data', 'NER'),
]

for source_path, is_json_lines, loaded_from, source_name in text_sources:
    try:
        source_texts = load_text_field(source_path, json_lines=is_json_lines)
    except FileNotFoundError:
        # A missing source is expected: every option is optional.
        print(f"⚠️ {source_name} data not found")
    except (OSError, ValueError, KeyError, TypeError) as e:
        # The file exists but is unusable; say so instead of claiming it is
        # missing, so the real problem is visible.
        print(f"⚠️ Could not read {source_name} data from {source_path}: {e!r}")
    else:
        konkani_texts.extend(source_texts)
        print(f"✅ Loaded {len(source_texts)} texts from {loaded_from}")

# Option 3: From custom corpus
# with open('your_konkani_corpus.txt', 'r', encoding='utf-8') as f:
#     konkani_texts.extend(f.readlines())

# Remove duplicates and clean. dict.fromkeys keeps first-seen order, so the
# samples printed below (and any later slicing of this list) are the same on
# every run; non-string entries are dropped rather than crashing on .strip().
konkani_texts = list(dict.fromkeys(
    t.strip() for t in konkani_texts if isinstance(t, str) and t.strip()
))

print(f"\nTotal unique Konkani texts: {len(konkani_texts)}")
print("\nSamples:")
for i, text in enumerate(konkani_texts[:5], 1):
    print(f"  {i}. {text[:100]}...")

## Step 4: Generate English Translations

In [None]:
from tqdm import tqdm

# Generate translations in batches
batch_size = 16
translation_pairs = []

print("Generating English translations...\n")

for i in tqdm(range(0, len(konkani_texts), batch_size)):
    batch = konkani_texts[i:i+batch_size]

    try:
        # truncation=True clips over-long inputs to the model's limit instead
        # of raising; batch_size asks the pipeline to run the chunk as a
        # padded batch rather than one text at a time.
        results = translator(
            batch,
            max_length=128,
            truncation=True,
            batch_size=batch_size,
        )
        translated = list(zip(batch, results))
    except Exception as e:
        print(f"\n⚠️ Error in batch {i}: {e}")
        # Retry the texts one by one so a single bad input does not discard
        # the rest of the batch.
        translated = []
        for konkani in batch:
            try:
                # A single string comes back as a one-element list.
                single = translator(konkani, max_length=128, truncation=True)[0]
            except Exception as item_error:
                print(f"   Skipped one text from batch {i}: {item_error}")
            else:
                translated.append((konkani, single))

    # Store pairs
    for konkani, result in translated:
        translation_pairs.append({
            'konkani': konkani,
            'english': result['translation_text']
        })

print(f"\n✅ Generated {len(translation_pairs)} translation pairs!")

## Step 5: Quality Check & Filter

In [None]:
# Filter out poor quality translations
def is_valid_translation(pair):
    """Return True when *pair* looks like a usable Konkani → English example.

    Args:
        pair: Mapping with string values under the keys 'konkani' and
            'english'.

    Returns:
        False when the English side has fewer than two words, is more than
        three times as long (in characters) as the Konkani side, equals the
        Konkani side once surrounding whitespace is ignored, or contains no
        ASCII letter; True otherwise.
    """
    konkani = pair['konkani']
    english = pair['english']

    # Basic quality checks
    if len(english.split()) < 2:  # Too short
        return False
    if len(english) > len(konkani) * 3:  # Too long
        return False
    # Ignore padding whitespace so a verbatim copy is still recognised.
    if english.strip() == konkani.strip():  # Not translated
        return False
    # str.isalpha() alone is True for Devanagari (and any other script), so
    # it cannot tell English from source text echoed back by the model;
    # require at least one ASCII letter.
    if not any(c.isascii() and c.isalpha() for c in english):  # No English letters
        return False

    return True

# Filter
filtered_pairs = [p for p in translation_pairs if is_valid_translation(p)]

print(f"Total pairs: {len(translation_pairs)}")
print(f"Valid pairs: {len(filtered_pairs)}")
print(f"Filtered out: {len(translation_pairs) - len(filtered_pairs)}")

# Show statistics
# Naming the columns keeps 'konkani' / 'english' present even when no pair
# survived filtering; without that, the lookups below raise KeyError on an
# empty frame.
df = pd.DataFrame(filtered_pairs, columns=['konkani', 'english'])
if df.empty:
    print("\n⚠️ No valid pairs - skipping length statistics")
else:
    print(f"\nAverage Konkani length: {df['konkani'].str.len().mean():.1f} chars")
    print(f"Average English length: {df['english'].str.len().mean():.1f} chars")

## Step 6: Add Reverse Translations (Optional)

In [None]:
# Optionally add reverse translations (English → Konkani)
# This helps create bidirectional translation capability

print("Generating reverse translations (English → Konkani)...\n")

# Load reverse translator
reverse_translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-mul",  # English to Multilingual
    device=0 if torch.cuda.is_available() else -1
)

# A one-to-many checkpoint chooses its output language from a ">>id<<" token
# at the start of each input sentence; without it the target language is
# left to chance. "gom" is the ISO 639-3 code for Goan Konkani.
# NOTE(review): confirm ">>gom<<" against the model card's target-language list.
target_lang_token = ">>gom<<"
supported_codes = getattr(
    reverse_translator.tokenizer, "supported_language_codes", None
)
if supported_codes and target_lang_token not in supported_codes:
    print(f"⚠️ {target_lang_token} is not a target language of this model; "
          "reverse translations may not be Konkani")

# Take a subset of English texts to translate back
english_texts = [p['english'] for p in filtered_pairs[:1000]]  # Limit for speed

reverse_pairs = []
for i in tqdm(range(0, len(english_texts), batch_size)):
    batch = english_texts[i:i+batch_size]
    # The language token is only part of the model input; the stored English
    # side stays untagged.
    tagged_batch = [f"{target_lang_token} {text}" for text in batch]

    try:
        # truncation=True clips over-long inputs instead of raising.
        results = reverse_translator(tagged_batch, max_length=128, truncation=True)
    except Exception as e:
        print(f"\n⚠️ Error: {e}")
        continue

    for english, result in zip(batch, results):
        konkani = result['translation_text'].strip()
        if not konkani:
            # An empty target side is useless as a training example.
            continue
        reverse_pairs.append({
            'konkani': konkani,
            'english': english
        })

print(f"\n✅ Generated {len(reverse_pairs)} reverse pairs!")

# Combine
all_pairs = filtered_pairs + reverse_pairs
print(f"Total translation pairs: {len(all_pairs)}")

## Step 7: Save Training Data

In [None]:
import os

# The reverse-translation step is optional. When it was skipped, all_pairs
# was never defined, so fall back to the filtered forward pairs instead of
# failing with a NameError.
if 'all_pairs' not in globals():
    all_pairs = list(filtered_pairs)

os.makedirs('data/generated', exist_ok=True)

# Save as JSON (ensure_ascii=False keeps the Konkani text readable in the file)
with open('data/generated/konkani_english_translation_pairs.json', 'w', encoding='utf-8') as f:
    json.dump(all_pairs, f, ensure_ascii=False, indent=2)

# Save as CSV for easy viewing
df_all = pd.DataFrame(all_pairs)
df_all.to_csv('data/generated/konkani_english_translation_pairs.csv', index=False)

print("✅ Saved translation data:")
print("   - data/generated/konkani_english_translation_pairs.json")
print("   - data/generated/konkani_english_translation_pairs.csv")

print(f"\nReady for training with {len(all_pairs)} translation pairs!")

## Step 8: Verify Data Quality

In [None]:
# Print a random handful of pairs (at most ten) for a manual sanity check
import random

rule = "=" * 70

print(rule)
print("SAMPLE TRANSLATION PAIRS")
print(rule)

sample_size = min(10, len(all_pairs))
for number, example in enumerate(random.sample(all_pairs, sample_size), start=1):
    print(f"\n{number}.")
    print(f"   Konkani: {example['konkani']}")
    print(f"   English: {example['english']}")

print("\n" + rule)
print("✅ DATA GENERATION COMPLETE!")
print(rule)
print("\nNext step: Upload this data to Kaggle and train custom translation model!")