# Konkani-English Translation - Kaggle Training

**Model:** MarianMT fine-tuned for Konkani↔English  
**Time:** ~2-3 hours on P100  
**Parallel:** Runs while ASR trains!

## Step 1: Check GPU

In [None]:
!nvidia-smi

import torch
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## Step 2: Upload Dataset

**Before running:**
1. Upload your translation dataset (Konkani-English pairs)
2. Add it to this notebook
3. Update DATASET_PATH below

In [None]:
import os

DATASET_PATH = "/kaggle/input/konkani-english-translation"  # UPDATE THIS

print(f"Dataset path: {DATASET_PATH}")
if os.path.exists(DATASET_PATH):
    print("‚úÖ Dataset found!")
    !ls -la {DATASET_PATH}
else:
    print("‚ùå Dataset not found. Please add it to this notebook.")

## Step 3: Install Dependencies

In [None]:
!pip install -q transformers datasets sacrebleu accelerate sentencepiece
print("‚úÖ Dependencies installed!")

## Step 4: Load Translation Data

In [None]:
import json
from pathlib import Path

# Load your Konkani-English pairs
# Expected format: [{"konkani": "...", "english": "..."}, ...]

# Sorted so the selected file is deterministic when several JSON files exist.
json_files = sorted(Path(DATASET_PATH).glob('*.json'))
if not json_files:
    # Stop here with a clear error: every later cell depends on `data`.
    raise FileNotFoundError(f"❌ No JSON file found in {DATASET_PATH}")

with open(json_files[0], 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f"✅ Loaded: {json_files[0].name}")

print(f"\nTranslation pairs: {len(data)}")
print("\nSample:")
# Show at most the first three pairs as a quick sanity check.
for pair in data[:3]:
    print(f"\nKonkani: {pair['konkani']}")
    print(f"English: {pair['english']}")

## Step 5: Prepare Data

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

def _pairs_to_dataset(pairs):
    """Build a `Dataset` with 'konkani' and 'english' columns from pair dicts."""
    return Dataset.from_dict({
        'konkani': [pair['konkani'] for pair in pairs],
        'english': [pair['english'] for pair in pairs],
    })


# Split: hold out 10% of the pairs for validation; fixed seed for repeatability.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

print(f"Training pairs: {len(train_data)}")
print(f"Validation pairs: {len(val_data)}")

# Model - using multilingual model
model_name = "Helsinki-NLP/opus-mt-mul-en"  # Multilingual to English
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create datasets
train_dataset = _pairs_to_dataset(train_data)
val_dataset = _pairs_to_dataset(val_data)

print("\n✅ Datasets created!")

## Step 6: Tokenize

In [None]:
# Maximum number of tokens kept per source/target sentence (longer ones are truncated).
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of Konkani sources and English targets.

    Args:
        examples: Batch mapping with 'konkani' and 'english' lists of strings,
            as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under 'labels'.
    """
    # No padding here on purpose: padding labels to `max_length` with the pad
    # token id would make the loss count padding positions. The
    # DataCollatorForSeq2Seq used by the trainer pads each batch dynamically
    # and fills label padding with -100, which the loss ignores.
    # `text_target` replaces the deprecated `as_target_tokenizer()` context.
    return tokenizer(
        examples['konkani'],
        text_target=examples['english'],
        max_length=max_length,
        truncation=True,
    )

# Apply the tokenization in batches to both splits.
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

print("✅ Data tokenized!")

## Step 7: Train Translation Model

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np

# Metric: sacreBLEU, loaded through the `evaluate` library.
bleu = evaluate.load('sacrebleu')

# Pretrained seq2seq checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Collator that assembles batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

def compute_metrics(eval_preds):
    """Compute the sacreBLEU score of generated translations.

    Args:
        eval_preds: Pair `(predictions, label_ids)` of token-id arrays supplied
            by the trainer during evaluation.

    Returns:
        Dict with a single key 'bleu' holding the sacreBLEU score.
    """
    preds, labels = eval_preds

    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be mapped back to the pad token before decoding. Generated
    # predictions can carry -100 too when batches are padded together.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacreBLEU expects a list of references per prediction.
    predictions = [text.strip() for text in decoded_preds]
    references = [[text.strip()] for text in decoded_labels]

    result = bleu.compute(predictions=predictions, references=references)
    return {'bleu': result['score']}

# Training config
# Single source for the epoch count: used by the arguments and the banner below.
num_epochs = 5

training_args = Seq2SeqTrainingArguments(
    output_dir='/kaggle/working/translation_model',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    logging_steps=100,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # needs matching eval/save strategies (both 'epoch')
    metric_for_best_model='bleu',  # key returned by compute_metrics
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    fp16=True,
    generation_max_length=max_length,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("="*70)
print("🚀 STARTING TRANSLATION TRAINING")
print("="*70)
print(f"Model: {model_name}")
print("Direction: Konkani → English")
print(f"Training pairs: {len(train_dataset)}")
print(f"Epochs: {num_epochs}")
print("="*70)
print()

trainer.train()

print("\n✅ Training complete!")

## Step 8: Evaluate

In [None]:
# Run evaluation on the trainer's eval dataset and report the BLEU entry.
results = trainer.evaluate()

print(
    "=" * 70,
    "FINAL RESULTS",
    "=" * 70,
    f"BLEU Score: {results['eval_bleu']:.2f}",
    "=" * 70,
    sep="\n",
)

## Step 9: Save Model

In [None]:
# Save
model.save_pretrained('/kaggle/working/konkani_translation_model')
tokenizer.save_pretrained('/kaggle/working/konkani_translation_model')

print("‚úÖ Model saved to: /kaggle/working/konkani_translation_model")
print("\nDownload from Output tab!")

!ls -lh /kaggle/working/konkani_translation_model/

## Step 10: Test Translation

In [None]:
from transformers import pipeline

# Load pipeline
# Reloads the saved checkpoint from disk and runs it on the first CUDA device.
translator = pipeline(
    'translation',
    model='/kaggle/working/konkani_translation_model',
    tokenizer=tokenizer,
    device=0
)

# Test
# Sample Konkani sentences in Devanagari script.
test_texts = [
    "हांव घरा वता",
    "तुवें कसो आसा?",
    "हांव कोंकणी शिकता",
    "आमी भात खाता",
]

print("="*70)
print("TESTING TRANSLATION (Konkani → English)")
print("="*70)

for text in test_texts:
    # The pipeline returns a list with one result dict per input.
    result = translator(text, max_length=128)[0]
    print(f"\nKonkani: {text}")
    print(f"English: {result['translation_text']}")

print("\n" + "="*70)
print("✅ TRANSLATION MODEL READY!")
print("="*70)