In [None]:
# ===================================================================
# KAGGLE-COMPATIBLE PACKAGE INSTALLATION
# ===================================================================
# All packages are combined into a single command for efficiency.
# The "-q" flag ensures a "quiet" installation with minimal output.

!pip install -q transformers datasets evaluate pandas openpyxl torch accelerate \
               sentencepiece numpy sacrebleu rouge-score scipy scikit-learn

print("✓ All packages installed successfully!")
print("✓ Enhanced training packages ready!")


In [4]:
# ===================================================================
# CELL 2: Access Data Files from Kaggle Dataset
# ===================================================================
import os

# Name of the Kaggle dataset folder; both input paths below are derived from it,
# so pointing the notebook at another dataset only requires changing this value.
dataset_name = 'lmtdata'

# Define file paths in the Kaggle input directory
train_file_path = f'/kaggle/input/{dataset_name}/english-hindi-train.xlsx'
val_file_path = f'/kaggle/input/{dataset_name}/english-hindi-valid.xlsx'

# Verify files exist (only reports the result; later cells do the actual reading)
if os.path.exists(train_file_path):
    print("✓ Training file found in Kaggle input!")
else:
    print(f"✗ Training file not found! Please check the path: {train_file_path}")

if os.path.exists(val_file_path):
    print("✓ Validation file found in Kaggle input!")
else:
    print(f"✗ Validation file not found! Please check the path: {val_file_path}")
    

✓ Training file found in Kaggle input!
✓ Validation file found in Kaggle input!


In [None]:
# ===================================================================
# CELL 3: Import All Libraries and Setup Functions
# ===================================================================
# This cell is perfectly compatible with Kaggle. No changes are needed.

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset as HFDataset
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully!")

In [6]:
# ===================================================================
# CELL 4: Define Data Loading Functions
# ===================================================================
# This code is perfectly compatible with Kaggle. No changes are needed.
# These functions use standard pandas and will work with the Kaggle file paths.

def load_excel_data(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook into a DataFrame.

    Args:
        file_path: Path of the workbook to read.
        sheet_name: Sheet selector forwarded to ``pd.read_excel`` (default 0).

    Returns:
        The loaded DataFrame, or ``None`` when reading raised an exception
        (the error is printed instead of being propagated).
    """
    frame = None
    try:
        frame = pd.read_excel(file_path, sheet_name=sheet_name)
    except Exception as err:
        # Best-effort load: callers check for None rather than catching errors.
        print(f"Error loading {file_path}: {err}")
    return frame

def _first_present_column(df, candidates):
    """Return the first name from ``candidates`` that is a column of ``df``, or None."""
    return next((name for name in candidates if name in df.columns), None)


def _print_word_length_stats(column, label):
    """Print mean/median/min/max of the whitespace-separated word counts in ``column``."""
    # .str.split() with no separator splits on runs of whitespace; .str.len() then
    # counts the resulting words per row.
    word_counts = column.str.split().str.len()
    print(f"\n{label} text length stats (words):")
    print(f"  Mean: {word_counts.mean():.1f}")
    print(f"  Median: {word_counts.median():.1f}")
    print(f"  Min: {word_counts.min()}")
    print(f"  Max: {word_counts.max()}")


def explore_translation_data(df, name):
    """Print an overview of a translation DataFrame.

    Reports shape, column names, per-column missing-value counts and the first
    rows, followed by word-length statistics for the source and translation
    columns when one of the known column aliases is present.

    Args:
        df: DataFrame to describe.
        name: Label used in the printed heading (e.g. "Training").

    Returns:
        None. All results are printed.
    """
    print(f"\n=== {name} Dataset Exploration ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Check for missing values
    print(f"Missing values:\n{df.isnull().sum()}")

    # Show sample data
    print(f"\nSample data:")
    print(df.head())

    # Same aliases, in the same priority order, as prepare_translation_data uses.
    # Every alias is honoured here; previously lowercase 'english'/'hindi' were
    # listed but never reached because the guard condition omitted them.
    source_col = _first_present_column(
        df, ['Source', 'source', 'English', 'english', 'EN', 'en'])
    if source_col is not None:
        _print_word_length_stats(df[source_col], "Source")

    trans_col = _first_present_column(
        df, ['Translation', 'translation', 'Hindi', 'hindi', 'HI', 'hi'])
    if trans_col is not None:
        _print_word_length_stats(df[trans_col], "Translation")

print("✓ Data loading functions defined!")

✓ Data loading functions defined!


In [None]:
# ===================================================================
# CELL 5: Load Data from Kaggle Dataset
# ===================================================================
# The function calls are correct; only the print statements are updated for clarity.

# Read the training split; load_excel_data returns None if the file cannot be read.
print("Loading training data from Kaggle dataset...")
train_df = load_excel_data(train_file_path)

# Read the validation split the same way.
print("\nLoading validation data from Kaggle dataset...")
val_df = load_excel_data(val_file_path)

# Summarize each split that was actually loaded (training first, then validation).
for split_df, split_label in ((train_df, "Training"), (val_df, "Validation")):
    if split_df is not None:
        explore_translation_data(split_df, split_label)


Loading training data from Kaggle dataset...


In [None]:
# ===================================================================
# CELL 6: Define Data Preprocessing Functions
# ===================================================================
# This code is perfectly compatible with Kaggle. No changes are needed.

def prepare_translation_data(train_df, val_df):
    """Pull the parallel training text and the validation inputs out of the raw frames.

    Source/target columns are located by trying a fixed list of aliases in order.
    Missing cells become empty strings and every value is converted to ``str``.

    Args:
        train_df: DataFrame holding a source column and a target column.
        val_df: DataFrame holding a source column and, optionally, an ``ID`` column.

    Returns:
        ``(train_source, train_target, val_source, val_ids)`` as lists. When
        ``val_df`` has no ``ID`` column, ``val_ids`` is ``0..len(val_df)-1``.
        If a required column cannot be found, ``(None, None, None, None)``.
    """
    source_aliases = ['Source', 'source', 'English', 'english', 'EN', 'en']
    target_aliases = ['Translation', 'translation', 'Hindi', 'hindi', 'HI', 'hi']
    failure = (None, None, None, None)

    def pick(aliases, frame):
        """First alias that names a column of ``frame``, else None."""
        return next((alias for alias in aliases if alias in frame.columns), None)

    def as_text_list(frame, column):
        """Values of ``column`` as strings, with missing cells replaced by ''."""
        return frame[column].fillna('').astype(str).tolist()

    source_col = pick(source_aliases, train_df)
    target_col = pick(target_aliases, train_df)

    if None in (source_col, target_col):
        print("Error: Could not find source/target columns")
        print(f"Available columns: {list(train_df.columns)}")
        return failure

    print(f"Using source column: {source_col}")
    print(f"Using target column: {target_col}")

    # Training pairs
    train_source = as_text_list(train_df, source_col)
    train_target = as_text_list(train_df, target_col)

    # Validation only needs source text; prefer the column name used for training
    # and fall back to the alias search when the validation sheet names it differently.
    if source_col in val_df.columns:
        val_source_col = source_col
    else:
        val_source_col = pick(source_aliases, val_df)

    if val_source_col is None:
        print("Error: Could not find source column in validation data")
        print(f"Available validation columns: {list(val_df.columns)}")
        return failure

    val_source = as_text_list(val_df, val_source_col)
    if 'ID' in val_df.columns:
        val_ids = val_df['ID'].tolist()
    else:
        val_ids = list(range(len(val_df)))

    print(f"Training pairs: {len(train_source)}")
    print(f"Validation samples: {len(val_source)}")

    return train_source, train_target, val_source, val_ids

print("✓ Data preprocessing functions defined!")

In [None]:
# ===================================================================
# CELL 7: Define Model Setup and Dataset Creation Functions
# ===================================================================
# setup_translation_model loads a pretrained EN->HI seq2seq model (with fallbacks);
# create_translation_dataset cleans and tokenizes the parallel training text.

def setup_translation_model(model_name="Helsinki-NLP/opus-mt-en-hi"):
    """Load a pretrained seq2seq translation model together with its tokenizer.

    ``model_name`` is tried first. If it cannot be loaded, a fixed list of
    fallback checkpoints is tried in order until one succeeds.

    Args:
        model_name: Preferred checkpoint identifier passed to ``from_pretrained``.

    Returns:
        ``(model, tokenizer, model_name)`` where ``model_name`` is the checkpoint
        that was actually loaded (it may differ from the requested one).

    Raises:
        RuntimeError: If none of the candidate checkpoints could be loaded.
    """
    print(f"Loading model: {model_name}")

    # Fallbacks, in order of preference for EN->HI translation
    fallback_models = [
        "Helsinki-NLP/opus-mt-en-hi",
        "ai4bharat/IndicTrans2-En-Indic-1B",
        "google/mt5-base",
        "facebook/mbart-large-50-many-to-many-mmt",
    ]
    # Requested checkpoint first, then the fallbacks; dict.fromkeys drops duplicates
    # while keeping order, so the default argument yields the original try-order.
    candidates = list(dict.fromkeys(
        name for name in (model_name, *fallback_models) if name
    ))

    model = tokenizer = None
    failures = []
    for candidate in candidates:
        try:
            print(f"Attempting to load: {candidate}")
            tokenizer = AutoTokenizer.from_pretrained(candidate)
            model = AutoModelForSeq2SeqLM.from_pretrained(candidate)
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works,
            # and the reason is reported instead of being silently discarded.
            failures.append(f"{candidate}: {type(err).__name__}: {err}")
            print(f"✗ Failed to load {candidate} ({type(err).__name__}: {err}), trying next...")
            continue
        model_name = candidate
        print(f"✓ Successfully loaded {candidate}")
        break
    else:
        raise RuntimeError(
            "Could not load any translation model. Attempts:\n  " + "\n  ".join(failures)
        )

    # Some tokenizers define no pad token; reuse EOS so padded batches can be built.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, model_name

def create_translation_dataset(source_texts, target_texts, tokenizer, max_length=512):
    """Clean parallel text and tokenize it into a dataset with ``labels``.

    Args:
        source_texts: Source-language strings.
        target_texts: Target-language strings, aligned with ``source_texts``.
        tokenizer: Tokenizer used for both sides; must accept ``text_target=``.
        max_length: Truncation length (in tokens) applied to inputs and labels.

    Returns:
        The result of ``HFDataset.map``: the original ``source``/``target``
        columns plus the tokenizer outputs and a ``labels`` column.
    """
    is_mbart = "mbart" in tokenizer.name_or_path.lower()

    def clean_text(text, is_source=False):
        """Unescape basic HTML entities and collapse whitespace; non-strings become ''."""
        if not isinstance(text, str):
            return ""
        # '&amp;' is unescaped last so that "&amp;lt;" yields the literal text "&lt;"
        # instead of being unescaped twice into "<".
        text = text.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
        # Normalize whitespace
        text = ' '.join(text.split())
        # The English tag belongs on source sentences only; it must not be prepended
        # to the target-language side.
        # NOTE(review): mBART tokenizers usually take the language via src_lang/tgt_lang;
        # the literal "[en_XX]" prefix is kept from the original code - confirm it is wanted.
        if is_source and is_mbart:
            return f"[en_XX] {text}"
        return text

    # Clean texts
    source_texts = [clean_text(text, is_source=True) for text in source_texts]
    target_texts = [clean_text(text) for text in target_texts]

    def tokenize_function(examples):
        """Tokenize one batch of examples; padding is left to the data collator."""
        model_inputs = tokenizer(
            examples['source'],
            max_length=max_length,
            truncation=True,
            padding=False,
            add_special_tokens=True
        )

        # `text_target=` tokenizes in target mode; it replaces the deprecated
        # `with tokenizer.as_target_tokenizer():` context manager.
        labels = tokenizer(
            text_target=examples['target'],
            max_length=max_length,
            truncation=True,
            padding=False,
            add_special_tokens=True
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Create dataset
    dataset = HFDataset.from_dict({
        'source': source_texts,
        'target': target_texts
    })

    # Tokenize in batches of 1000 examples using two worker processes
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        batch_size=1000,
        num_proc=2
    )

    return tokenized_dataset

In [None]:
# ===================================================================
# CELL 9: Define Inference Functions
# ===================================================================
def translate_texts(model, tokenizer, source_texts, ids, max_length=256, batch_size=8):
    """Generate translations for the given source texts with beam search.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        source_texts: List of source strings to translate.
        ids: Identifiers aligned with ``source_texts``. Not used here; kept so
            existing callers keep working.
        max_length: Token limit for both the tokenized input and the generated output.
        batch_size: Number of texts translated per ``generate`` call.

    Returns:
        List of decoded translations, in the same order as ``source_texts``
        (empty when ``source_texts`` is empty).
    """
    model.eval()
    all_translations = []

    # Send inputs to the device the model is on. Moving them to CUDA merely because
    # CUDA exists breaks when the model itself was left on the CPU.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total = len(source_texts)

    # Process in batches
    for start in range(0, total, batch_size):
        batch_texts = source_texts[start:start + batch_size]

        # Tokenize batch (padded to the longest text in the batch)
        inputs = tokenizer(
            batch_texts,
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Generate translations (deterministic beam search, no gradients needed)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3,
                do_sample=False,
                temperature=1.0
            )

        # Decode translations
        batch_translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_translations.extend(batch_translations)

        print(f"Processed {start + len(batch_texts)}/{total} samples")

    return all_translations

def create_translation_submission(ids, translations, filename="answer.csv"):
    """Write the predictions to a two-column CSV file and print a short summary.

    Args:
        ids: Values for the ``ID`` column.
        translations: Values for the ``Translation`` column, aligned with ``ids``.
        filename: Destination path of the CSV (UTF-8, no index column).

    Returns:
        None.
    """
    columns = {'ID': ids, 'Translation': translations}
    submission_df = pd.DataFrame(columns)
    submission_df.to_csv(filename, index=False, encoding='utf-8')

    # Summary: where the file went, how many rows, and a preview of the first rows
    summary_lines = (
        f"Submission file created: {filename}",
        f"Total entries: {len(ids)}",
        f"Sample entries:",
    )
    print("\n".join(summary_lines))
    print(submission_df.head())

print(" Inference functions defined!")



In [None]:
# ===================================================================
# CELL 10: Check GPU and Prepare Data
# ===================================================================
# The Colab-specific instruction for enabling a GPU has been updated for Kaggle.

# Report whether this session has a CUDA device, and which one
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if not cuda_available:
    # Kaggle-specific hint: the accelerator is chosen in the notebook settings
    print("✗ No GPU detected! Please enable the T4 GPU in the notebook settings (Accelerator option).")
else:
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1024**3 to print it in GB
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Turn the loaded DataFrames into plain lists of text (plus validation IDs)
print("\nPreparing translation data...")
train_source, train_target, val_source, val_ids = prepare_translation_data(train_df, val_df)

# prepare_translation_data returns four Nones when it cannot find the columns
if train_source is not None:
    print(f"✓ Data prepared successfully!")
    print(f"  Training pairs: {len(train_source)}")
    print(f"  Validation samples: {len(val_source)}")
else:
    print("✗ Error in data preparation. Please check your column names.")

In [None]:
# ===================================================================
# CELL 11: Setup Model and Create Dataset
# ===================================================================
# This is the missing cell that creates the 'train_dataset' variable.

# Load the pretrained checkpoint and its tokenizer; model_name records which one loaded
print("Setting up translation model...")
model, tokenizer, model_name = setup_translation_model()

# Use the GPU whenever this session has one
if torch.cuda.is_available():
    model = model.cuda()
    print("✓ Model moved to GPU")

# Tokenize the training pairs; the result is consumed by the training cell
print("\nCreating training dataset...")
train_dataset = create_translation_dataset(
    source_texts=train_source,
    target_texts=train_target,
    tokenizer=tokenizer,
)

print("\n✓ Model and dataset ready for training!")

In [None]:
# ===================================================================
# CELL 12: Start Training (This will take 2-3 hours) - KAGGLE VERSION
# ===================================================================
# The Google Drive mount has been removed.
# All outputs are now saved to the standard Kaggle output directory.


# Directory in the Kaggle working area that the training run writes to
output_dir = "/kaggle/working/lmt-final-model"

# Let the user know training is starting and can be left to run in the background
print(
    "Starting training... \n"
    "You can 'Save Version' to run this in the background. You don't need to keep the tab open!"
)

# The core training function logic is already compatible.
def train_translation_model_fixed(train_dataset, model, tokenizer, model_name, output_dir):
    """Fine-tune the translation model and save it to ``output_dir``.

    Args:
        train_dataset: Tokenized training dataset (inputs plus ``labels``).
        model: Seq2seq model to fine-tune.
        tokenizer: Tokenizer matching ``model``; saved next to the final model.
        model_name: Name of the loaded checkpoint. Not used here; kept so
            existing callers keep working.
        output_dir: Directory for checkpoints, logs (``<output_dir>/logs``)
            and the final model.

    Returns:
        The ``Trainer`` instance after training has finished.
    """
    # Pads inputs and labels of each batch dynamically
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=24,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,  # effective batch size per device: 8 * 4 = 32
        per_device_eval_batch_size=16,
        weight_decay=0.02,
        # Warmup is expressed only as a ratio. The original also passed
        # warmup_steps=6250, which overrides warmup_ratio and made the
        # "10% warmup" setting ineffective regardless of dataset size.
        warmup_ratio=0.1,
        lr_scheduler_type="cosine_with_restarts",
        logging_dir=f"{output_dir}/logs",
        logging_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=8,  # keep only the 8 most recent checkpoints
        # Mixed precision needs a GPU; enabling it unconditionally fails on CPU-only sessions.
        fp16=torch.cuda.is_available(),
        learning_rate=3e-5,
        remove_unused_columns=True,
        dataloader_pin_memory=False,
        load_best_model_at_end=False,
        report_to="none",
    )

    # NOTE(review): newer transformers releases rename Trainer's `tokenizer` argument
    # to `processing_class` - confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train
    print("\nStarting training process...")
    trainer.train()

    # Save the final model together with its tokenizer so it can be reloaded from output_dir
    print("\nSaving final model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"\n✓ Training completed! Model saved to {output_dir}")
    return trainer

# Run training with the model, tokenizer and tokenized dataset created by the earlier
# cells. Checkpoints and the final model are written under `output_dir`; the returned
# Trainer is kept in `trainer`.
trainer = train_translation_model_fixed(train_dataset, model, tokenizer, model_name, output_dir=output_dir)

In [None]:
# ===================================================================
# CELL 13: Run Model on Full Validation Dataset - KAGGLE VERSION
# ===================================================================

print("Running trained model on full validation dataset...")
print(f"Validation samples: {len(val_source)}")

# Reload the fine-tuned model and tokenizer from disk.
# This path MUST be the same directory the training cell saved to ('output_dir').
model_path = "/kaggle/working/lmt-final-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

if torch.cuda.is_available():
    model = model.cuda()
    print("✓ Model loaded on GPU")

# Translate every validation sentence
print("\nGenerating translations... This may take 10-15 minutes")
val_translations = translate_texts(model, tokenizer, val_source, val_ids, batch_size=16)

print("\n✓ Validation translations completed!")
print(f"  Generated {len(val_translations)} translations")

# Preview at most the first 10 results; sources longer than 100 characters are cut with '...'
print("\nSample validation results:")
for sample_no, translation in enumerate(val_translations[:10], start=1):
    position = sample_no - 1
    print(f"\n--- Sample {sample_no} ---")
    print(f"  ID: {val_ids[position]}")
    source_text = val_source[position]
    suffix = '...' if len(source_text) > 100 else ''
    print(f"  Source: {source_text[:100]}{suffix}")
    print(f"  Translation: {translation}")

In [None]:
# ===================================================================
# CELL 14: Create Validation Submission File
# ===================================================================
# This cell is already compatible with Kaggle's file system.

print("Creating validation submission file...")

# Both files are written with relative names, i.e. into the current working directory
# (/kaggle/working/ when run on Kaggle with the default working directory).
create_translation_submission(val_ids, val_translations, "validation_predictions.csv")

answer_path = "answer.csv"
create_translation_submission(val_ids, val_translations, answer_path)

# --- Verification Step ---
# Read back exactly the file that was just written. The original read a hard-coded
# "/kaggle/working/answer.csv", which only matches when the working directory is
# /kaggle/working.
import pandas as pd
submission_df = pd.read_csv(answer_path)

print(f"\nValidation Submission Summary:")
print(f"Total predictions: {len(submission_df)}")
print(f"Unique IDs: {submission_df['ID'].nunique()}")
print(f"Average translation length: {submission_df['Translation'].str.len().mean():.1f} characters")

print("\nFirst 5 predictions:")
print(submission_df.head())