In [4]:
# Cell 1: Install Libraries
!pip install -q transformers[torch] datasets evaluate sacrebleu sentencepiece accelerate
!pip install protobuf==3.20.3
print("‚úì Libraries installed.")

Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m162.1/162.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=

In [5]:
# Cell 2: Imports
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer, 
    DataCollatorForSeq2Seq
)
from datasets import load_dataset, DatasetDict
import evaluate
import numpy as np

# Setup Device (GPU or CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úì Using device: {device}")

‚úì Using device: cuda


In [91]:
# Cell 3: Clean and Load Saamayik Dataset (Final Fix)
import pandas as pd
import re
from datasets import Dataset, DatasetDict
import os

# Define your file names (Make sure these match what you uploaded)
data_dir = "/kaggle/input/coustom"
en_file = os.path.join(data_dir, "dev.en")
sa_file = os.path.join(data_dir, "dev.sa")

def clean_line(line):
    # 1. Remove the tags
    #    We use a raw string for regex, but careful with backslashes
    #    This pattern finds "', '', line)
    
    # 2. Remove the triple quotes (""") that make code look commented
    line = line.replace('"""', '')
    
    # 3. Remove leading/trailing whitespace
    return line.strip()

print("Processing Saamayik dataset...")

try:
    # Check if files exist
    if not os.path.exists(en_file) or not os.path.exists(sa_file):
        raise FileNotFoundError(f"Could not find {en_file} or {sa_file}. Please upload them!")

    # Read files
    with open(en_file, "r", encoding="utf-8") as f:
        en_lines = f.readlines()
        
    with open(sa_file, "r", encoding="utf-8") as f:
        sa_lines = f.readlines()

    # Create data pairs
    data = []
    # Zip combines the two lists line-by-line
    for en, sa in zip(en_lines, sa_lines):
        clean_en = clean_line(en)
        clean_sa = clean_line(sa)
        
        # Only add if we have valid text in both languages
        if len(clean_en) > 0 and len(clean_sa) > 0:
            data.append({"src": clean_sa, "tgt": clean_en})

    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    # Convert to Hugging Face Dataset
    full_dataset = Dataset.from_pandas(df)
    
    # Split into Train (90%) and Validation (10%)
    split = full_dataset.train_test_split(test_size=0.1, seed=42)
    dataset = DatasetDict({
        'train': split['train'],
        'validation': split['test']
    })

    print(f"\n‚úì Saamayik Dataset Ready!")
    print(f"Total pairs: {len(dataset['train'])}")
    print(f"Sample Input: {dataset['train'][0]['src']}")
    print(f"Sample Target: {dataset['train'][0]['tgt']}")

except Exception as e:
    print(f"‚ùå Error: {e}")

Processing Saamayik dataset...

‚úì Saamayik Dataset Ready!
Total pairs: 2173
Sample Input: (‡§¨‡§ø‡§®‡•ç‡§¶‡•Å 1) ‡§≠‡§µ‡§®‡•ç‡§§: ‡§è‡§§‡§Ç ‡§™‡§Ç‡§ï‡•ç‡§§‡•á: ‡§Ö‡§®‡•ç‡§§‡§ø‡§Æ‡§∂‡•Ä‡§∞‡•ç‡§∑‡§Ç ‡§™‡§∞‡•ç‡§Ø‡§®‡•ç‡§§‡§Ç ‡§Ü‡§µ‡§∞‡•ç‡§§‡§Ø‡•á‡§§ ‡•§
Sample Target: (point 1), so you are ready to repeat to the end of the row


In [92]:
# Cell 4: Load Model
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.to(device)

print(f"‚úì Model {model_checkpoint} loaded.")



‚úì Model google/mt5-small loaded.


In [93]:
# Cell 5: Preprocessing with Padding Fix
max_length = 64

def preprocess_function(examples):
    inputs = examples["src"]
    targets = examples["tgt"]
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs, 
        max_length=max_length, 
        truncation=True, 
        padding="max_length"
    )
    
    # Tokenize targets
    labels = tokenizer(
        targets, 
        max_length=max_length, 
        truncation=True, 
        padding="max_length"
    )
    
    # CRITICAL: Replace padding token (0) with -100 to ignore loss on padding
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply to dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)
print("‚úì Data tokenized and padding masked.")

Map:   0%|          | 0/2173 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

‚úì Data tokenized and padding masked.


In [94]:
# Cell 13: Compute Metrics (Safe Fix)
import numpy as np

# Load the evaluation metric (BLEU)
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # 1. Handle tuple output (if any)
    if isinstance(preds, tuple):
        preds = preds[0]
    
    # 2. THE FIX: Convert -100 to 0 ONLY for decoding
    #    This allows the tokenizer to read the text without crashing.
    #    It does NOT affect the model's actual training gradients.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    
    # 3. Decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # 4. Clean up labels (Same fix: -100 -> 0 for readability)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # 5. Clean up text for BLEU
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    
    # 6. Compute
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    
    return {"bleu": result["score"]}

print("‚úì Metrics function updated. Safe to continue training.")

‚úì Metrics function updated. Safe to continue training.


In [104]:
# Cell 7: Optimized Hyperparameters (Balanced for ~2400 Sentences)
from transformers import Seq2SeqTrainingArguments
import transformers
import torch

# Clean up memory first
torch.cuda.empty_cache()

# 1. Define base arguments
args_dict = {
    "output_dir": "sanskrit-en-model-v2",
    
    # LEARNING RATE: 5e-4 is the "Sweet Spot" for fine-tuning.
    # It learns quickly but doesn't destroy pre-trained knowledge.
    "learning_rate": 5e-4,              
    
    # BATCH SIZE: Increased to 8 for speed (Kaggle P100/T4 GPUs can handle this).
    "per_device_train_batch_size": 8,   
    "per_device_eval_batch_size": 8,
    
    "weight_decay": 0.01,
    "save_total_limit": 2,              # Keep only the last 2 checkpoints
    
    # EPOCHS: 20 is ideal for this dataset size.
    # Math: 2400 lines / 8 batch = 300 steps per epoch. 
    # 20 epochs = 6000 total steps. Perfect for convergence.
    "num_train_epochs": 20,            
    
    "predict_with_generate": True,
    
    # FP16: ENABLED. This speeds up training significantly on Kaggle.
    "fp16": True,                      
    
    "logging_steps": 50,                # Log less frequently to keep output clean
    "report_to": "none",
    "save_strategy": "epoch"            # Save a checkpoint at the end of every epoch
}

# 2. Automatically select the correct strategy parameter (Version check)
if transformers.__version__ >= "4.41.0":
    args_dict["eval_strategy"] = "epoch"
else:
    args_dict["evaluation_strategy"] = "epoch"

# 3. Initialize arguments
args = Seq2SeqTrainingArguments(**args_dict)

from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

print("‚úì Optimized Hyperparameters Loaded (20 Epochs, FP16 Enabled).")

‚úì Optimized Hyperparameters Loaded (20 Epochs, FP16 Enabled).


In [105]:
# Cell: Verify Alignment
print("Checking data alignment at different points...")

# Check index 0 (Start)
print(f"--- Index 0 ---")
print(f"San: {dataset['train'][0]['src']}")
print(f"Eng: {dataset['train'][0]['tgt']}")

# Check index 100 (Early)
print(f"\n--- Index 100 ---")
print(f"San: {dataset['train'][145]['src']}")
print(f"Eng: {dataset['train'][145]['tgt']}")

# Check index 1000 (Middle)
print(f"\n--- Index 1000 ---")
print(f"San: {dataset['train'][1000]['src']}")
print(f"Eng: {dataset['train'][1000]['tgt']}")

Checking data alignment at different points...
--- Index 0 ---
San: (‡§¨‡§ø‡§®‡•ç‡§¶‡•Å 1) ‡§≠‡§µ‡§®‡•ç‡§§: ‡§è‡§§‡§Ç ‡§™‡§Ç‡§ï‡•ç‡§§‡•á: ‡§Ö‡§®‡•ç‡§§‡§ø‡§Æ‡§∂‡•Ä‡§∞‡•ç‡§∑‡§Ç ‡§™‡§∞‡•ç‡§Ø‡§®‡•ç‡§§‡§Ç ‡§Ü‡§µ‡§∞‡•ç‡§§‡§Ø‡•á‡§§ ‡•§
Eng: (point 1), so you are ready to repeat to the end of the row

--- Index 100 ---
San: ‡§Ø‡§§‡•ã ‡§µ‡•ç‡§Ø‡§µ‡§∏‡•ç‡§•‡§Ø‡§æ ‡§Ø‡•á ‡§Æ‡§π‡§æ‡§Ø‡§æ‡§ú‡§ï‡§æ ‡§®‡§ø‡§∞‡•Ç‡§™‡•ç‡§Ø‡§®‡•ç‡§§‡•á ‡§§‡•á ‡§¶‡•å‡§∞‡•ç‡§¨‡•ç‡§¨‡§≤‡•ç‡§Ø‡§Ø‡•Å‡§ï‡•ç‡§§‡§æ ‡§Æ‡§æ‡§®‡§µ‡§æ‡§É ‡§ï‡§ø‡§®‡•ç‡§§‡•Å ‡§µ‡•ç‡§Ø‡§µ‡§∏‡•ç‡§•‡§æ‡§§‡§É ‡§™‡§∞‡§Ç ‡§∂‡§™‡§•‡§Ø‡•Å‡§ï‡•ç‡§§‡•á‡§® ‡§µ‡§æ‡§ï‡•ç‡§Ø‡•á‡§® ‡§Ø‡•ã ‡§Æ‡§π‡§æ‡§Ø‡§æ‡§ú‡§ï‡•ã ‡§®‡§ø‡§∞‡•Ç‡§™‡§ø‡§§‡§É ‡§∏‡•ã ‡§Ω‡§®‡§®‡•ç‡§§‡§ï‡§æ‡§≤‡§æ‡§∞‡•ç‡§•‡§Ç ‡§∏‡§ø‡§¶‡•ç‡§ß‡§É ‡§™‡•Å‡§§‡•ç‡§∞ ‡§è‡§µ‡•§
Eng: For the law maketh men high priests which have infirmity; but the word of the oath, which was since the law, maketh the Son, who is consecrated for evermore.

--- Index 1000 ---
San: ‡§™‡§∂‡•ç‡§ö‡§æ‡§§‡•ç  Finish  ‡§á‡§§‡•ç‡§Ø‡§§‡•ç‡§∞ 

In [106]:
# Cell 8: Start Training
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("üöÄ Starting training...")
trainer.train()
print("‚úì Training complete.")

  trainer = Seq2SeqTrainer(


üöÄ Starting training...




Epoch,Training Loss,Validation Loss,Bleu
1,3.9953,3.636428,3.960434
2,3.5691,3.469731,4.930482
3,3.2405,3.388294,5.14559
4,3.1003,3.336191,5.845201
5,2.8132,3.30305,6.618728
6,2.7033,3.313157,6.302028
7,2.5145,3.282174,7.374906
8,2.367,3.316272,7.352389
9,2.2595,3.32785,7.800122
10,2.0809,3.333607,8.037683




‚úì Training complete.


In [107]:
# Cell 9: Save Model
model.save_pretrained("./final_sanskrit_model")
tokenizer.save_pretrained("./final_sanskrit_model")
print("‚úì Model saved.")

‚úì Model saved.


In [1]:
# Cell 10: Test the Translator
def translate(text):
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt").to(device)
    
    # Generate
    outputs = model.generate(
        inputs["input_ids"], 
        max_length=64, 
        num_beams=4, 
        early_stopping=True,
        no_repeat_ngram_size=2
    )
    
    # Decode
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

# Test sentences
tests = [
    "‡§Ö‡§§‡§É ‡§µ‡§Ø‡§Æ‡§ø‡§¶‡§Ç ‡§ï‡§®‡•ç‡§∏‡•ç‡§ü‡•ç‡§∞‡§ï‡•ç‡§ü‡§∞‡•ç ‡§Ö‡§∏‡•ç‡§Ø ‡§™‡•ç‡§∞‡§•‡§Æ‡§Ç ‡§™‡§ô‡•ç‡§ï‡•ç‡§§‡§ø‡§Ç ‡§ï‡•Å‡§∞‡•ç‡§Æ‡§É ‡•§",   # Rama goes to the forest
    "‡§§‡•ç‡§µ‡§Ç ‡§ö‡§ø‡§§‡•ç‡§∞‡§Æ‡•ç ‡§Ö‡§™‡§∂‡•ç‡§Ø‡§É ", # I go to school
    "‡§§‡•á ‡§µ‡•Ä‡§∞‡§æ‡§É ‡•§"    # Dharma protects those who protect it
]

print("-" * 30)
for t in tests:
    print(f"Sanskrit: {t}")
    print(f"English:  {translate(t)}")
    print("-" * 30)

------------------------------
Sanskrit: ‡§Ö‡§§‡§É ‡§µ‡§Ø‡§Æ‡§ø‡§¶‡§Ç ‡§ï‡§®‡•ç‡§∏‡•ç‡§ü‡•ç‡§∞‡§ï‡•ç‡§ü‡§∞‡•ç ‡§Ö‡§∏‡•ç‡§Ø ‡§™‡•ç‡§∞‡§•‡§Æ‡§Ç ‡§™‡§ô‡•ç‡§ï‡•ç‡§§‡§ø‡§Ç ‡§ï‡•Å‡§∞‡•ç‡§Æ‡§É ‡•§


NameError: name 'tokenizer' is not defined

In [112]:
# Cell 11: Save Model for Download
import shutil
import os
from IPython.display import FileLink

# 1. Define where to save
save_path = "sanskrit_translator_final"

print(f"Saving model to {save_path}...")

# 2. Save Model and Tokenizer
#    (We save both because the model needs the exact same tokenizer to work)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("‚úì Model and Tokenizer saved.")

# 3. Zip the folder
#    (Kaggle makes it easier to download one zip file than 5-6 separate files)
zip_filename = "sanskrit_model.zip"
shutil.make_archive("sanskrit_model", 'zip', save_path)

print(f"‚úì Zipped into {zip_filename}")

# 4. Create a Download Link
print("\n‚¨áÔ∏è Click the link below to download your model:")
display(FileLink(zip_filename))

Saving model to sanskrit_translator_final...
‚úì Model and Tokenizer saved.
‚úì Zipped into sanskrit_model.zip

‚¨áÔ∏è Click the link below to download your model:


In [86]:
import pandas as pd
import re

def clean_line(line):
    # Remove the tags and leading/trailing whitespace
    cleaned = re.sub(r'\', '', line).strip()
    return cleaned

def prepare_saamayik(en_path, sa_path, output_csv="saamayik_cleaned.csv"):
    # Read files
    with open(en_path, "r", encoding="utf-8") as f_en, \
         open(sa_path, "r", encoding="utf-8") as f_sa:
        
        en_lines = f_en.readlines()
        sa_lines = f_sa.readlines()

    # Ensure alignment (rudimentary check)
    if len(en_lines) != len(sa_lines):
        print(f"Warning: Line counts differ! En: {len(en_lines)}, Sa: {len(sa_lines)}")
        # You might need more advanced alignment if lines are missing, 
        # but Saamayik is usually 1:1.

    data = []
    
    # Process lines
    for en, sa in zip(en_lines, sa_lines):
        clean_en = clean_line(en)
        clean_sa = clean_line(sa)
        
        # Only add if both have content
        if clean_en and clean_sa:
            data.append({"English": clean_en, "Sanskrit": clean_sa})

    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Save to CSV
    df.to_csv(output_csv, index=False)
    print(f"Successfully saved {len(df)} pairs to {output_csv}")
    return df

# Usage (assuming your files are named dev.en and dev.sa)
# df = prepare_saamayik('dev.en', 'dev.sa')
# print(df.head())

SyntaxError: unterminated string literal (detected at line 6) (1700770016.py, line 6)