In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from IndicTransToolkit import IndicProcessor

# Device config: use the GPU when one is visible, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Language tags for Hindi (source) and Bhojpuri (target)
# NOTE(review): "unk_Deva" is not the FLORES-200 code for Bhojpuri (that is
# "bho_Deva"); confirm which tag the IndicTrans2 tokenizer accepts before training.
src_lang, tgt_lang = "hin_Deva", "unk_Deva"

# Hindi -> Bhojpuri is an Indic -> Indic pair, so start from the Indic-Indic
# checkpoint. The "indic-en" checkpoint used previously is the Indic -> English
# model, whose target side is built for English and cannot represent Devanagari labels.
model_name = "ai4bharat/indictrans2-indic-indic-dist-320M"

# Load tokenizer and model (trust_remote_code is required: both classes are
# defined by code shipped in the model repository)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to(DEVICE)

# Init IndicProcessor
ip = IndicProcessor(inference=False)  # Training mode

# Load your dataset (Hindi to Bhojpuri)
df = pd.read_csv("/kaggle/input/hindi-to-bhojpuri/translated_hin_bho.csv")  # Columns: 'input' and 'target'

# Preprocessing: Apply IndicProcessor and tokenization
def preprocess_function(examples, max_length=128):
    """Turn a batch of raw sentence pairs into tokenized model inputs and labels.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            reads the ``"input"`` (source) and ``"target"`` columns.
        max_length: Maximum number of tokens kept per sentence on either side;
            longer sequences are truncated.

    Returns:
        The source-side tokenizer output with an extra ``"labels"`` entry holding
        the target token ids. Sequences are left unpadded so that the data
        collator can pad each batch dynamically (and mask label padding).
    """
    src_sentences = ip.preprocess_batch(
        examples["input"], src_lang=src_lang, tgt_lang=tgt_lang, is_target=False
    )
    # is_target=True: normalize the target text only. Without it the processor
    # treats the text as source and prepends the language tags, which would
    # then be trained on as part of the labels.
    tgt_sentences = ip.preprocess_batch(
        examples["target"], src_lang=src_lang, tgt_lang=tgt_lang, is_target=True
    )

    # No padding here: padding labels to max_length with the pad id makes the
    # loss count padding positions and defeats per-batch dynamic padding.
    model_inputs = tokenizer(
        src_sentences, max_length=max_length, truncation=True, padding=False
    )
    # text_target switches the tokenizer to its target-side mode so labels are
    # encoded with the decoder vocabulary rather than the source one.
    labels = tokenizer(
        text_target=tgt_sentences, max_length=max_length, truncation=True, padding=False
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert pandas to Hugging Face dataset
hf_dataset = Dataset.from_pandas(df)
# batched=True: preprocess_function is called once per batch of rows and
# receives a mapping of column name -> list of values
tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)

# Training Arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./indictrans2-bhojpuri-finetuned",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    logging_steps=50,
    save_steps=200,
    num_train_epochs=3,
    # fp16 mixed precision needs a CUDA device; the notebook explicitly falls
    # back to CPU, so only enable it when a GPU is actually available.
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    warmup_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    report_to="none",  # disable external experiment trackers
)

# Data collator: pads each batch to its longest sequence and pads labels with
# -100 so padded label positions are ignored by the loss
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Hold out a validation split that is NOT part of the training data.
# Evaluating on rows the model is trained on gives optimistic, meaningless
# metrics, and a fixed range(100) crashes on datasets smaller than 100 rows.
eval_size = min(100, max(1, len(tokenized_dataset) // 10))
dataset_splits = tokenized_dataset.train_test_split(test_size=eval_size, seed=42)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],  # small held-out slice for validation
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start fine-tuning
trainer.train()
