In [None]:
# Step 1: Install Libraries
!pip install transformers datasets pandas torch sentencepiece -q

In [None]:
# Step 2: Import Libraries
import pandas as pd
from datasets import Dataset
from transformers import MBartForConditionalGeneration, MBartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

In [None]:
# Step 3: Load the Dataset
file_path = "/content/Bhagwad_Gita_QA_Pairs.xlsx"  # Update with your file's path on Colab
df = pd.read_excel(file_path)

# Assuming your dataset has columns: 'Question' and 'Answer' (English, Hindi, Sanskrit)
df = df.dropna()  # Remove rows with missing values

# Example: Choose the Answer column for one language (e.g., Hindi)
language_code = "hi_IN"  # Options: 'hi_IN', 'sa' (for Sanskrit), 'en_XX'
df['Answer'] = df['Answer']  # Adjust as per your column names

In [None]:
# Step 4: Prepare the Dataset
def preprocess_data(examples):
    model_inputs = tokenizer(examples['Question'], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples['Answer'], max_length=128, truncation=True, padding="max_length")
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang=language_code)

tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Split into train and validation datasets
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]


In [None]:
# Step 5: Load mBART-50 Model
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

In [None]:

# Step 6: Define Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart50-finetuned-bhagwadgita",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU is available
    push_to_hub=False,
)

In [None]:
# Step 7: Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


In [None]:

# Step 8: Train the Model
trainer.train()

In [None]:
# Step 9: Save the Fine-tuned Model
trainer.save_model("./mbart50-finetuned-bhagwadgita")
tokenizer.save_pretrained("./mbart50-finetuned-bhagwadgita")


('./mbart50-finetuned-bhagwadgita/tokenizer_config.json',
 './mbart50-finetuned-bhagwadgita/special_tokens_map.json',
 './mbart50-finetuned-bhagwadgita/sentencepiece.bpe.model',
 './mbart50-finetuned-bhagwadgita/added_tokens.json')

In [59]:
# Optional: Push to Hugging Face Hub
# trainer.push_to_hub("mbart50-finetuned-bhagwadgita")

# Step 10: Evaluate the Model with Enhanced Parameters
sample_question = "What is soul in hindi?"

# Tokenize the input question
inputs = tokenizer(
    sample_question,
    return_tensors="pt",
    max_length=256,  # Allow for a longer input
    truncation=True
).to(model.device)

# Generate the answer with custom parameters
output_tokens = model.generate(
    **inputs,
    max_length=1000,  # Maximum length of the generated output
    num_beams=3,     # Use beam search for better coherence
    no_repeat_ngram_size=2,  # Avoid repeating n-grams
    early_stopping=True,  # Stop when a complete sentence is formed
    length_penalty=1.0  # Adjust length bias (higher value favors longer outputs)
)

# Decode the generated answer
decoded_answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Print the results
print(f"Question: {sample_question}")
print(f"Answer: {decoded_answer}")

Question: What is soul in hindi?
Answer: ।।11.20।। व्याख्या -- [इसी अध्यायके आरम्भमें अर्जुनने कहा था कि कामनाओंसे जिनका चित्त हर लिया गया है? ऐसे भक्तियोगी के बनो मत होना चाहिये। क्योंकि भगवान्के सिवाय दूसरी किसी वस्तुकी इच्छा नहीं है। इसलिये वह काम करना नहीं चाहता (टिप्पणी प0 597)। परन्तु जो भक्त मेरी ही उपासना करते हैं? उनके लिये मैं ही सर्वथा तत्पर रहता हूँ -- ऐसा कहते हैं।]यदा भूतपृथग्भावं ৷৷. मनःप्राणेन्द्रिय
