In [1]:
pip install transformers datasets accelerate sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [8]:
!pip install transformers datasets sacremoses



In [8]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd

In [10]:
df = pd.read_csv("hindi_english_parallel.csv")  # Make sure file path is correct
dataset = Dataset.from_pandas(df)
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [7]:
split_dataset = dataset.train_test_split(test_size=0.2)

In [9]:
from transformers import MarianTokenizer, MarianMTModel
model_name = "Helsinki-NLP/opus-mt-hi-en"  # For Hindi to English translation
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [11]:
force_download = True

In [5]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
   ---------------------------------------- 0.0/897.5 kB ? eta -:--:--
   ----------------------------------- ---- 786.4/897.5 kB 8.5 MB/s eta 0:00:01
   ---------------------------------------- 897.5/897.5 kB 6.8 MB/s eta 0:00:00
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [13]:
print(dataset.column_names)

['hindi', 'english']


In [15]:
print(dataset)
print(type(dataset))
print(dataset.column_names)
print(dataset[0])

Dataset({
    features: ['hindi', 'english'],
    num_rows: 1561841
})
<class 'datasets.arrow_dataset.Dataset'>
['hindi', 'english']
{'hindi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'english': 'Give your application an accessibility workout'}


In [12]:
def is_valid(example):
    return isinstance(example["hindi"], str) and isinstance(example["english"], str)

clean_dataset = dataset.filter(is_valid)

Filter:   0%|          | 0/1561841 [00:00<?, ? examples/s]

In [14]:
def preprocess_function(examples):
    return tokenizer(
        examples["hindi"],
        text_target=examples["english"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

In [16]:
tokenized_dataset = clean_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1555574 [00:00<?, ? examples/s]

In [18]:
from datasets import DatasetDict

# 90% train, 10% validation split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)

# Rename keys to match expected names
split_dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"]
})

In [34]:
!pip install --upgrade accelerate



In [18]:
!pip install transformers==4.38.2 accelerate==0.27.2

Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
   ---------------------------------------- 0.0/8.5 MB ? eta -:--:--
   ------ --------------------------------- 1.3/8.5 MB 7.5 MB/s eta 0:00:01
   ------- -------------------------------- 1.6/8.5 MB 8.4 MB/s eta 0:00:01
   ----------- ---------------------------- 2.4/8.5 MB 3.9 MB/s eta 0:00:02
   ---------------------- ----------------- 4.7/8.5 MB 5.8 MB/s eta 0:00:01
   ------------------------------------ --- 7.9/8.5 MB 7.8 MB/s eta 0:00:01
   ---------------------------------------- 8.5/8.5 MB 8.0 MB/s eta 0:00:00
Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
Installing collected packages: accelerate, transformers
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.6.0
    Uninstalling accele

In [20]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Optional: Use smaller subsets for faster training
small_train_dataset = split_dataset["train"].select(range(500))        # You can adjust this
small_eval_dataset = split_dataset["validation"].select(range(100))

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ✅ Training arguments (THIS is where we control epochs & steps)
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,                       # ⬅️ Only 3 epochs!
    logging_steps=100,
    save_total_limit=1,
    save_strategy="no",                       # ⬅️ Don't save checkpoints
    evaluation_strategy="no",                 # ⬅️ Skip evaluation during training
                                    # ⬅️ Optional if on GPU
    report_to="none",                         # ⬅️ Disable W&B/TensorBoard
    predict_with_generate=True,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train!
trainer.train()

Step,Training Loss
100,0.3993


TrainOutput(global_step=189, training_loss=0.3596232984431837, metrics={'train_runtime': 574.9238, 'train_samples_per_second': 2.609, 'train_steps_per_second': 0.329, 'total_flos': 50847547392000.0, 'train_loss': 0.3596232984431837, 'epoch': 3.0})

In [22]:
trainer.save_model("./my_finetuned_model")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}


In [24]:
tokenizer.save_pretrained("./my_finetuned_model")

('./my_finetuned_model\\tokenizer_config.json',
 './my_finetuned_model\\special_tokens_map.json',
 './my_finetuned_model\\vocab.json',
 './my_finetuned_model\\added_tokens.json')

In [4]:
# only for loading

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("./my_finetuned_model")
tokenizer = AutoTokenizer.from_pretrained("./my_finetuned_model")

In [33]:
input_text = "आपका नाम क्या है?"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate translation
output_ids = model.generate(**inputs)
translated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Translated:", translated)

Translated: What is your name?


In [30]:
input_text = "यह एक परीक्षा है"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate translation
output_ids = model.generate(**inputs)
translated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Translated:", translated)

Translated: It is a test


In [32]:
input_text = "मैं स्कूल जा रहा हूँ।"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate translation
output_ids = model.generate(**inputs)
translated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Translated:", translated)

Translated: I am going to school.


In [34]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [38]:
!pip install sacrebleu jiwer

!pip install rouge_score absl-py nltk


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------------------------- -------------- 1.0/1.6 MB 6.3 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 6.2 MB/s eta 0:00:00
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected pack

In [None]:
import evaluate

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")



Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...


Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

In [50]:
import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Limit for quick test
validation_data = split_dataset["validation"].select(range(100))
batch_size = 32

predictions = []
references = []

for i in tqdm(range(0, len(validation_data), batch_size)):
    batch = validation_data.select(range(i, min(i + batch_size, len(validation_data))))
    batch_dict = batch.to_dict()

    hindi_texts = batch_dict["hindi"]
    english_refs = batch_dict["english"]

    inputs = tokenizer(hindi_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(**inputs)
    translated_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    predictions.extend(translated_batch)
    references.extend(english_refs)


100%|██████████| 4/4 [03:30<00:00, 52.57s/it]


In [56]:
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_score = rouge.compute(predictions=predictions, references=references)
meteor_score = meteor.compute(predictions=predictions, references=references)

print("BLEU:", bleu_score)
print("ROUGE:", rouge_score)
print("METEOR:", meteor_score)



BLEU: {'bleu': 0.1889078247813802, 'precisions': [0.47947761194029853, 0.23143236074270557, 0.1358198451794511, 0.08792846497764531], 'brevity_penalty': 0.9900990911851015, 'length_ratio': 0.9901477832512315, 'translation_length': 1608, 'reference_length': 1624}
ROUGE: {'rouge1': 0.4020023626044187, 'rouge2': 0.21414160421906642, 'rougeL': 0.370264331636888, 'rougeLsum': 0.3707874567552187}
METEOR: {'meteor': 0.35038725039883917}
