In [None]:
import os
import requests
import zipfile
import json
import random
import torch

In [None]:
# Reference only: download URLs of the OPUS wikimedia Moses archives
# (presumably the origin of the files under /kaggle/input -- confirm).
# Kept as a bare string literal, so nothing here is downloaded or executed.
'''
zip links for all the datasets
datasets = {
    "french": "https://object.pouta.csc.fi/OPUS-wikimedia/v20230407/moses/en-fr.txt.zip",
    "spanish": "https://object.pouta.csc.fi/OPUS-wikimedia/v20230407/moses/en-es.txt.zip",
    "hindi" : "https://object.pouta.csc.fi/OPUS-wikimedia/v20230407/moses/en-hi.txt.zip"
}
'''

In [None]:
# Locations of the line-aligned parallel text files, one (English, foreign)
# pair per language.

# English <-> French
FRENCH_EN_PATH, FRENCH_FR_PATH = (
    "/kaggle/input/french/wikimedia.en-fr.en",
    "/kaggle/input/french/wikimedia.en-fr.fr",
)

# English <-> Spanish
SPANISH_EN_PATH, SPANISH_ES_PATH = (
    "/kaggle/input/spanish/wikimedia.en-es.en",
    "/kaggle/input/spanish/wikimedia.en-es.es",
)

# English <-> Hindi
# NOTE(review): the directory is spelled "hindii" -- presumably the real
# dataset slug; confirm before "correcting" it.
HINDI_EN_PATH, HINDI_HI_PATH = (
    "/kaggle/input/hindii/wikimedia.en-hi.en",
    "/kaggle/input/hindii/wikimedia.en-hi.hi",
)

In [None]:
def convert_to_json(source_file, target_file, source_lang):
    """Pair up two line-aligned text files into a list of translation records.

    Despite the name, nothing is serialized here: the result is a plain list
    of dicts that can later be dumped as JSON.

    Args:
        source_file: Path to the UTF-8 file holding one source sentence per line.
        target_file: Path to the UTF-8 file holding the matching target
            sentence on the same line number.
        source_lang: Language tag stored verbatim in every record.

    Returns:
        A list of ``{"source_lang", "source_text", "target_text"}`` dicts, one
        per line pair, with surrounding whitespace stripped from both texts.

    Warns:
        UserWarning: if the two files do not have the same number of lines.
            Only the pairs before the shorter file ends are returned (same as
            before), but the mismatch is no longer silent because it usually
            means the corpus is misaligned.
    """
    import warnings
    from itertools import zip_longest

    exhausted = object()  # sentinel marking "this file ran out of lines"
    data = []
    with open(source_file, "r", encoding="utf-8") as src_f, open(target_file, "r", encoding="utf-8") as tgt_f:
        for source_text, target_text in zip_longest(src_f, tgt_f, fillvalue=exhausted):
            if source_text is exhausted or target_text is exhausted:
                warnings.warn(
                    f"{source_file!r} and {target_file!r} have different line counts; "
                    f"keeping only the first {len(data)} aligned pairs",
                    stacklevel=2,
                )
                break
            data.append({
                "source_lang": source_lang,
                "source_text": source_text.strip(),
                "target_text": target_text.strip(),
            })

    return data

In [None]:
# Build one record list per language pair. The foreign-language file is the
# source side and the English file is the target side; the third item is the
# tag stored in each record's "source_lang" field.
french_data, spanish_data, hindi_data = (
    convert_to_json(src_path, tgt_path, lang_tag)
    for src_path, tgt_path, lang_tag in (
        (FRENCH_FR_PATH, FRENCH_EN_PATH, "fra"),
        (SPANISH_ES_PATH, SPANISH_EN_PATH, "spa"),
        (HINDI_HI_PATH, HINDI_EN_PATH, "hin"),
    )
)

In [None]:
# Upper bound on the number of examples kept per language.
min_size = 10_000

In [None]:
# Keep at most the first `min_size` records of each language so no single
# language dominates the combined training set.
french_data, spanish_data, hindi_data = (
    french_data[:min_size],
    spanish_data[:min_size],
    hindi_data[:min_size],
)

In [None]:
# Sanity check: show up to the first 10 Hindi records.
hindi_data[:10]

In [None]:
# Sanity check: show up to the first 10 Spanish records.
spanish_data[:10]

In [None]:
# Sanity check: show up to the first 10 French records.
french_data[:10]

In [None]:
def save_json(data, filename):
    """Serialize ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped, and nested
    structures are indented by four spaces. A confirmation line is printed
    once the file has been written.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; an existing file is overwritten.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)
    print(f"Saved {filename}")

In [None]:
# Persist each truncated language subset to its own JSON file in the working
# directory; these files are reloaded before tokenization.
save_json(french_data, "french_to_english.json")
save_json(spanish_data, "spanish_to_english.json")
save_json(hindi_data, "hindi_to_english.json")

# The inputs are the OPUS "wikimedia" corpus (see the file names in the path
# constants), not WikiMatrix, so the status message names the right corpus.
print("OPUS wikimedia datasets processed, balanced, and saved as JSON!")

In [None]:
!pip install transformers sentencepiece

In [None]:
from transformers import AutoTokenizer

# Hugging Face checkpoint used for both the tokenizer and the model.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Maps the "source_lang" tag stored in each record to the language code the
# NLLB tokenizer expects. French is "fra_Latn"; the previous "fre_Latn" is not
# a code in the NLLB vocabulary.
SOURCE_LANGS = {"fra": "fra_Latn", "spa": "spa_Latn", "hin": "hin_Deva"}

In [None]:
# Report how many records each language contributes, one count per line
# (French, Spanish, Hindi).
print(len(french_data), len(spanish_data), len(hindi_data), sep="\n")

In [None]:
import json
from tqdm import tqdm

def _read_json(path):
    """Return the parsed contents of the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Reload the per-language record lists from disk, then concatenate them in
# French, Spanish, Hindi order into a single list.
french_data = _read_json("french_to_english.json")
spanish_data = _read_json("spanish_to_english.json")
hindi_data = _read_json("hindi_to_english.json")

dataset = french_data + spanish_data + hindi_data

def tokenize_example(example):
    """Tokenize one translation record into fixed-length model inputs.

    The source text is encoded with the record's own NLLB language code and
    the English target is encoded as the label sequence. Both are padded or
    truncated to 128 tokens.

    Args:
        example: Dict with "source_lang" (a key of SOURCE_LANGS),
            "source_text" and "target_text".

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" (lists of ints).
        Padding positions in "labels" are set to -100.

    Raises:
        KeyError: if the record's "source_lang" has no entry in SOURCE_LANGS.
        ValueError: if the mapped language code is unknown to the tokenizer.
    """
    source_lang = SOURCE_LANGS[example["source_lang"]]

    # An unknown code would be encoded as <unk> and quietly train the model
    # on a meaningless language tag, so fail loudly instead.
    if tokenizer.convert_tokens_to_ids(source_lang) == tokenizer.unk_token_id:
        raise ValueError(
            f"{source_lang!r} is not a language code known to the tokenizer"
        )

    # The tokenizer prefixes each sequence with a language token. Previously
    # the looked-up code was never applied, so every input was tagged with
    # the tokenizer's default source language.
    tokenizer.src_lang = source_lang
    tokenizer.tgt_lang = "eng_Latn"

    # `text_target` encodes the target in target mode (tagged with tgt_lang)
    # and returns it under "labels".
    encoded = tokenizer(
        example["source_text"],
        text_target=example["target_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # -100 is the index ignored by the cross-entropy loss; leaving the real
    # pad id in the labels makes padding count toward the loss.
    pad_id = tokenizer.pad_token_id
    labels = [-100 if token_id == pad_id else token_id for token_id in encoded["labels"]]

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

# Tokenize every record in order, showing a progress bar while doing so.
tokenized_dataset = [
    tokenize_example(example)
    for example in tqdm(dataset, desc="Tokenizing examples", unit="ex")
]

# Cache the tokenized records on disk so later cells can reload them.
with open("tokenized_data.json", mode="w", encoding="utf-8") as out_file:
    json.dump(tokenized_dataset, out_file, ensure_ascii=False, indent=4)

print("Tokenization complete! Saved as tokenized_data.json")


In [None]:
!pip install datasets

In [None]:
from datasets import Dataset

# Reload the cached tokenized records and wrap them in a Hugging Face Dataset.
with open("tokenized_data.json", "r", encoding="utf-8") as f:
    tokenized_data = json.load(f)

hf_dataset = Dataset.from_list(tokenized_data)

# Hold out 10% of the examples for validation. The split shuffles the data,
# so a fixed seed is passed to make train/validation membership identical on
# every run (previously it changed with each execution).
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = hf_dataset["train"]
val_dataset = hf_dataset["test"]

print("Dataset loaded and split!")

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence checkpoint named by MODEL_NAME;
# this is the model object that gets fine-tuned and saved further down.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- where things are written ---
    output_dir="./results",
    logging_dir="./logs",
    push_to_hub=False,
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluation / checkpoint / logging cadence ---
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="no",  # no intermediate checkpoints are saved
    logging_steps=1000,
)

print("Training arguments set!")

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Bundle the model, hyper-parameters, data splits and collator into the
# object that drives training and evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
import wandb

# Never embed the API key in the notebook: anyone who can read the file can
# use it. wandb.login() reads the key from the WANDB_API_KEY environment
# variable or existing local credentials and, failing that, prompts for it.
# The key that used to be written in this cell must be treated as leaked and
# revoked in the W&B account settings.
wandb.login()

In [None]:
import wandb
# Start a Weights & Biases run in the "fre_spa_hin_to_eng" project.
# NOTE(review): presumably the Trainer reports its logs to this run -- confirm
# via TrainingArguments.report_to for the installed transformers version.
wandb.init(project="fre_spa_hin_to_eng")

In [None]:
# Run fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset and print the returned metrics dict.
# NOTE(review): no compute_metrics function is passed to Trainer in this
# notebook, so presumably this reports loss/runtime figures only -- confirm.
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

In [None]:
# Write the fine-tuned model first and then the tokenizer into the same
# directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_nllb")

print("Fine-tuned model saved successfully!")