<a href="https://colab.research.google.com/github/Supratim0406/100-days-of-machine-learning/blob/main/Machine_Translation_With_HuggingFace_Transformers___Language_Translation_With_Seq2Seq_Transformer___English_To_Urdu_Using_LLM_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [36]:
!pip install datasets



In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Loading the Dataset


In [38]:
## Mount the drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/hindi_english_parallel.csv') # Replace with your dataset file
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [40]:
# Rename columns for clarity
df.columns = ['target_text', 'source_text']  # Rename columns for clarity
df = df[['source_text','target_text']]
df.head()

Unnamed: 0,source_text,target_text
0,Give your application an accessibility workout,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,Accerciser Accessibility Explorer,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,The default plugin layout for the bottom panel,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,The default plugin layout for the top panel,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4,A list of plugins that are disabled by default,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...


In [41]:
## Take only 6000 samples from the dataset
df = df.sample(n=6000, random_state=42).reset_index(drop=True)
df.shape

(6000, 2)

In [42]:
## Check for null values
df.isnull().sum()

Unnamed: 0,0
source_text,5
target_text,28


In [43]:
## Remove null values
df = df.dropna()
df.isnull().sum()

Unnamed: 0,0
source_text,0
target_text,0


In [44]:
# Convert all entries to strings
df["source_text"] = df["source_text"].astype(str)
df["target_text"] = df["target_text"].astype(str)
df.shape

(5971, 2)

## Split the data into Train and Test

In [45]:
# Split dataset into training and validation sets
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [46]:
train_dataset

Dataset({
    features: ['source_text', 'target_text', '__index_level_0__'],
    num_rows: 5373
})

# Tokenization

In [47]:
from transformers import AutoTokenizer

# Load a tokenizer for the chosen model (e.g., mT5 or mBART)
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")


In [48]:
input_max_len = max([len(tokenizer.encode(text)) for text in df['source_text']])
input_max_len

236

In [49]:
output_max_len = max([len(tokenizer.encode(text)) for text in df['target_text']])
output_max_len

294

In [50]:

# Tokenization function
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples['source_text'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        examples['target_text'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/5373 [00:00<?, ? examples/s]

Map:   0%|          | 0/598 [00:00<?, ? examples/s]

In [51]:
train_dataset[0]

{'input_ids': tensor([250004,   4966,   6637,     70,  42878,   2450,  30310,      7,  68034,
             98,   2367,    642,   1884,     47,   5351,     83,     10,     52,
            184,  25667,    970,  40250,      4,    642,    765,    959,   1902,
           2499,   5122,     18,    111,  32603,   4935,  68828,   8780,   3934,
            450,  45559,    221,   2060,     20, 102971,   2685,   1556,   2809,
           5045, 145456,   3934,     70,  67153,   5180,    111,  22458,      7,
            450, 134629,      5,      2,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1

#  Load Pretrained Sequence-to-Sequence Model

In [52]:
import os

results_dir = "/content/drive/MyDrive/results"
model_dir = "/content/drive/MyDrive/my_trans-model"

# Create the results directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [53]:
from transformers import Seq2SeqTrainingArguments

from transformers import AutoModelForSeq2SeqLM

# AutoModelForSeq2SeqLM is a class in the Hugging Face Transformers library that automatically loads a pre-trained sequence-to-sequence model.
# It is used for tasks like machine translation, summarization, and other text generation tasks, where both the encoder and decoder are trained to process input and generate output sequences.

# Load a pre-trained sequence-to-sequence model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

# Set language-specific tokens if using mBART
model.config.decoder_start_token_id = tokenizer.lang_code_to_id["hi_IN"]  # hindi token
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "hi_IN"


# Seq2SeqTrainingArguments is a class in Hugging Face's Transformers library designed specifically for training sequence-to-sequence models.
# It provides various training configurations such as batch size, number of epochs, evaluation strategy, and output directory, optimized for tasks like translation, summarization, or text generation.

training_args = Seq2SeqTrainingArguments(
    output_dir=results_dir,
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    predict_with_generate=True,
    generation_max_length=128,
)


from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [54]:
#Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.3618,0.595426
2,0.5332,0.514211
3,0.3196,0.532777




TrainOutput(global_step=2016, training_loss=0.650774042284678, metrics={'train_runtime': 3915.4489, 'train_samples_per_second': 4.117, 'train_steps_per_second': 0.515, 'total_flos': 4366495591170048.0, 'train_loss': 0.650774042284678, 'epoch': 3.0})

In [55]:
# Evaluate the model on the test dataset
results = trainer.evaluate()
print(results)

{'eval_loss': 0.5327774882316589, 'eval_runtime': 25.1217, 'eval_samples_per_second': 23.804, 'eval_steps_per_second': 2.985, 'epoch': 3.0}


# Save Model

In [56]:

# Create the directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

# Save the fine-tuned model
trainer.save_model(model_dir)

# Save the tokenizer explicitly
tokenizer.save_pretrained(model_dir)


loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)

print("Model and tokenizer reloaded successfully!")


Model and tokenizer reloaded successfully!


# Translation System

In [57]:
def translate_text(text):
    # Tokenize input
    inputs = loaded_tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    # Generate translation
    outputs = loaded_model.generate(inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    # Decode the translation
    translation = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

# Example usage
text_to_translate = "The book of the generation of Jesus Christ"
translated_text = translate_text(text_to_translate)
print("Translated text:", translated_text)


Translated text: इसराईल की वंशज पत्रिका


In [58]:
# Example usage
text_to_translate = "We appreciate you"
translated_text = translate_text(text_to_translate)
print("Translated text:", translated_text)

Translated text: हम आपको अत्यधिक कृतज्ञ हैं


In [None]:
# Example list of English sentences to translate
texts_to_translate = [
    "Children who are sixteen years old or younger are not allowed in the theater",
    "She borrowed the book from him many years ago",
    "She asked him to not quit his job because they needed the money",
    "Tom would've liked to attend Mary's party, unfortunately, he couldn't",
    "When you meet someone for the first time, be careful about your impressions"
]
# Translate each sentence and print the result
for sentence in texts_to_translate:
    translated_text = translate_text(sentence)
    print(f"Original: {sentence}")
    print(f"Translated: {translated_text}\n")

Original: Children who are sixteen years old or younger are not allowed in the theater
Translated: और जो बच्चे छह से छह वर्ष की आयु से कम या ज्यादा की हों उन्हें मल्टीप्लेक्स में प्रवेश नहीं है

Original: She borrowed the book from him many years ago
Translated: वे उनसे कई वर्ष पहले एक पुस्तक ले आए थे।

