In [1]:
import pandas as pd

from transformers import MBartTokenizer, MBartForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
model_name = "facebook/mbart-large-50"

In [3]:
data = pd.read_csv("./data/English_Hinglish_Corpus.csv")
data.head()

Unnamed: 0,english,hinglish
0,Are Barcelona playing today at the camp nou?,क्या barcelona आज camp nou में खेल रहा है?
1,The last time we went to the USA was in 1994.,आखिरी बार हम 1994 में USA गए थे।
2,I am thinking about applying for the MS progra...,मैं UC Berkley में MS program के लिए apply करन...
3,I think Stanford would be too expensive for me.,मुझे लगता है कि Stanform मेरे लिए बहुत महंगा ह...
4,I got my driver's license today.,मुझे आज अपना Driver's License मिल गया।


In [4]:
dataset = data.to_dict(orient='records')

In [5]:
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Tokenize and format your dataset
tokenized_dataset = tokenizer.prepare_seq2seq_batch(src_texts=[item["english"] for item in dataset], tgt_texts=[item["hinglish"] for item in dataset], return_tensors="pt", padding=True, truncation=True, max_length=1024)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [7]:
# Create a custom dataset
class CustomDataset(Dataset):
    def __init__(self, tokenized_dataset):
        self.data = tokenized_dataset

    def __len__(self):
        return len(self.data["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.data["input_ids"][idx],
            "attention_mask": self.data["attention_mask"][idx],
            "labels": self.data["labels"][idx]
        }

custom_dataset = CustomDataset(tokenized_dataset)

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-model",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=1,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=None,  # Use the default data collator
    train_dataset=custom_dataset,  # Provide the custom dataset
)

In [9]:
# Start fine-tuning
trainer.train()



  0%|          | 0/64 [00:00<?, ?it/s]

{'train_runtime': 615.4312, 'train_samples_per_second': 0.208, 'train_steps_per_second': 0.104, 'train_loss': 6.708470344543457, 'epoch': 2.0}


TrainOutput(global_step=64, training_loss=6.708470344543457, metrics={'train_runtime': 615.4312, 'train_samples_per_second': 0.208, 'train_steps_per_second': 0.104, 'train_loss': 6.708470344543457, 'epoch': 2.0})

In [10]:
# Save the fine-tuned model
trainer.save_model("./fine-tuned-model")

In [11]:
# Load the fine-tuned model and tokenizer
upd_model = MBartForConditionalGeneration.from_pretrained("./fine-tuned-model")
tokenizer = MBartTokenizer.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
input_text_one = "Definitely share your feedback in the comment section"
input_text_two = "So even if it's a big video, I will clearly mention all the products."
input_text_three = "I was waiting for my bag."

In [13]:
input_ids_one = tokenizer.encode("en_" + input_text_one, return_tensors="pt", max_length=1024, padding="max_length", truncation=True)

In [14]:
translation_one = upd_model.generate(input_ids_one, max_length=1024, num_return_sequences=1, decoder_start_token_id=upd_model.config.decoder_start_token_id)

KeyboardInterrupt: 

In [None]:
translated_text_one = tokenizer.decode(translation_one[0], skip_special_tokens=True)

In [17]:
input_ids_two = tokenizer.encode("en_" + input_text_two, return_tensors="pt", max_length=1024, padding="max_length", truncation=True)

In [None]:
translation_two = upd_model.generate(input_ids_two, max_length=1024, num_return_sequences=1, decoder_start_token_id=upd_model.config.decoder_start_token_id)

In [None]:
translated_text_two = tokenizer.decode(translation_two[0], skip_special_tokens=True)

In [15]:
input_ids_three = tokenizer.encode("en_" + input_text_three, return_tensors="pt", max_length=1024, padding="max_length", truncation=True)

In [16]:
translation_three = upd_model.generate(input_ids_three, max_length=1024, num_return_sequences=1, decoder_start_token_id=upd_model.config.decoder_start_token_id)

KeyboardInterrupt: 

In [None]:
translated_text_three = tokenizer.decode(translation_three[0], skip_special_tokens=True)

In [23]:
print("The translated texts are:")
print("1)" + translated_text_one)
print("2)" + translated_text_two)
print("3)" + translated_text_three)

The translated texts are:
1)आपकी feedback को comment section में जरूर share करें।
3)मैं उसे bag waiting करने के लिए waiting कर रहा था।
