In [None]:
############# Install the dependencies ##################
# !pip install datasets transformers evaluate sacrebleu

####   if code fails below while creating the training arguments , run the following and restart the kernel   #######
# !pip install --upgrade accelerate


# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate

In [1]:
## Load the parallel corpus the model will be fine-tuned on.
# ai4bharat/samanantar is published as per-language subsets; pass the subset
# name as the second argument ("or" is the subset loaded here).
from datasets import load_dataset

odia = load_dataset("ai4bharat/samanantar", "or")



  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
print(odia)
# The dataset has three features: `idx`, `src` (the English text) and `tgt`
# (the Odia translation of `src`).

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 998228
    })
})


In [3]:
# Confirm the container type: a DatasetDict keyed by split name.
print(type(odia))

<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
# Split the single "train" split into train (80%) and test (20%) sets.
# A fixed seed makes the shuffle deterministic, so a kernel restart reproduces
# the same partition and evaluation scores stay comparable between runs.
odia = odia["train"].train_test_split(test_size=0.2, seed=42)

In [5]:
## Import the tokenizer that matches the checkpoint being fine-tuned.
from transformers import AutoTokenizer

checkpoint = "facebook/nllb-200-distilled-600M"
# NLLB identifies languages with special language-code tokens (FLORES-200
# codes). Declare both sides of the pair up front so that source text is tagged
# as English and target text as Odia.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    src_lang="eng_Latn",
    tgt_lang="ory_Orya",
)

In [6]:
 # odia["train"]['src']

In [10]:
## Tokenization function
# NOTE(review): NLLB is normally steered by language-code tokens rather than a
# textual task prefix. The prefix is kept as originally written; if it stays,
# the same prefix must be prepended to inputs at inference time.
prefix="translate English to Odia : "
def preprocess(odia):
    """Tokenize one batch of English/Odia sentence pairs for seq2seq training.

    Args:
        odia: A batch from the dataset (as passed by ``Dataset.map`` with
            ``batched=True``) with the columns ``idx``, ``src`` (English text)
            and ``tgt`` (Odia translation).

    Returns:
        A dict of new columns:
            ``id``             - copy of ``idx``.
            ``translation``    - the raw Odia target strings.
            ``input_ids``      - token ids of the prefixed English source only.
            ``attention_mask`` - attention mask for ``input_ids``.
            ``labels``         - token ids of the Odia target.

    Any tokenizer error is allowed to propagate: returning partially filled
    columns would only hide the failure behind a less helpful one later.
    """
    # Make the language pair explicit here so the function is correct no matter
    # how the tokenizer was constructed (and inside worker processes too).
    tokenizer.src_lang = "eng_Latn"
    tokenizer.tgt_lang = "ory_Orya"

    # English sentences (with the prefix) are the encoder inputs; the Odia
    # translations are the decoder targets.
    inputs = [prefix + text for text in odia['src']]
    target = list(odia['tgt'])

    # `text_target` tokenizes the targets separately and returns them under
    # "labels". Passing them as `text_pair` would instead append the target to
    # the encoder input, leaking the answer into the source sequence.
    tokenized_samples = tokenizer(
        inputs,
        text_target=target,
        max_length=128,
        truncation=True,
    )

    return {
        'id': odia['idx'],
        'translation': target,
        'input_ids': tokenized_samples['input_ids'],
        'attention_mask': tokenized_samples['attention_mask'],
        'labels': tokenized_samples['labels'],
    }


In [11]:
# Apply the tokenization function to every split; batched=True hands
# `preprocess` a batch of rows per call instead of a single row.
tokenized_odia = odia.map(preprocess, batched=True)


Map:   0%|          | 0/798582 [00:00<?, ? examples/s]

Map:   0%|          | 0/199646 [00:00<?, ? examples/s]

In [12]:
# Inspect the dataset after tokenization to see the columns added by `preprocess`.
print(tokenized_odia)

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt', 'id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 798582
    })
    test: Dataset({
        features: ['idx', 'src', 'tgt', 'id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 199646
    })
})


In [13]:
# Import the data collator, which pads inputs and labels and assembles the
# batches fed to the seq2seq model.
from transformers import DataCollatorForSeq2Seq

# The collator's optional `model` argument expects the loaded model object, not
# a checkpoint name; a string there is silently ignored. The model is only
# loaded in a later cell, so the argument is omitted here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [14]:
# Load the SacreBLEU metric, used in `compute_metrics` to compute the BLEU score.
import evaluate

metric = evaluate.load("sacrebleu")

In [15]:
# Functions that clean up decoded text and compute the BLEU score.
import numpy as np
def postprocess(pred,labels):
  """Strip whitespace and shape predictions/references for SacreBLEU.

  Args:
    pred: Iterable of decoded prediction strings.
    labels: Iterable of decoded reference strings.

  Returns:
    A tuple ``(preds, labels)`` where ``preds`` is a list of stripped strings
    and ``labels`` is a list of single-element lists (one reference per
    prediction).
  """
  preds = [text.strip() for text in pred]
  labels = [[label.strip()] for label in labels]
  return preds, labels


def compute_metrics(eval_preds):
  """Compute BLEU and the mean generated length for an evaluation pass.

  Args:
    eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
      ``predictions`` may itself be a tuple, in which case its first element
      is used.

  Returns:
    A dict with ``bleu`` (SacreBLEU score) and ``gen_len`` (mean number of
    non-padding tokens per prediction), both rounded to 4 decimal places.
  """
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 marks positions ignored by the loss; it is not a real token id, so
  # replace it with the pad token before decoding either array.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  decoded_preds, decoded_labels = postprocess(decoded_preds, decoded_labels)

  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"bleu": result["score"]}

  # Generated length = number of non-padding tokens in each prediction.
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}
  return result

In [16]:
# Load the pretrained seq2seq model (PyTorch) that the trainer will fine-tune.
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# `checkpoint` holds the same "facebook/nllb-200-distilled-600M" identifier the
# tokenizer was loaded from, which keeps model and tokenizer in sync.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [17]:
# Hyperparameters for the fine-tuning run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the trainer's `tokenizer` argument to `processing_class`
# — confirm against the installed version before changing.
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tune_model",  # checkpoints are written here
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept
    num_train_epochs=2,
    predict_with_generate=True,  # compute_metrics decodes predicted token ids
    # push_to_hub=True,
)
# Wire the model, arguments, datasets, collator and metric function into the trainer.
# NOTE(review): the eval set is the full 20% test split, evaluated every epoch
# — consider a smaller subset if evaluation time is a concern.

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_odia["train"],
    eval_dataset=tokenized_odia["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Start fine-tuning with the trainer configured above in this notebook.
trainer.train()

You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
