In [1]:
!pip install datasets transformers sentencepiece sacrebleu evaluate accelerate -q

In [2]:
import numpy as np
import torch

from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
from transformers import M2M100ForConditionalGeneration, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

In [3]:
# KDE4 English/Nepali sentence pairs; the hub dataset is selected by the
# (lang1, lang2) language pair.
dataset = load_dataset(
    "kde4",
    lang1="en",
    lang2="ne",
)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 89466
    })
})

In [4]:
# Hold out 20% of the corpus for validation. A fixed seed keeps the split
# identical across kernel restarts so results are reproducible.
split_dataset = dataset['train'].train_test_split(train_size=0.8, seed=42)
# train_test_split names the held-out part 'test'; expose it as 'validation'.
split_dataset['validation'] = split_dataset.pop('test')

In [5]:
# Pretrained checkpoint shared by the model and its tokenizer.
checkpoint = "alirezamsh/small100"

model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
# Source/target languages are fixed once here: English -> Nepali.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    src_lang='en',
    tgt_lang='ne',
)

In [6]:
# Sanity check: tokenize one training pair and inspect the encoded fields.
sample_pair = split_dataset['train'][1]['translation']
en_sentence = sample_pair['en']
ne_sentence = sample_pair['ne']

# `text_target` makes the tokenizer also emit `labels` for the Nepali side.
inputs = tokenizer(en_sentence, text_target=ne_sentence)
inputs

{'input_ids': [128022, 11141, 113948, 2], 'attention_mask': [1, 1, 1, 1], 'labels': [128066, 48299, 99867, 86589, 2]}

In [7]:
# Upper bound (in tokens) applied when truncating tokenized sequences.
max_length = 512

def preprocess(examples):
  """Tokenize a batch of English/Nepali translation pairs.

  Args:
    examples: Batch mapping whose 'translation' entry is a sequence of
      dicts with 'en' and 'ne' keys.

  Returns:
    Whatever the module-level `tokenizer` returns for the English texts
    with the Nepali texts passed as `text_target`.
  """
  sources, references = [], []
  for pair in examples['translation']:
    sources.append(pair['en'])
    references.append(pair['ne'])

  return tokenizer(
      sources,
      text_target=references,
      max_length=max_length,
      truncation=True,
  )

# Tokenize every split in batches and drop the raw columns so that only the
# tokenizer's output fields remain.
raw_columns = split_dataset['train'].column_names
tok_ds = split_dataset.map(preprocess, batched=True, remove_columns=raw_columns)

# Have the datasets hand back torch tensors when indexed.
tok_ds.set_format('torch')

Map:   0%|          | 0/71572 [00:00<?, ? examples/s]

Map:   0%|          | 0/17894 [00:00<?, ? examples/s]

In [8]:
# Collator used by the trainer to assemble seq2seq batches; it is given both
# the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

## About the evaluation metric

*   SacreBLEU is a metric commonly used to evaluate the quality of machine translation output.
*   SacreBLEU calculates the precision of n-grams (contiguous sequences of n words) in the machine translation output compared to the reference translations.
*   It combines the n-gram precisions through a geometric mean, scaled by a brevity penalty for overly short outputs, to compute the final score.



In [9]:
# sacreBLEU scorer, fetched through the `evaluate` library; used by
# compute_metrics to score decoded translations.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
  """Decode predicted and reference token ids and score them with sacreBLEU.

  Args:
    eval_preds: A `(predictions, labels)` pair (anything that unpacks into
      two items). If `predictions` is itself a tuple, only its first element
      is used.

  Returns:
    dict: `{'bleu': score}` where `score` is the 'score' entry of the result
    returned by the module-level `metric`.
  """
  preds, labels = eval_preds

  # Some models return extra outputs alongside the generated ids; keep only
  # the first item.
  if isinstance(preds, tuple):
    preds = preds[0]

  pad_id = tokenizer.pad_token_id

  # -100 is a masking placeholder, not a real token id, so it cannot be
  # decoded. Swap it for the pad token, which skip_special_tokens then drops.
  # np.where needs the condition plus BOTH branches; the two-argument form
  # raises ValueError.
  preds = np.where(preds != -100, preds, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Strip surrounding whitespace; each prediction gets a list of references
  # (here a single reference per prediction).
  decoded_preds = [pred.strip() for pred in decoded_preds]
  decoded_labels = [[label.strip()] for label in decoded_labels]

  result = metric.compute(predictions=decoded_preds, references=decoded_labels)

  return {'bleu': result['score']}


In [10]:
# Training configuration for fine-tuning.
# NOTE: evaluation_strategy is "no", so compute_metrics is not invoked during
# training; a checkpoint is still saved at the end of every epoch.
args = Seq2SeqTrainingArguments(
    output_dir = "en-to-ne-translation",
    evaluation_strategy = "no",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,          # keep only the two most recent checkpoints
    num_train_epochs=3,
    predict_with_generate=True,  # use generate() when producing predictions
    # Mixed precision needs a CUDA device; fall back to fp32 otherwise.
    fp16=torch.cuda.is_available(),
    # The string "none" disables logging integrations; passing None does not.
    report_to = "none"
)

In [11]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tok_ds['train'],
    eval_dataset=tok_ds['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Release cached GPU memory before the training run starts.
torch.cuda.empty_cache()
trainer.train()

Step,Training Loss
500,2.73
1000,1.9111
1500,1.7299
2000,1.662
2500,1.5772
3000,1.5051
3500,1.4674
4000,1.4377
4500,1.3815
5000,1.3851


KeyboardInterrupt: ignored

## Inference

In [15]:
english_text = "I am going to the store to buy some groceries."

# Training leaves the model in train mode (dropout active); switch to eval
# mode so generation is deterministic.
model.eval()

# Run on whichever device the model currently lives on instead of assuming
# a CUDA device is present.
device = model.device

# Tokenize the input English text using the tokenizer
inputs = tokenizer(english_text, return_tensors='pt')

# Generate the Nepali translation of the input text using the trained model
generated_tokens = model.generate(
    inputs['input_ids'].to(device),
    attention_mask=inputs['attention_mask'].to(device),
    num_beams=4,
    max_length=512,
)

# Decode the generated tokens to get the final Nepali translation
nepali_translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
nepali_translation

'म केही खैरो खरीद गर्न दुरीमा जान्छु ।'