# Fine Tuning Mbart50 on Telugu Dataset
* Base Model: facebook/mbart-large-50-many-to-many-mmt (https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt)
* Dataset: https://huggingface.co/datasets/ai4bharat/samanantar

## Setup

In [1]:
!pip install transformers datasets peft torch sacrebleu

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu, peft
Successfully installed peft-0.12.0 portalocker-2.10.1 sacrebleu-2.4.2


In [2]:
from transformers import (
    MBartForConditionalGeneration, 
    MBart50Tokenizer, 
    AutoTokenizer, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)
from datasets import (
    load_dataset, 
    load_metric,
    Dataset, 
    DatasetDict
)
import numpy as np
import torch, os

2024-07-31 19:05:55.669168: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 19:05:55.669269: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 19:05:55.799793: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

## Load Dataset

In [4]:
raw_dataset = load_dataset('ai4bharat/samanantar', 'te', split='train',streaming=True, trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.06k [00:00<?, ?B/s]

In [5]:
dataset = raw_dataset.take(10000)

In [6]:
# Convert the IterableDataset to a list
limited_data_list = list(dataset)

# Create a Dataset from the list
limited_data = Dataset.from_list(limited_data_list)

In [7]:
tokenized_datasets = limited_data.train_test_split(test_size=0.2)
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

In [8]:
train_dataset[0]
eval_dataset[0]

{'idx': 7902,
 'src': 'In Goa, while the BJP won three out of four in the bypolls, it also retained the North Goa Lok Sabha seat.',
 'tgt': 'గోవాలో నాలుగు స్థానాలకు ఉపఎన్నికలు జరగ్గా భాజపా మూడు చోట్ల విజయం సాధించింది.'}

## Load Model and Tokenizer

In [9]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['src']]
    targets = [ex for ex in examples['tgt']]
    
    # Tokenize inputs with padding and truncation
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    
    # Tokenize targets with padding and truncation
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    
    # Replace padding token id with -100 for labels to ignore padding in loss calculation
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels
    return model_inputs

In [11]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=1,
    predict_with_generate=True,
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=10,               # Log every 10 steps
    generation_max_length=128,      # Set maximum length for generation
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
metric = load_metric("sacrebleu", trust_remote_code=True)

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # Decode the predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Decode the labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Calculate metrics
    result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return result

  metric = load_metric("sacrebleu", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [14]:
# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Additional logging for debugging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Train and Save the Model

In [15]:
# Directory to save the model
save_directory = "./finetuned-mbart50-en-tel"

# Create the directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Start training
try:
    trainer.train()
except OverflowError as e:
    logger.error(f"OverflowError encountered: {e}")
    for i, batch in enumerate(tokenized_train_dataset):
        try:
            inputs = batch['input_ids']
            labels = batch['labels']
            decoded_inputs = tokenizer.decode(inputs, skip_special_tokens=True)
            decoded_labels = tokenizer.decode(labels, skip_special_tokens=True)
            logger.info(f"Batch {i}:")
            logger.info(f"Inputs: {decoded_inputs}")
            logger.info(f"Labels: {decoded_labels}")
        except OverflowError:
            logger.error(f"OverflowError in batch {i}")

# Save the trained model
trainer.save_model(save_directory)




Epoch,Training Loss,Validation Loss,Score,Counts,Totals,Precisions,Bp,Sys Len,Ref Len
1,3.864,3.849852,3.208745,"[4423, 738, 197, 72]","[17429, 15429, 13433, 11527]","[25.37724482184864, 4.783200466653704, 1.4665376312067298, 0.6246204563199445]",0.988137,17429,17637


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Trainer is attempting to log a value of "[4423, 738, 197, 72]" of type <class 'list'> for key "eval/counts" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[17429, 15429, 13433, 11527]" of type <class 'list'> for key "eval/totals" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[25.37724482184864, 4.783200466653704, 1.4665376312067298, 0.6246204563199445]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


## Load the Model to Generate Translation

In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
# Load your model and tokenizer
model_name =save_directory
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example English sentence to translate
english_sentence = "Hi, How are you?"

# Tokenize the input sentence
inputs = tokenizer(english_sentence, return_tensors="pt").to(device)

# Generate translation
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)

# Decode the generated tokens
translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"English: {english_sentence}")
print(f"Telugu Translation: {translated_sentence}")

English: Hi, How are you?
Telugu Translation: హై, ఎలా ఉన్నావు?
