### Setup

In [1]:
!pip install transformers datasets peft torch sacrebleu --quiet

In [2]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, load_metric
import numpy as np
import torch, os

2024-08-01 03:28:48.560048: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-01 03:28:48.560192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-01 03:28:48.687336: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

In [4]:
raw_dataset = load_dataset('ai4bharat/samanantar', 'ta', split='train[:10000]', trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.06k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dataset = raw_dataset.take(10000)

In [6]:
dataset

Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 10000
})

### Prepare the dataset for the model

In [7]:
tokenized_datasets = dataset.train_test_split(test_size=0.1)
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

### Load the Model and Tokenizer

In [8]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [9]:
# Define a function to tokenize the dataset
def preprocess_function(examples):
    inputs = [ex for ex in examples['src']]
    targets = [ex for ex in examples['tgt']]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids

    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels
    return model_inputs

In [10]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=1,
    predict_with_generate=True,
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=10,               # Log every 10 steps
    generation_max_length=128,      # Set maximum length for generation
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return result

  metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

The repository for sacrebleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sacrebleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [14]:
save_directory = "/kaggle/new_model/saved_model"

os.makedirs(save_directory, exist_ok=True)

In [15]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Score,Counts,Totals,Precisions,Bp,Sys Len,Ref Len
1,2.2335,2.122425,9.092397,"[3609, 1124, 467, 222]","[10078, 9078, 8083, 7122]","[35.81067672157174, 12.381581846221636, 5.7775578374365955, 3.1171019376579614]",0.96185,10078,10470


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Trainer is attempting to log a value of "[3609, 1124, 467, 222]" of type <class 'list'> for key "eval/counts" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[10078, 9078, 8083, 7122]" of type <class 'list'> for key "eval/totals" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[35.81067672157174, 12.381581846221636, 5.7775578374365955, 3.1171019376579614]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=563, training_loss=2.2975752730564367, metrics={'train_runtime': 1386.5509, 'train_samples_per_second': 6.491, 'train_steps_per_second': 0.406, 'total_flos': 2438020988928000.0, 'train_loss': 2.2975752730564367, 'epoch': 1.0})

### Save the Finetuned Model and Tokenizer

In [16]:
# Save the trained model
trainer.save_model(save_directory)

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [17]:
# Save directory
save_directory = './finetuned-MBart50-en-ta'

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Model saved to ./finetuned-MBart50-en-ta


### Load the Finetuned model and check out the translation

In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Load your model and tokenizer
model_name =save_directory
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example English sentence to translate
english_sentence = "I am trying to translate this to another language"

# Tokenize the input sentence
inputs = tokenizer(english_sentence, return_tensors="pt").to(device)

# Generate translation
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)

# Decode the generated tokens
translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"English: {english_sentence}")
print(f"Tamil Translation: {translated_sentence}")

English: I am trying to translate this to another language
Tamil Translation: இதை வேறு மொழியில் மொழிபெயர்க்க முயற்சிக்கிறேன்
