# Setup

In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu --quiet

In [2]:
from datasets import load_dataset, load_metric
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import numpy as np
import torch, os

## Load the dataset

In [4]:
raw_dataset = load_dataset('ai4bharat/samanantar', 'ta', split='train', streaming=True, trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.06k [00:00<?, ?B/s]

In [5]:
# Take the first 100,000 rows
limited_data = raw_dataset.take(200000)

In [6]:
limited_data

IterableDataset({
    features: ['idx', 'src', 'tgt'],
    n_shards: 1
})

In [7]:
from datasets import Dataset, DatasetDict

# Convert the IterableDataset to a list
limited_data_list = list(limited_data)

# Create a Dataset from the list
limited_data = Dataset.from_list(limited_data_list)

# Create a DatasetDict
dataset_dict = DatasetDict({"train": limited_data})

# Verify the first example to ensure conversion was successful
print(dataset_dict["train"][0])


{'idx': 0, 'src': 'Some 14 months later, the second calf is born.', 'tgt': 'சுமார் 14 மாதங்கள் கழித்து, இரண்டாம் கன்றை ஈனுகிறது.'}


In [8]:
# Convert the dataset to a Pandas DataFrame
train_df = dataset_dict["train"].to_pandas()

# Rename the columns
train_df = train_df.rename(columns={"src": "en", "tgt": "ta"})

# Drop the 'idx' column if it is not needed
train_df = train_df.drop(columns=["idx"])

# Display the first few rows to verify
train_df.head()


Unnamed: 0,en,ta
0,"Some 14 months later, the second calf is born.","சுமார் 14 மாதங்கள் கழித்து, இரண்டாம் கன்றை ஈனு..."
1,"""Senior advocate Kapil Sibal, who was appearin...",‘காா்த்தி சிதம்பரம் எம். பி. யாக உள்ளதால் எங்க...
2,This photo was taken then.,அதன்போது எடுக்கப்பட்ட புகைப்படம் இது.
3,So far two rounds of the JWG meeting have been...,இதுவரை இணைப் பணிக் குழு இரண்டுகட்ட பேச்சுவார்த...
4,The life of the world is nothing but play and ...,உலக வாழ்க்கை வீணும் விளையாட்டுமேயன்றி வேறில்லை...


## Prepare the dataset for the model

In [9]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
train_df, val_df = train_test_split(train_df, test_size=0.1)

# Create Hugging Face Datasets from the DataFrames
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ta', '__index_level_0__'],
        num_rows: 180000
    })
    validation: Dataset({
        features: ['en', 'ta', '__index_level_0__'],
        num_rows: 20000
    })
})

# Load the Model and Tokenizer

In [11]:
model_name = 'Helsinki-NLP/opus-mt-en-mul'
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Define a function to tokenize the dataset
def tokenize_function(examples):
    inputs = examples['en']
    targets = examples['ta']
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, padding="max_length", max_length=128)
    return model_inputs

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Map:   0%|          | 0/180000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['en', 'ta', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 180000
    })
    validation: Dataset({
        features: ['en', 'ta', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [13]:
import torch, os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

In [14]:
# Load the model
model = MarianMTModel.from_pretrained(model_name)
model.to(device)
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    warmup_steps=500,  # Adjust warmup steps
    gradient_accumulation_steps=2,  # Adjust gradient accumulation steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True
)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define a function to compute metrics
metric = load_metric('sacrebleu', trust_remote_code=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# Create the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



2024-07-30 02:52:35.208144: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 02:52:35.208256: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 02:52:35.329916: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  metric = load_metric('sacrebleu', trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [15]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5203,0.487908,6.237019,31.65285


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


TrainOutput(global_step=5625, training_loss=0.6461056993272569, metrics={'train_runtime': 4814.6888, 'train_samples_per_second': 37.386, 'train_steps_per_second': 1.168, 'total_flos': 6101705687040000.0, 'train_loss': 0.6461056993272569, 'epoch': 1.0})

# Save the Finetuned Model and Tokenizer

In [17]:
# Save directory
save_directory = './finetuned-opusmt-en-to-ta'

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


Model saved to ./finetuned-opusmt-en-to-ta


# Load the finetuned model and check out the translation

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
# Load your model and tokenizer
model_name =save_directory
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example English sentence to translate
english_sentence = "My name is Varsha"

# Tokenize the input sentence
inputs = tokenizer(english_sentence, return_tensors="pt").to(device)

# Generate translation
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)

# Decode the generated tokens
translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"English: {english_sentence}")
print(f"Tamil Translation: {translated_sentence}")


English: My name is Varsha
Tamil Translation: என் பெயர் வர்ஷா
