# Setup

In [1]:
!pip install datasets transformers torch accelerate sacremoses sacrebleu tmx --quiet


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer
import numpy as np
import torch, os

2024-07-30 21:17:52.586278: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 21:17:52.586419: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 21:17:52.714107: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Extract the data and add it to DataFrame

In [3]:
# Read the TMX file
tmx_file = '/kaggle/input/tmx-gz/en-gu.tmx'
with open(tmx_file, 'r', encoding='utf-8') as f:
    tmx_content = f.read()

# Parse the TMX content
soup = BeautifulSoup(tmx_content, 'lxml')

# Extract English and Gujarati translations
translations = []
for tu in soup.find_all('tu'):
    en_text = None
    gu_text = None
    for tuv in tu.find_all('tuv'):
        lang = tuv['xml:lang']
        seg = tuv.find('seg').text
        if lang == 'en':
            en_text = seg
        elif lang == 'gu':
            gu_text = seg
    if en_text and gu_text:
        translations.append({'English': en_text, 'Gujarati': gu_text})

# Create DataFrame
df = pd.DataFrame(translations)

df.head()




Unnamed: 0,English,Gujarati
0,Anna Hazare has heavily criticized the Prime m...,અન્ના હજારેએ વડાપ્રધાન પર પ્રહાર કરતા કહ્યું ક...
1,Four years have passed but the government is a...,પરંતુ ચાર વર્ષ વીતી ગયા પણ સરકાર કોઈ ના કોઈ કા...
2,"He added that regarding the Lokpal election, h...",તેમણે કહ્યું કે તે લોકપાલ નિયુક્તને લઈને બીજી ...
3,Anna Hazare wrote a letter to PM Modi on Thurs...,અન્ના હજારેએ વડાપ્રધાન મોદીને ગુરુવારે પત્ર લખ...
4,"Anna wrote that on 16th August 2011, for the e...","અન્નાએ લખ્યું કે, લોકપાલ અને લોકાયુક્તની નિયુક..."


## Convert Dataset to Model Suitable Format

In [4]:
train_df, val_df = train_test_split(df, test_size=0.1)
# Create Hugging Face Datasets from the DataFrames
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'Gujarati', '__index_level_0__'],
        num_rows: 3610
    })
    validation: Dataset({
        features: ['English', 'Gujarati', '__index_level_0__'],
        num_rows: 402
    })
})

# Load the Model and Tokenizer

In [6]:
model_name = '/kaggle/input/finetuned-opusmt-en-to-hi-model'
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Define a function to tokenize the dataset
def tokenize_function(examples):
    inputs = examples['English']
    targets = examples['Gujarati']
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, padding="max_length", max_length=128)
    return model_inputs

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

In [7]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['English', 'Gujarati', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3610
    })
    validation: Dataset({
        features: ['English', 'Gujarati', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 402
    })
})

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

## Training

In [9]:
# Load the model
model = MarianMTModel.from_pretrained(model_name)
model.to(device)
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    warmup_steps=500,  # Adjust warmup steps
    gradient_accumulation_steps=2,  # Adjust gradient accumulation steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True
)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define a function to compute metrics
metric = load_metric('sacrebleu', trust_remote_code=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# Create the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  metric = load_metric('sacrebleu', trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [10]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,No log,4.703207,0.235085,52.965174
2,No log,0.844223,7.632196,65.669154
4,No log,0.750255,9.466346,65.522388
6,No log,0.696945,10.809743,65.838308
8,1.616900,0.657723,12.827876,66.626866
9,1.616900,0.650156,13.094781,65.557214


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


TrainOutput(global_step=560, training_loss=1.51349926676069, metrics={'train_runtime': 1211.3406, 'train_samples_per_second': 29.802, 'train_steps_per_second': 0.462, 'total_flos': 1213086887313408.0, 'train_loss': 1.51349926676069, 'epoch': 9.91150442477876})

In [11]:
# Save directory
save_directory = './finetuned-opusmt-en-hi-gu'

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


Model saved to ./finetuned-opusmt-en-hi-gu


# Check out Translations of the Finetuned Model

In [13]:
# Load your model and tokenizer
model_name =save_directory
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example English sentence to translate
english_sentence = "The thief ran away from the scene"

# Tokenize the input sentence
inputs = tokenizer(english_sentence, return_tensors="pt").to(device)

# Generate translation
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)

# Decode the generated tokens
translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"English: {english_sentence}")
print(f"Gujarati Translation: {translated_sentence}")

English: The thief ran away from the scene
Gujarati Translation: ચોર સ્થિતિથી ભાગી ગયો
