In [77]:
import pandas as pd
import numpy as np
import transformers

In [117]:
import warnings
import numpy as np
import pandas as  pd

import torch
import transformers

from datasets import Dataset
from datasets import load_metric

from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [79]:

# Load the train and validation splits from their Parquet files.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/kaggle/working/Nepali-Roman-Transliteration/data/train-00000-of-00001.parquet',
        '/kaggle/working/Nepali-Roman-Transliteration/data/validation-00000-of-00001.parquet',
    )
)

In [80]:
# Stack both splits row-wise into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(
    (df_train, df_val),
    axis=0,
    ignore_index=True,
)

In [81]:
# Preview the combined frame; the rendered output shows only the first and last rows.
combined_df

Unnamed: 0,unique_identifier,native word,english word
0,nep1,मुस्कुराउँदै,muskuraundai
1,nep2,मान्दछन्,mandachhan
2,nep3,भएझैं,bhaejhain
3,nep4,हराउँछ,haraaunchha
4,nep5,मुन्टो,munto
...,...,...,...
2400213,nep2800,शीतलता,shitalta
2400214,nep2801,ट्राउजरमाथिको,trausermathiko
2400215,nep2802,शेखरजीका,shekharjika
2400216,nep2803,हवाइवेमा,hawaiwema


In [112]:
# Notebook-wide settings and the string keys shared by the helper functions.
#
# NOTE: the ENGLISH* / PORTUGUESE* names do not describe their values.
# In this notebook PORTUGUESE is the key for the source text and ENGLISH is
# the key for the target text of each translation record.
BATCH_SIZE = 8
BLEU = "bleu"
ENGLISH = "roman"  # record key holding the target text
ENGLISH_TEXT = "native word"  # dataframe column name
EPOCH = "epoch"
GEN_LEN = "gen_len"  # metric key read by compute_metrics; it was previously undefined
INPUT_IDS = "input_ids"
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
MODEL_CHECKPOINT = "GenzNepal/mt5-summarize-nepali"
LABELS = "labels"
PREFIX = ""  # text prepended to every model input
PORTUGUESE = "ne"  # record key holding the source text
PORTUGUESE_TEXT = "english word"  # dataframe column name
SCORE = "score"
SOURCE_LANG = "ne"
TARGET_LANG = "roman"
TRANSLATION = "translation"
UNNAMED_COL = "Unnamed: 0"
# Last path component of the checkpoint id, used to name the output directory.
MODEL_NAME = MODEL_CHECKPOINT.split("/")[-1]

In [83]:
# X is the "native word" column, Y is the "english word" column.
X, Y = combined_df["native word"], combined_df["english word"]

In [84]:
from sklearn.model_selection import train_test_split

In [94]:
def postprocess_text(preds: list, labels: list) -> tuple:
    """Trim surrounding whitespace from decoded predictions and labels.

    Each label is additionally wrapped in its own one-element list.
    Returns the pair ``(cleaned_predictions, wrapped_labels)``.
    """

    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def prep_data_for_model_fine_tuning(source_lang: list, target_lang: list) -> dict:
    """Pair source and target texts into the translation-record layout.

    Args:
        source_lang: Source-side texts.
        target_lang: Target-side texts, aligned by position with ``source_lang``.

    Returns:
        A dict whose only key is ``TRANSLATION``. Its value is a list with one
        ``{PORTUGUESE: source_text, ENGLISH: target_text}`` dict per pair.
        Pairing stops at the end of the shorter input.
    """

    return {
        TRANSLATION: [
            {PORTUGUESE: sr_text, ENGLISH: tr_text}
            for sr_text, tr_text in zip(source_lang, target_lang)
        ]
    }


def generate_model_ready_dataset(dataset: list, source: str, target: str,
                                 model_checkpoint: str,
                                 tokenizer: AutoTokenizer):
    """Tokenize every translation pair into a feature record for training.

    Args:
        dataset: Iterable of dicts holding the input text under ``source``
            and the reference text under ``target``.
        source: Key of the input text in each row.
        target: Key of the reference text in each row.
        model_checkpoint: Not used by this function; kept so existing
            keyword callers continue to work.
        tokenizer: Tokenizer applied to both the inputs and the targets.

    Returns:
        A list with one tokenizer output per row. Each output also carries
        the raw row under ``TRANSLATION`` and the target token ids under
        ``LABELS``.
    """

    preped_data = []

    for row in dataset:
        # No padding here: each row is encoded on its own, so per-row padding
        # would have no effect. Batches are padded later by the data collator.
        model_inputs = tokenizer(PREFIX + row[source],
                                 max_length=MAX_INPUT_LENGTH,
                                 truncation=True)

        model_inputs[TRANSLATION] = row

        # Encode the reference through `text_target` (the replacement for the
        # deprecated `as_target_tokenizer()` context manager) and truncate it
        # with the target-side limit rather than the input-side one.
        labels = tokenizer(text_target=row[target],
                           max_length=MAX_TARGET_LENGTH,
                           truncation=True)
        model_inputs[LABELS] = labels[INPUT_IDS]

        preped_data.append(model_inputs)

    return preped_data



def compute_metrics(eval_preds: tuple) -> dict:
    """Compute the sacreBLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        A dict with the BLEU score under ``BLEU`` and the mean number of
        non-pad prediction tokens under ``"gen_len"``, both rounded to
        four decimals.

    Note:
        The metric and the tokenizer are loaded on every call.
    """

    # Previously read from an undefined GEN_LEN name, which raised NameError.
    gen_len_key = "gen_len"

    metric = load_metric("sacrebleu")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded. Replace it with the
    # pad id in both arrays (predictions can be padded with -100 as well).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap each reference in a list, as sacreBLEU expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {BLEU: bleu[SCORE]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result[gen_len_key] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [89]:
# Hold out 10% of all pairs for testing ...
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    shuffle=True,
    random_state=100,
)
# ... then carve 10% of the remaining training pairs off as the validation split.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.10,
    shuffle=True,
    random_state=100,
)

In [86]:
# Report the size of the train and test splits.
for shape_label, split_series in (
    ("INITIAL X-TRAIN SHAPE: ", x_train),
    ("INITIAL Y-TRAIN SHAPE: ", y_train),
    ("X-TEST SHAPE: ", x_test),
    ("Y-TEST SHAPE: ", y_test),
):
    print(shape_label, split_series.shape)

INITIAL X-TRAIN SHAPE:  (1944176,)
INITIAL Y-TRAIN SHAPE:  (1944176,)
X-TEST SHAPE:  (240022,)
Y-TEST SHAPE:  (240022,)


In [95]:
# Load the tokenizer for MODEL_CHECKPOINT; the dataset-preparation cells below pass it on.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [96]:
# Turn each (source, target) split into its translation-record dict.
training_data, validation_data, test_data = (
    prep_data_for_model_fine_tuning(sources.values, targets.values)
    for sources, targets in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
)

In [97]:
# Tokenize the three splits with identical settings.
# NOTE: validation_data and test_data are rebound here from record dicts to
# tokenized lists, so this cell cannot be re-run on its own without first
# re-running the cell that builds the record dicts.
train_data, validation_data, test_data = (
    generate_model_ready_dataset(dataset=split_records[TRANSLATION],
                                 tokenizer=tokenizer,
                                 source=PORTUGUESE,
                                 target=ENGLISH,
                                 model_checkpoint=MODEL_CHECKPOINT)
    for split_records in (training_data, validation_data, test_data)
)



In [98]:
# Materialise each list of tokenized records as a DataFrame ...
train_df, validation_df, test_df = map(
    pd.DataFrame.from_records,
    (train_data, validation_data, test_data),
)

# ... and wrap every DataFrame in a Dataset object.
train_dataset, validation_dataset, test_dataset = map(
    Dataset.from_pandas,
    (train_df, validation_df, test_df),
)

In [122]:
# Load the tokenizer and the pretrained seq2seq model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Freeze the whole encoder so its parameters receive no gradient updates.
for param in model.encoder.parameters():
    param.requires_grad = False

# Training configuration: one epoch, evaluation at the end of each epoch,
# and generation-based predictions for the metric function.
model_args = Seq2SeqTrainingArguments(
    output_dir=f"{MODEL_NAME}-finetuned-{SOURCE_LANG}-to-{TARGET_LANG}",
    num_train_epochs=1,
    learning_rate=2e-4,
    weight_decay=0.02,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy=EPOCH,
    save_total_limit=3,
    predict_with_generate=True,
)

# Collator that batches the sequence-to-sequence examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [115]:
import torch
import time
import gc
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def clear_gpu_memory():
    """Release unreferenced tensors and return cached CUDA blocks to the driver.

    Garbage collection runs first so that memory held by unreachable objects
    (for example reference cycles) is freed before the caching allocator is
    emptied. Emptying the cache first would leave that memory reserved.
    Does nothing beyond garbage collection when CUDA is unavailable.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def wait_until_enough_gpu_memory(min_memory_available, max_retries=10, sleep_time=5):
    """Poll NVML until the current CUDA device reports enough free memory.

    Args:
        min_memory_available: Required free memory, in bytes.
        max_retries: Maximum number of free-memory checks.
        sleep_time: Seconds to wait between two consecutive checks.

    Raises:
        RuntimeError: If no check within ``max_retries`` attempts found
            ``min_memory_available`` bytes free.
    """
    # Imported locally because the notebook-level pynvml import does not include it.
    from pynvml import nvmlShutdown

    nvmlInit()
    try:
        # NOTE(review): NVML device indices presumably ignore CUDA_VISIBLE_DEVICES,
        # so this index may point at a different GPU than torch's - confirm.
        handle = nvmlDeviceGetHandleByIndex(torch.cuda.current_device())

        for attempt in range(max_retries):
            info = nvmlDeviceGetMemoryInfo(handle)
            if info.free >= min_memory_available:
                return
            # Only announce and wait when another check will actually follow.
            if attempt < max_retries - 1:
                print(f"Waiting for {min_memory_available} bytes of free GPU memory. Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
    finally:
        # Balance nvmlInit() on every exit path.
        nvmlShutdown()

    raise RuntimeError(f"Failed to acquire {min_memory_available} bytes of free GPU memory after {max_retries} retries.")

# Free what can be freed, then wait until at least 2 GiB of device memory is available.
min_memory_available = 2 * 1024 ** 3
clear_gpu_memory()
wait_until_enough_gpu_memory(min_memory_available)


In [138]:
# Wire the model, training arguments, datasets, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=model_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

# Write the fine-tuned model to the "FineTunedTransformer" directory.
trainer.save_model("FineTunedTransformer")

OutOfMemoryError: CUDA out of memory. Tried to allocate 490.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 7.17 GiB is free. Process 5792 has 7.57 GiB memory in use. Of the allocated memory 6.55 GiB is allocated by PyTorch, and 847.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [140]:
# Re-enable gradients for encoder block 0 and decoder blocks 0-6.
# Every parameter is printed before the check, and again after being unfrozen.
for name, param in model.named_parameters():
    print(name, param.requires_grad)
    if name.startswith("encoder.block.0.layer") or any(
        name.startswith(f"decoder.block.{block_idx}.layer") for block_idx in range(7)
    ):
        param.requires_grad = True
        print(name, param.requires_grad)


shared.weight False
encoder.block.0.layer.0.SelfAttention.q.weight False
encoder.block.0.layer.0.SelfAttention.q.weight True
encoder.block.0.layer.0.SelfAttention.k.weight False
encoder.block.0.layer.0.SelfAttention.k.weight True
encoder.block.0.layer.0.SelfAttention.v.weight False
encoder.block.0.layer.0.SelfAttention.v.weight True
encoder.block.0.layer.0.SelfAttention.o.weight False
encoder.block.0.layer.0.SelfAttention.o.weight True
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight False
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight True
encoder.block.0.layer.0.layer_norm.weight False
encoder.block.0.layer.0.layer_norm.weight True
encoder.block.0.layer.1.DenseReluDense.wi_0.weight False
encoder.block.0.layer.1.DenseReluDense.wi_0.weight True
encoder.block.0.layer.1.DenseReluDense.wi_1.weight False
encoder.block.0.layer.1.DenseReluDense.wi_1.weight True
encoder.block.0.layer.1.DenseReluDense.wo.weight False
encoder.block.0.layer.1.DenseReluDe

In [137]:

# Disable gradients for encoder block 0 and decoder blocks 0-6.
# Every parameter is printed before the check, and again after being frozen.
for name, param in model.named_parameters():
    print(name, param.requires_grad)
    if name.startswith("encoder.block.0.layer") or any(
        name.startswith(f"decoder.block.{block_idx}.layer") for block_idx in range(0, 7)
    ):
        param.requires_grad = False
        print(name, param.requires_grad)
    


shared.weight False
encoder.block.0.layer.0.SelfAttention.q.weight False
encoder.block.0.layer.0.SelfAttention.q.weight False
encoder.block.0.layer.0.SelfAttention.k.weight False
encoder.block.0.layer.0.SelfAttention.k.weight False
encoder.block.0.layer.0.SelfAttention.v.weight False
encoder.block.0.layer.0.SelfAttention.v.weight False
encoder.block.0.layer.0.SelfAttention.o.weight False
encoder.block.0.layer.0.SelfAttention.o.weight False
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight False
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight False
encoder.block.0.layer.0.layer_norm.weight False
encoder.block.0.layer.0.layer_norm.weight False
encoder.block.0.layer.1.DenseReluDense.wi_0.weight False
encoder.block.0.layer.1.DenseReluDense.wi_0.weight False
encoder.block.0.layer.1.DenseReluDense.wi_1.weight False
encoder.block.0.layer.1.DenseReluDense.wi_1.weight False
encoder.block.0.layer.1.DenseReluDense.wo.weight False
encoder.block.0.layer.1.Den