In [1]:
import numpy as np
import pandas as pd

In [2]:
def read(path):
    df=pd.read_csv(path,delimiter='\t',header=None)
    df.rename(columns={0:'image_id',1:'x',2:'y',3:'width',4:'height',5:'english',6:'bengali'},inplace=True)
    df.drop(columns=['x','y','width','height'],axis=1,inplace=True)
    return df

In [3]:

train_path=r'https://raw.githubusercontent.com/Sk4467/datasets/main/bengali/bengali-visual-genome-train.txt'
test_path=r'https://raw.githubusercontent.com/Sk4467/datasets/main/bengali/bengali-visual-genome-test.txt'

In [4]:
train=read(train_path)
test=read(test_path)

In [5]:
import datasets
from datasets import Dataset, DatasetDict
train_dataset = datasets.Dataset.from_pandas(train)
train_data = datasets.DatasetDict({'train':train_dataset})

In [6]:
print(type(train_data))
print(train_data.shape)
print(train_data)

<class 'datasets.dataset_dict.DatasetDict'>
{'train': (28930, 3)}
DatasetDict({
    train: Dataset({
        features: ['image_id', 'english', 'bengali'],
        num_rows: 28930
    })
})


In [7]:
test_dataset = datasets.Dataset.from_pandas(test)
test_data = datasets.DatasetDict({'test':test_dataset})
print(type(test_data))
print(test_data.shape)
print(test_data)

<class 'datasets.dataset_dict.DatasetDict'>
{'test': (1595, 3)}
DatasetDict({
    test: Dataset({
        features: ['image_id', 'english', 'bengali'],
        num_rows: 1595
    })
})


In [8]:
from transformers import AutoTokenizer

checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
# prefix="translate English to hindi : "
def preprocess(data):

    inputs=[text for text in data['english']]
    target=[text for text in data['bengali']]

    model_inputs = {
        'id': data['image_id'] ,
        'translation': target,  # assign 'translation' directly to 'target'
        'input_ids': [],
        'attention_mask': [],
        'labels' : []
    }

    try:
        # Tokenize all inputs and targets at once, which is more efficient
        tokenized_samples = tokenizer(inputs, max_length=128, truncation=True)
        labels= tokenizer(target,max_length=128,truncation=True)

        # Assign the tokenized samples directly to 'input_ids', 'attention_mask', and 'labels'
        model_inputs['input_ids'] = tokenized_samples['input_ids']
        model_inputs['attention_mask'] = tokenized_samples['attention_mask']
        model_inputs['labels'] = labels['input_ids']  
        
    except Exception as e:
        print(f"Error occurred: {e}")

    return model_inputs

In [10]:
tokenized_train_data=train_data.map(preprocess,batched=True)
tokenized_test_data=test_data.map(preprocess,batched=True)

Map:   0%|          | 0/28930 [00:00<?, ? examples/s]

Map:   0%|          | 0/1595 [00:00<?, ? examples/s]

In [11]:
print(tokenized_train_data)

DatasetDict({
    train: Dataset({
        features: ['image_id', 'english', 'bengali', 'id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 28930
    })
})


In [12]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model='facebook/nllb-200-distilled-600M')

In [13]:
import evaluate

metric = evaluate.load("sacrebleu")

In [15]:
import numpy as np
def postprocess(preds,labels):
    preds=[pred.strip() for pred in preds]
    labels=[[label.strip()] for label in labels]
    return preds,labels


def compute_metrics(eval_preds):
    preds,labels=eval_preds
    if isinstance(preds,tuple):
        preds=preds[0]
    decoded_preds=tokenizer.batch_decode(preds,skip_special_tokens=True)
        
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels,skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained('facebook/nllb-200-distilled-600M')

In [17]:
# import os

# # set the wandb project where this run will be logged
# os.environ["WANDB_PROJECT"]="en-ben_NLLB_fine-tune-1"

# # save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"]="true"

# # turn off watch to log faster
# os.environ["WANDB_WATCH"]="false"

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tune_model",
    # report_to="wandb",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    eval_accumulation_steps=50,
    gradient_accumulation_steps=4,
    predict_with_generate=True,
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data['train'],
    eval_dataset=tokenized_test_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33miftesha1[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,0.6714,0.50445,45.7779,10.8527
1,0.3879,0.480756,47.9451,10.8853
2,0.336,0.490055,47.8033,10.8708


In [None]:
trainer.evalaute()

In [None]:
trainer.save_model('/home/jupyter/notebooks/notebook/fine-tune/')

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
path=r'https://raw.githubusercontent.com/Sk4467/datasets/main/bengali/bengali-visual-genome-test.txt'
pred=read(path)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import corpus_bleu

model_name = '/home/jupyter/notebooks/notebook/fine-tune/'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")


In [None]:
#encoding the inputs test sentences
encoded_inputs=tokenizer(test_sentences,padding=True,truncation=True,return_tensors='pt').to(torch.device("cuda"))


In [None]:
with torch.no_grad():
    outputs=model.generate(
    input_ids=encoded_inputs['input_ids'],
    attention_mask=encoded_inputs['attention_mask'],
    max_length=128
    )

In [None]:
translated_sentences=tokenizera.batch_decode(outputs,skip_special_tokens=True)

In [None]:
import nltk
from nltk.translate.bleu_score import corpus_bleu,sentence_bleu

In [None]:
actual_translations=pred['hindi'].to_list()
predicted_translations=translated_sentences

In [None]:
len(predicted_translations)

In [None]:
actual_tokenized=[[nltk.word_tokenize(sentence)] for sentence in actual_translations]
predicted_tokenized=[nltk.word_tokenize(sentence) for sentence in predicted_translations]

In [None]:

bleu_score=corpus_bleu(actual_tokenized,predicted_tokenized,weights=(1,0,0,0))
print(f"BLEU-score: {bleu_score}")

In [None]:
def write_to_file(sentences,path):
    with open(path,'w',encoding='utf-8') as file:
        for sentence in sentences:
            file.write(sentence+'\n')

In [None]:
path='ben_translation_eval_test.txt'
write_to_file(translated_sentences,path)