In [1]:
import numpy as np
import pandas as pd

In [2]:
def read(path):
    """Load one split of the parallel corpus into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts (local path, URL, open buffer).

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``src``/``tgt`` renamed to ``english``/``odia``
        and the leftover ``Unnamed: 0.1`` / ``Unnamed: 0`` index columns
        removed when they are present.
    """
    raw = pd.read_csv(path)
    # Chain instead of mutating in place, and tolerate files that were saved
    # without one (or both) of the stray index columns.
    return (
        raw.rename(columns={'src': 'english', 'tgt': 'odia'})
           .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
    )

In [3]:
# The three CSV splits live side by side in one directory of the OdiaGenAI repo,
# so the shared prefix is spelled out once and each split only names its file.
_DATA_ROOT = 'https://media.githubusercontent.com/media/OdiaGenAI/OdiaGenAI_Interns_2023/main/data-preparation'
train_path = f'{_DATA_ROOT}/EN_OD_TRAIN.csv'
test_path = f'{_DATA_ROOT}/EN_OD_TEST.csv'
validation_path = f'{_DATA_ROOT}/EN_OD_VALIDATION.csv'

In [4]:
# Load the three splits in train / test / validation order with the shared reader.
train, test, validation = (
    read(split_path) for split_path in (train_path, test_path, validation_path)
)

In [8]:
# Install the Hugging Face stack, pinned to the versions this notebook was last
# run with so a fresh runtime resolves the same packages. `%pip` installs into
# the environment of the running kernel.
%pip install -q datasets==2.14.4 transformers==4.32.0 evaluate==0.4.0 sacrebleu==2.3.1

# If building the trainer later fails with an `accelerate` ImportError, also run
# `%pip install -U accelerate` and restart the runtime before continuing.

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB

In [9]:
import datasets
from datasets import Dataset, DatasetDict
# Wrap every pandas split as a Hugging Face `Dataset`, then place each one in
# its own single-entry `DatasetDict` keyed by the split name.
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
validation_dataset = Dataset.from_pandas(validation)

train_data = DatasetDict({'train': train_dataset})
test_data = DatasetDict({'test': test_dataset})
validation_data = DatasetDict({'validation': validation_dataset})

In [10]:
# Sanity check of the validation wrapper: its type, its shape, then its repr.
for summary in (type(validation_data), validation_data.shape, validation_data):
    print(summary)

<class 'datasets.dataset_dict.DatasetDict'>
{'validation': (108092, 3)}
DatasetDict({
    validation: Dataset({
        features: ['idx', 'english', 'odia'],
        num_rows: 108092
    })
})


In [11]:
from transformers import AutoTokenizer

# Single source of truth for the pretrained checkpoint used in this notebook.
checkpoint = "facebook/nllb-200-distilled-600M"

# NLLB marks every sequence with a language code. State both sides explicitly
# (English source, Odia target) rather than relying on the tokenizer defaults,
# which leave the target language unset.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    src_lang="eng_Latn",
    tgt_lang="ory_Orya",
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [12]:
def _fill_blank(text, placeholder='<NONE>'):
    """Return ``text`` unchanged unless it is missing or whitespace-only."""
    if isinstance(text, str) and text.strip() != '':
        return text
    return placeholder


def preprocess(data, max_length=512, src_lang='eng_Latn', tgt_lang='ory_Orya'):
    """Tokenize one batch of English -> Odia sentence pairs for seq2seq training.

    Meant to be used with ``Dataset.map(preprocess, batched=True)``.

    Parameters
    ----------
    data : mapping
        Batch with the columns ``idx``, ``english`` and ``odia`` (each a list).
    max_length : int, default 512
        Sequences longer than this many tokens are truncated.
    src_lang, tgt_lang : str
        NLLB language codes applied to the inputs and to the labels.

    Returns
    -------
    dict
        ``id`` (copied from ``idx``), ``translation`` (cleaned Odia text),
        ``input_ids``, ``attention_mask`` and ``labels``; every value has one
        entry per row of the batch.

    Raises
    ------
    ValueError
        If the tokenizer returns a different number of rows than it was given.
    """
    # Missing or blank sentences are replaced by a placeholder so that the
    # batch keeps its row count.
    inputs = [_fill_blank(text) for text in data['english']]
    target = [_fill_blank(text) for text in data['odia']]

    # Make sure the source/target language codes are the intended ones; only
    # touch the shared tokenizer when its current setting differs.
    if getattr(tokenizer, 'src_lang', None) != src_lang:
        tokenizer.src_lang = src_lang
    if getattr(tokenizer, 'tgt_lang', None) != tgt_lang:
        tokenizer.tgt_lang = tgt_lang

    # `text_target` tokenizes the Odia side in target mode, so the labels carry
    # the target language code instead of the source one. No padding here:
    # padding each batch is left to the data collator, which pads labels with
    # -100 so padded positions are ignored by the loss.
    tokenized_samples = tokenizer(
        inputs,
        text_target=target,
        max_length=max_length,
        truncation=True,
    )

    if len(tokenized_samples['input_ids']) != len(data['idx']):
        raise ValueError(
            f"Inconsistent batch sizes: {len(tokenized_samples['input_ids'])} and {len(data['idx'])}"
        )

    return {
        'id': data['idx'],
        'translation': target,
        'input_ids': tokenized_samples['input_ids'],
        'attention_mask': tokenized_samples['attention_mask'],
        'labels': tokenized_samples['labels'],
    }

In [None]:
# validation_data['validation']['odia']

In [13]:
# Tokenize the training split in batches; the result is a single `Dataset`
# (not a `DatasetDict`) that keeps the original columns next to the new ones.
tokenized_train_data=train_data['train'].map(preprocess,batched=True)

Map:   0%|          | 0/756644 [00:00<?, ? examples/s]

In [None]:
# from datasets import load_dataset

# def find_none_rows(batch):
#     none_indices_english = [i for i, text in enumerate(batch['english']) if text is None]
#     none_indices_odia = [i for i, text in enumerate(batch['odia']) if text is None]

#     if none_indices_english:
#         print("Found None in English column for these indices:")
#         for i in none_indices_english:
#             print(f"  Index: {i}, ID: {batch['idx'][i]}, English: {batch['english'][i]}, Odia: {batch['odia'][i]}")

#     if none_indices_odia:
#         print("Found None in Odia column for these indices:")
#         for i in none_indices_odia:
#             print(f"  Index: {i}, ID: {batch['idx'][i]}, English: {batch['english'][i]}, Odia: {batch['odia'][i]}")


# train_data['train'].map(find_none_rows, batched=True)


In [14]:
# NOTE(review): despite the `test` in its name, this variable holds the
# tokenized *validation* split; it is what the trainer later evaluates on.
tokenized_test_data=validation_data['validation'].map(preprocess,batched=True)

Map:   0%|          | 0/108092 [00:00<?, ? examples/s]

In [15]:
# Show the columns and row count of the tokenized training split.
print(tokenized_train_data)

Dataset({
    features: ['idx', 'english', 'odia', 'id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 756644
})


In [16]:
from transformers import DataCollatorForSeq2Seq

# Pads every batch to its longest sequence at collation time. The `model`
# argument is meant to receive a model object, not a checkpoint name; the
# string that used to be passed here was silently ignored, so it is dropped.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [17]:
import evaluate

# Load the sacreBLEU metric through `evaluate`; `compute_metrics` reports its
# "score" field under the key "bleu".
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [18]:
import numpy as np
def postprocess(preds,labels):
    """Trim whitespace from decoded text and shape it for the BLEU metric.

    Returns a tuple ``(predictions, references)`` where ``predictions`` is a
    flat list of stripped strings and ``references`` holds one single-item
    list per stripped label.
    """
    cleaned_predictions = []
    for prediction in preds:
        cleaned_predictions.append(prediction.strip())

    wrapped_references = []
    for reference in labels:
        wrapped_references.append([reference.strip()])

    return cleaned_predictions, wrapped_references


def compute_metrics(eval_preds):
    """Turn generated token ids into BLEU and average generation length.

    Parameters
    ----------
    eval_preds : tuple
        ``(predictions, labels)`` as token-id arrays; ``predictions`` may
        itself be a tuple, in which case its first element is used.

    Returns
    -------
    dict
        ``{"bleu": ..., "gen_len": ...}``, both rounded to four decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding marker, not a vocabulary id. It must be swapped for the
    # real pad token on BOTH arrays before decoding, otherwise it cannot be
    # decoded and would also be counted as generated content below.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generation length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the model from the same `checkpoint` the tokenizer was built from, so
# the two cannot drift apart if the checkpoint name is ever changed.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

In [None]:
# import os
# # set the wandb project where this run will be logged
# os.environ["WANDB_PROJECT"]="en-hin_NLLB_fine-tune-1"

# # save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"]="true"

# # turn off watch to log faster
# os.environ["WANDB_WATCH"]="false"

In [None]:
# Hyper-parameters of the fine-tuning run, gathered in one mapping so they can
# be inspected or tweaked before the arguments object is built.
# NOTE(review): the output directory name mentions Hindi although the data in
# this notebook is English/Odia - confirm whether that is intended.
training_config = dict(
    output_dir="fine_tune_model_hindi",
    # report_to="wandb",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=50,
    save_total_limit=3,
    predict_with_generate=True,
    # push_to_hub=True,
)

training_args = Seq2SeqTrainingArguments(**training_config)

# `tokenized_train_data` and `tokenized_test_data` are already single `Dataset`
# objects (each is the result of `.map` on one split), so they are passed as
# they are. Subscripting them with a split name would be treated as a column
# lookup and fail.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,  # holds the tokenized validation split
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Start fine-tuning with the model, data and arguments held by `trainer`.
trainer.train()

In [None]:
# Run evaluation with the trainer; as the last expression of the cell, the
# call's return value is shown as the cell output.
trainer.evaluate()

In [None]:
# Save the final model into the run's output directory rather than into a
# directory literally named after a placeholder prompt. Point MODEL_SAVE_DIR
# somewhere else to store the model in a different location.
MODEL_SAVE_DIR = trainer.args.output_dir
trainer.save_model(MODEL_SAVE_DIR)