<a href="https://colab.research.google.com/github/Nid989/Isometric-Multi-task-NMT/blob/main/finetune_it_en_OPUS_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
%%capture
# Install the third-party packages this notebook imports (needed on Colab).
# %%capture hides the installer output of this cell.
!pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece] wandb boto3 --quiet 
!pip install -U nltk # upgrade current version of NLTK

In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader
import torch
from datasets import load_dataset, load_metric
import numpy as np
import datasets
import boto3
import os
import shutil
import random
# from tqdm.notebook import tqdm
from tqdm import tqdm 
import wandb
import logging
import pandas as pd 

In [30]:
# for logging loss to wandb.ai
# SECURITY: the API key must not be committed with the notebook. It is read
# from the WANDB_API_KEY environment variable; when the variable is unset,
# key=None lets wandb.login() fall back to its own credential lookup / prompt.
# NOTE(review): the key that used to be hard-coded here should be revoked.
access_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=access_key)



True

In [31]:
# data processing
# Only the first 50 rows of each split are requested (see the "[:50]" slices).
raw_train_datasets = load_dataset("enimai/MuST-C-it", split="train[:50]")
raw_validation_datasets = load_dataset("enimai/MuST-C-it", split="validation[:50]")
# Metric objects consumed later by compute_metrics.
sacrebleu = load_metric("sacrebleu")
meteor = load_metric('meteor')

Using custom data configuration enimai--MuST-C-it-7022eab0bf68926b
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/enimai--MuST-C-it-7022eab0bf68926b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
Using custom data configuration enimai--MuST-C-it-7022eab0bf68926b
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/enimai--MuST-C-it-7022eab0bf68926b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [32]:
# dataset description: print the column names and row count of both splits
print(f"train: {raw_train_datasets}")
print(f"validation: {raw_validation_datasets}")

train: Dataset({
    features: ['en', 'it'],
    num_rows: 50
})
validation: Dataset({
    features: ['en', 'it'],
    num_rows: 50
})


In [33]:
# Dataset column names used as translation source and target
# (they index `examples[...]` in the preprocessing functions).
source_lang = "en"
target_lang = "it"

In [34]:
# Hub identifier of the pre-trained checkpoint that gets fine-tuned
train_model_checkpoints = "Helsinki-NLP/opus-mt-en-it"

In [None]:
# load the MarianMT tokenizer
# The previous call passed src_lang="en_XX" / tgt_lang="de_DE": those are
# mBART-style language codes (and the target was German), which do not describe
# this en->it checkpoint. The language pair is fixed by the checkpoint itself,
# so no language arguments are passed.
# NOTE(review): the removed kwargs appear to be ignored by the Marian tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(train_model_checkpoints)

In [36]:
def add_verbosity(input_list, target_list):
  """
  Prefix every source sequence with a length tag derived from its target.

  The tag depends on the character-length ratio len(target) / len(source):
    ratio <  0.95          -> "short"
    0.95 <= ratio <= 1.10  -> "normal"
    ratio >  1.10          -> "long"

  input: list of source & target sequences (paired by position; zip() stops
         at the shorter list)
  output: list of "<tag> <source>" strings, one per pair

  An empty source no longer raises ZeroDivisionError: it is tagged "normal"
  when the target is empty as well, and "long" otherwise.
  """
  processed_input = []
  # `source` instead of `input` so the builtin input() is not shadowed
  for source, target in zip(input_list, target_list):
    if len(source) > 0:
      ts_ratio = len(target) / len(source)
    else:
      # any non-empty target is infinitely longer than an empty source
      ts_ratio = float("inf") if len(target) > 0 else 1.0
    if ts_ratio < 0.95:
      prefix = "short"
    elif ts_ratio <= 1.10:  # lower bound is implied by the branch above
      prefix = "normal"
    else:
      prefix = "long"
    processed_input.append(prefix + " " + source)
  return processed_input

In [37]:
# preprocess MUST-C dataset
# token limits applied (with truncation) to the source and target sides
max_input_length = 128
max_target_length = 128
def preprocess_function(examples):
    """Tokenize one batch of training examples.

    Source sentences are first tagged by add_verbosity (the tag depends on the
    paired target), then tokenized; the target sentences are tokenized in
    target mode and stored under "labels".

    examples: batch mapping with `source_lang` and `target_lang` columns
    returns:  the source encoding with an added "labels" entry
    """
    target_texts = examples[target_lang]
    prompted_sources = add_verbosity(examples[source_lang], target_texts)  # append appropriate prompts
    model_inputs = tokenizer(prompted_sources, max_length=max_input_length, truncation=True)
    # switch the tokenizer to its target-side mode for the labels
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# tokenize raw data
tokenized_train_datasets = raw_train_datasets.map(preprocess_function, batched=True)
# Fix: build the validation set from the validation split. It was previously
# mapped from raw_train_datasets, so evaluation metrics were computed on
# training data.
tokenized_validation_datasets = raw_validation_datasets.map(preprocess_function, batched=True)

In [None]:
# training procedure: load the pre-trained seq2seq model that will be fine-tuned
model = AutoModelForSeq2SeqLM.from_pretrained(train_model_checkpoints)

In [40]:
# training hyper-parameters
batch_size = 32  # change batch-size according to GPU availability
model_name = train_model_checkpoints.rsplit("/", 1)[-1]  # last path component of the hub id
epoch = 3

# define training model arguments
# (the output directory name is re-derived later when the checkpoints are archived)
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}-testing",
    # schedule / optimizer
    num_train_epochs=epoch,
    optim="adafactor",
    learning_rate=0.0003,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # checkpointing / evaluation once per epoch, keeping a single checkpoint
    save_strategy="epoch",
    evaluation_strategy="epoch",
    save_total_limit=1,
    predict_with_generate=True,
    report_to="wandb",
)

# initialize data-collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

PyTorch: setting up devices


In [41]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    preds:  iterable of predicted strings
    labels: iterable of reference strings
    returns: (list of stripped predictions,
              list of one-element lists, each holding a stripped reference)
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())
    wrapped_labels = []
    for text in labels:
        # each reference is wrapped in its own list
        wrapped_labels.append([text.strip()])
    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Compute BLEU, METEOR and mean generated length for one evaluation pass.

    eval_preds: (predictions, labels) pair of token-id arrays; when the
                predictions come as a tuple only its first element is used.
    returns:    dict with "bleu", "meteor" and "gen_len", rounded to 4 decimals
                (the dict is also printed).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is not a real token id and cannot be decoded; replace it with the
    # pad id in the predictions too (NOTE(review): whether generated ids can
    # contain -100 depends on the installed transformers version).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # sacrebleu takes a list of references per prediction
    sacrebleu_result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    # There is exactly one reference per prediction, so METEOR is given flat
    # strings (presumably required by metric versions that only accept a single
    # string reference — TODO confirm against the installed version).
    flat_labels = [refs[0] for refs in decoded_labels]
    meteor_result = meteor.compute(predictions=decoded_preds, references=flat_labels)
    result = {
        "bleu": sacrebleu_result["score"],
        "meteor": meteor_result["meteor"]
    }
    # generated length = number of non-padding ids in each prediction
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    print(result)
    return result

In [15]:
# initialize the trainer module
# (all arguments passed by keyword; model and args were positional before)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_validation_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# train the model (fine-tuning run configured by `args`)
trainer.train()

In [None]:
# compress model checkpoint directory
# The name repeats the output directory given to the training arguments;
# the archive is written next to it as "<directory>.zip".
model_checkpoint_directory = f"{model_name}-finetuned-{source_lang}-to-{target_lang}-testing"
print(model_checkpoint_directory)
shutil.make_archive(
    base_name=model_checkpoint_directory,
    format="zip",
    root_dir=model_checkpoint_directory,
)

In [None]:
# upload the zipped checkpoint directory to S3
# SECURITY: AWS credentials must not be hard-coded in the notebook. With no
# explicit keys, boto3 resolves them through its default credential chain
# (e.g. the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables).
# NOTE(review): the key pair that used to be committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
# object key is prefixed with the number of training epochs
key = f"{epoch}_{model_checkpoint_directory}"
filename = f"{model_checkpoint_directory}.zip"
print(key)
s3.meta.client.upload_file(Bucket='tsd2022', Key=key, Filename=filename)

In [45]:
# delete checkpoint directory (the local, un-zipped training output)
current_directory = os.getcwd()
path_to_directory = os.path.join(current_directory, model_checkpoint_directory)
shutil.rmtree(path_to_directory)

In [22]:
# delete zip file (the local copy of the checkpoint archive)
current_directory = os.getcwd()
path_to_zip_file = os.path.join(current_directory, filename)
os.remove(path_to_zip_file)

----

In [23]:
# evaluate model: load the first 50 rows of the test split
raw_test_datasets = load_dataset("enimai/MuST-C-it", split="test[:50]")

Using custom data configuration enimai--MuST-C-it-7022eab0bf68926b
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/enimai--MuST-C-it-7022eab0bf68926b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


In [44]:
# download the zipped checkpoint directory from S3
# SECURITY: AWS credentials must not be hard-coded in the notebook. With no
# explicit keys, boto3 resolves them through its default credential chain
# (e.g. the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables).
# NOTE(review): the key pair that used to be committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
# same object key / local file name as used for the upload
key = f"{epoch}_{model_checkpoint_directory}"
filename = f"{model_checkpoint_directory}.zip"
print(key)
s3.meta.client.download_file(Bucket='tsd2022', Key=key, Filename=filename)

In [48]:
# unpack the downloaded checkpoint archive into "<model_checkpoint_directory>/"
current_directory = os.getcwd()
path_to_zipfile = os.path.join(current_directory, f"{model_checkpoint_directory}.zip")
path_to_output_directory = os.path.join(current_directory, f"{model_checkpoint_directory}/")
shutil.unpack_archive(path_to_zipfile, path_to_output_directory)

In [52]:
# fine-tuned model checkpoint used for evaluation
# os.listdir() returns entries in arbitrary order and the unpacked directory
# may hold entries that are not checkpoints, so taking element [0] is fragile.
# Select the "checkpoint-<step>" entry with the highest step instead.
# NOTE(review): assumes the trainer names checkpoint folders "checkpoint-<step>" — confirm.
checkpoint_names = [
    name for name in os.listdir(path_to_output_directory)
    if name.startswith("checkpoint-") and name.rsplit("-", 1)[-1].isdigit()
]
if not checkpoint_names:
    raise FileNotFoundError(
        f"no 'checkpoint-<step>' entry found in {path_to_output_directory}"
    )
latest_checkpoint_name = max(checkpoint_names, key=lambda name: int(name.rsplit("-", 1)[-1]))
evaluation_model_checkpoint = os.path.join(path_to_output_directory, latest_checkpoint_name)

In [None]:
# load the MarianMT tokenizer stored with the fine-tuned checkpoint
# (rebinds the global `tokenizer` used by the preprocessing functions)
tokenizer = AutoTokenizer.from_pretrained(evaluation_model_checkpoint)

In [59]:
def add_verbosity_eval(input_list, target_list):
  """
  Prefix every source sequence with the fixed tag "normal".

  input: list of source & target sequences (paired by position; zip() stops
         at the shorter list). The targets never influence the tag.
  output: list of "normal <source>" strings, one per pair

  The length ratio that used to be computed here was never used and raised
  ZeroDivisionError for an empty source, so it has been removed.
  """
  prefix = "normal"
  # `source` instead of `input` so the builtin input() is not shadowed;
  # zip() is kept so the output length still follows the shorter list
  return [prefix + " " + source for source, _ in zip(input_list, target_list)]

In [60]:
# preprocess MUST-C dataset
def preprocess_test_function(examples):
    """Tokenize one batch of test examples.

    Source sentences are tagged by add_verbosity_eval before tokenization;
    the target sentences are tokenized in target mode and stored under "labels".

    examples: batch mapping with `source_lang` and `target_lang` columns
    returns:  the source encoding with an added "labels" entry
    """
    target_texts = examples[target_lang]
    prompted_sources = add_verbosity_eval(examples[source_lang], target_texts)  # append appropriate prompts
    model_inputs = tokenizer(prompted_sources, max_length=max_input_length, truncation=True)
    # switch the tokenizer to its target-side mode for the labels
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [62]:
# tokenize raw data
# Fix: use the test-time preprocessing. preprocess_function picks the
# verbosity tag from the reference translation's length, which must not be
# available at test time; preprocess_test_function applies the fixed tag.
tokenized_test_dataset = raw_test_datasets.map(preprocess_test_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# load the model weights from the checkpoint selected for evaluation
model = AutoModelForSeq2SeqLM.from_pretrained(evaluation_model_checkpoint)

In [64]:
# iterate over the test set one example at a time, loading in the main process
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, num_workers=0)

In [83]:
# generate model prediction
# Fix: the model was fine-tuned on "<tag> <sentence>" inputs truncated to
# max_input_length, but generation was fed the raw source text. The sources
# are now tagged with add_verbosity_eval and truncated the same way.
predictions = []
# total is the number of batches, so the bar stays correct for any batch size
for batch in tqdm(test_dataloader, total=len(test_dataloader)):
  prompted_sources = add_verbosity_eval(batch[source_lang], batch[target_lang])
  encoded = tokenizer(
      prompted_sources,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=max_input_length,
  )
  translated = model.generate(**encoded)
  predictions.extend([tokenizer.decode(t, skip_special_tokens=True) for t in translated])

# collect the untouched source / reference sentences for the report
test_source, test_target = [], []
for instance in tqdm(raw_test_datasets, total=raw_test_datasets.shape[0]):
  test_source.append(instance[source_lang])
  test_target.append(instance[target_lang])

# generate output prediction dataframe
df = pd.DataFrame({
    source_lang: test_source,
    target_lang: test_target,
    'pred': predictions
})

100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


In [88]:
# write the predictions dataframe to a CSV file (without the index column)
path_to_prediction_file = os.path.join(current_directory, f"{model_name}-finetuned-{source_lang}-to-{target_lang}-predictions.csv")
df.to_csv(path_to_prediction_file, index=False)

In [89]:
# upload to s3
# SECURITY: AWS credentials must not be hard-coded in the notebook. With no
# explicit keys, boto3 resolves them through its default credential chain
# (e.g. the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables).
# NOTE(review): the key pair that used to be committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
key = f"{model_name}-finetuned-{source_lang}-to-{target_lang}-predictions"
filename = path_to_prediction_file
print(key)
# the upload itself is left disabled, as in the original cell
# s3.meta.client.upload_file(Bucket='tsd2022', Key=key, Filename=filename)

opus-mt-en-it-finetuned-en-to-it-predictions


In [92]:
# delete checkpoint directory (the unpacked copy used for evaluation)
shutil.rmtree(path_to_output_directory)

In [93]:
# delete the remaining local artifacts: the checkpoint archive and the predictions CSV
os.remove(f"{model_checkpoint_directory}.zip") # model checkpoints
os.remove(path_to_prediction_file) # csv file