<a href="https://colab.research.google.com/github/Nid989/Isometric-Multi-task-NMT/blob/main/de_en_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# un-comment below, while working on colab.
!pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece] wandb boto3 --quiet 

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader
import torch
from datasets import load_dataset, load_metric
import numpy as np
import datasets
import boto3
import os
import shutil
import random
# from tqdm.notebook import tqdm
from tqdm import tqdm 
import wandb
import logging
import pandas as pd

In [3]:
# for logging loss to wandb.ai
# The API key is read from the WANDB_API_KEY environment variable instead of
# being hardcoded: anything written in a notebook cell is published together
# with the notebook. When the variable is unset `access_key` is None and
# `wandb.login` falls back to its own lookup (saved login / interactive prompt).
# SECURITY: the key that used to be embedded in this cell has been committed
# with the notebook and must be treated as compromised — revoke it in W&B.
access_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=access_key)

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
# data processing
# Download the `enimai/MuST-C-and-WMT16-de-en` dataset (train / test / validation
# splits with `en` and `de` text columns) and load the sacreBLEU metric that
# `compute_metrics` uses to score generated translations.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
raw_datasets = load_dataset("enimai/MuST-C-and-WMT16-de-en")
metric = load_metric("sacrebleu")



Downloading and preparing dataset csv/enimai--MuST-C-and-WMT16-de-en to /root/.cache/huggingface/datasets/csv/enimai--MuST-C-and-WMT16-de-en-c29686a0af1db3f4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/837k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/enimai--MuST-C-and-WMT16-de-en-c29686a0af1db3f4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

In [6]:
# dataset description: show the available splits with their columns and row counts
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 4778588
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 5640
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 3592
    })
})


In [7]:
# pre-trained model checkpoints
# Hub identifier of the checkpoint that both the tokenizer and the model are
# loaded from, and whose last path component names the output directory.
# NOTE(review): this identifier reads as a German->French checkpoint, while the
# notebook fine-tunes with source_lang="en" and target_lang="de" — confirm that
# this is the intended starting point.
train_model_checkpoints = "Helsinki-NLP/opus-mt-de-fr"

In [8]:
# load the MarianMT tokenizer
# The same tokenizer instance encodes source and target text in
# `preprocess_function` and decodes predictions in `compute_metrics`.
tokenizer = AutoTokenizer.from_pretrained(train_model_checkpoints)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/808k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/793k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

In [9]:
def add_verbosity(input_list, target_list, short_threshold=0.95, long_threshold=1.10):
  """
  Prefix every source sequence with a length-control tag.

  The tag is picked from the character-length ratio len(target) / len(source):
    ratio <  short_threshold                    -> "short"
    short_threshold <= ratio <= long_threshold  -> "normal"
    ratio >  long_threshold                     -> "long"

  An empty source has no defined ratio. Instead of raising ZeroDivisionError
  it is treated as infinitely shorter than a non-empty target ("long" with the
  default thresholds) and as ratio 1.0 when the target is empty as well
  ("normal" with the default thresholds).

  input: list of source & target sequences, paired by position (pairs beyond
         the shorter list are ignored); optional ratio thresholds
  output: list of source sequences, each rewritten as "<tag> <source>"
  """
  processed_input = []
  # `source` instead of `input`, so the builtin is not shadowed.
  for source, target in zip(input_list, target_list):
    if source:
      ts_ratio = len(target) / len(source)
    else:
      # Guard against division by zero for empty source sequences.
      ts_ratio = float("inf") if target else 1.0
    if ts_ratio < short_threshold:
      prefix = "short"
    elif ts_ratio <= long_threshold:
      prefix = "normal"
    else:
      prefix = "long"
    processed_input.append(prefix + " " + source)
  return processed_input

In [10]:
# preprocess MUST-C dataset
max_input_length = 128   # truncation limit (in tokens) for the tagged source text
max_target_length = 128  # truncation limit (in tokens) for the target text
source_lang = "en"
target_lang = "de"
def preprocess_function(examples):
    """
    Tokenize one batch of parallel sentences for seq2seq fine-tuning.

    examples: batch mapping that holds a list of strings under the
              `source_lang` and `target_lang` keys.
    returns:  the tokenizer output for the length-tagged source sentences, with
              the target token ids added under the "labels" key.
    """
    inputs = examples[source_lang]
    targets = examples[target_lang]
    inputs = add_verbosity(inputs, targets) # append appropriate prompts
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Tokenize the targets through `text_target`, which replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call keeps
    # the target truncation limit independent from the source one.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# tokenize raw data
# Apply `preprocess_function` to the train and validation splits. With
# `batched=True` the function receives batches of rows rather than single
# rows, which is what its list-based implementation expects.
tokenized_train_datasets = raw_datasets['train'].map(preprocess_function, batched=True)
tokenized_validation_datasets = raw_datasets['validation'].map(preprocess_function, batched=True)

In [None]:
# training procedure
# Load the pre-trained seq2seq model from the same checkpoint as the tokenizer;
# this is the model instance handed to the data collator and the trainer.
model = AutoModelForSeq2SeqLM.from_pretrained(train_model_checkpoints)

In [None]:
# Fine-tuning hyper-parameters.
batch_size = 2  # change batch-size according to GPU availability
model_name = train_model_checkpoints.rsplit("/", 1)[-1]  # checkpoint id without the organisation prefix
epoch = 3

# Training configuration: checkpoints are written to `output_dir`, evaluation
# runs once per epoch, and evaluation predictions are produced with generate().
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    num_train_epochs=epoch,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
)

# Batch collator built from the tokenizer and the model being fine-tuned.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
def postprocess_text(preds, labels):
    """
    Normalise decoded text before it is scored.

    Surrounding whitespace is removed from every prediction and every label;
    each label is additionally wrapped in a one-element list.

    returns: (list of stripped predictions, list of [stripped label] lists)
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """
    Score generated translations for the trainer's evaluation loop.

    eval_preds: (predictions, labels) pair of token-id arrays; predictions may
                arrive as a tuple, in which case its first element is used.
    returns:    dict with the sacreBLEU score under "bleu" and the mean number
                of non-padding prediction tokens under "gen_len", both rounded
                to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Predictions can be padded with -100 just like the labels. -100 is not a
    # valid token id, so map it to the pad id before decoding; this also keeps
    # such padding out of the `gen_len` count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Build the trainer: wires the model, its training configuration, the tokenized
# train/validation splits, the batch collator and the metric callback together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_validation_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# train the model
# Runs fine-tuning with the configuration held in `args` (number of epochs,
# batch size, per-epoch evaluation).
trainer.train()

In [None]:
# compress model checkpoint directory
# The name is rebuilt with the same pattern that was passed to
# `Seq2SeqTrainingArguments`, so it points at the trainer's output directory.
model_checkpoint_directory = f"{model_name}-finetuned-{source_lang}-to-{target_lang}"
print(model_checkpoint_directory)
# Zip the directory's contents into "<model_checkpoint_directory>.zip".
shutil.make_archive(model_checkpoint_directory, "zip", model_checkpoint_directory)

In [None]:
# Upload the zipped checkpoint directory to S3.
# Credentials are deliberately not embedded in the notebook: a bare
# `boto3.Session()` resolves them through boto3's standard credential chain
# (for example the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables or the shared ~/.aws/credentials file).
# SECURITY: the access key pair that used to be hardcoded in this cell has been
# committed with the notebook and must be treated as compromised — revoke it
# in IAM and issue a new one.
session = boto3.Session()
s3 = session.resource('s3')
# Object key is "<epoch>_<checkpoint directory name>"; the local file is the
# archive produced by `shutil.make_archive`.
key = f"{epoch}_{model_checkpoint_directory}"
filename = f"{model_checkpoint_directory}.zip"
s3.meta.client.upload_file(Bucket='tsd2022', Key=key, Filename=filename)

In [None]:
# delete checkpoint directory
# Resolve the checkpoint directory against the current working directory and
# remove it together with everything it contains.
current_directory = os.getcwd()
path_to_directory = os.path.join(current_directory, model_checkpoint_directory)
shutil.rmtree(path_to_directory)

In [None]:
# delete zip file
# Resolve the archive name (`filename`, set in the upload cell) against the
# current working directory and remove the local copy.
current_directory = os.getcwd()
path_to_zip_file = os.path.join(current_directory, filename)
os.remove(path_to_zip_file)