<a href="https://colab.research.google.com/github/Nid989/Isometric-Multi-task-NMT/blob/main/finetune_de_mBART_paraphrasing_training_%26_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
opusparcus: short sentences; useful for training the model on common, everyday language and slang
paws-x: normal and long sentences; acquaints the model with in-depth paraphrase reordering and changes in wording
"""

In [None]:
%%capture
!pip install transformers sentencepiece boto3 sacrebleu wandb datasets --quiet

In [None]:
%%capture
!pip install nltk -U --quiet

In [None]:
from datasets import load_dataset, load_metric
import os
import boto3
import shutil
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianMTModel, MarianTokenizer
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from tqdm.notebook import tqdm
import logging
from torch.utils.data import DataLoader
from datetime import datetime
from sklearn.model_selection import train_test_split
import wandb
import numpy as np

# register tqdm's `progress_apply` on pandas objects (used further down on `pred_df`)
tqdm.pandas()
# root logger at INFO, but only let errors through from the "transformers" logger
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [None]:
# current working directory; the intermediate CSV/zip paths below are built from it
current_directory = os.getcwd()

In [None]:
# for logging loss to wandb.ai
# The API key is read from the WANDB_API_KEY environment variable instead of being
# hardcoded, so the secret is never stored in the notebook. When the variable is
# unset, `key=None` lets `wandb.login` fall back to its own lookup / prompt.
# NOTE: any key that was previously committed in this cell should be revoked.
access_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=access_key)

In [None]:
# German ("de") Opusparcus paraphrase pairs, loaded with quality=95
opusparcus_data = load_dataset("GEM/opusparcus", lang="de", quality=95)

In [None]:
def process_opusparcus_data(data, data_type):
  """Export one Opusparcus split to `opusparcus_<data_type>.csv` with unified column names.

  The split is dumped to CSV in `current_directory`, read back with pandas,
  stripped of the `lang`, `gem_id`, `references` and `annot_score` columns,
  and its `input`/`target` columns are renamed to `input_text`/`target_text`
  before the same file is overwritten.

  Args:
    data: object exposing `to_csv(path, index=False)` (one dataset split).
    data_type: split name used in the file name, e.g. "train".
  """
  csv_path = os.path.join(current_directory, f"opusparcus_{data_type}.csv")
  data.to_csv(csv_path, index=False)
  unused_columns = ["lang", "gem_id", "references", "annot_score"]
  unified_names = {'input': 'input_text', 'target': 'target_text'}
  pairs = pd.read_csv(csv_path).drop(columns=unused_columns).rename(columns=unified_names)
  pairs.to_csv(csv_path, index=False)

In [None]:
# save train, test and validation data locally for further processing
data_types = ["train", "test", "validation"]
for data_type in tqdm(data_types, total=len(data_types)):
  # writes opusparcus_<split>.csv into the working directory
  process_opusparcus_data(opusparcus_data[data_type], data_type=data_type)

In [None]:
# German ("de") configuration of the PAWS-X dataset
pawsx_data = load_dataset("paws-x", "de")

In [None]:
def process_pawsx_data(data, data_type):
  """Export one PAWS-X split to `pawsx_<data_type>.csv` with unified column names.

  The split is dumped to CSV in `current_directory`, read back with pandas,
  stripped of the `id` and `label` columns, and its `sentence1`/`sentence2`
  columns are renamed to `input_text`/`target_text` before the same file is
  overwritten.

  Args:
    data: object exposing `to_csv(path, index=False)` (one dataset split).
    data_type: split name used in the file name, e.g. "train".
  """
  csv_path = os.path.join(current_directory, f"pawsx_{data_type}.csv")
  data.to_csv(csv_path, index=False)
  unused_columns = ["id", "label"]
  unified_names = {'sentence1': 'input_text', 'sentence2': 'target_text'}
  pairs = pd.read_csv(csv_path).drop(columns=unused_columns).rename(columns=unified_names)
  pairs.to_csv(csv_path, index=False)

In [None]:
# save train, test and validation data locally for further processing
data_types = ["train", "test", "validation"]
for data_type in tqdm(data_types, total=len(data_types)):
  # writes pawsx_<split>.csv into the working directory
  process_pawsx_data(pawsx_data[data_type], data_type=data_type)

In [None]:
def process_data_opusparcus_and_pawsx_data(data_type, random_state=None):
  """Merge the two per-dataset CSVs of one split into a shuffled `<data_type>.csv`.

  Reads `<dataset_a>_<data_type>.csv` and `<dataset_b>_<data_type>.csv` from
  `current_directory` (`dataset_a` / `dataset_b` are module-level names),
  concatenates and shuffles them, drops rows containing missing values and
  writes the result to `<data_type>.csv`. The two source files are deleted
  only after the merged file has been written, so a failure while merging
  or writing does not lose the inputs.

  Args:
    data_type: split name, e.g. "train", "test" or "validation".
    random_state: optional seed forwarded to `DataFrame.sample` to make the
      shuffle reproducible; the default `None` keeps the unseeded shuffle.
  """
  path_to_dataset_a_data_type = os.path.join(current_directory, f"{dataset_a}_{data_type}.csv")
  path_to_dataset_b_data_type = os.path.join(current_directory, f"{dataset_b}_{data_type}.csv")
  df_a = pd.read_csv(path_to_dataset_a_data_type)
  df_b = pd.read_csv(path_to_dataset_b_data_type)
  df = pd.concat([df_a, df_b], axis=0)
  df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
  df = df.dropna()
  path_to_data_type = os.path.join(current_directory, f"{data_type}.csv")
  df.to_csv(path_to_data_type, index=False)
  # clean up the per-dataset files only once the merged file exists on disk
  os.remove(path_to_dataset_a_data_type)
  os.remove(path_to_dataset_b_data_type)

In [None]:
# merge opusparcus and pawsx dataset
# `dataset_a` / `dataset_b` are read as globals by
# `process_data_opusparcus_and_pawsx_data` to build the per-dataset file names
dataset_a = "opusparcus"
dataset_b = "pawsx"
data_types = ["train", "test", "validation"]
for data_type in tqdm(data_types, total=len(data_types)):
  process_data_opusparcus_and_pawsx_data(data_type)  

In [None]:
# base checkpoint used for both the tokenizer and the model that is fine-tuned below
train_model_checkpoints = "facebook/mbart-large-50"

In [None]:
# load the mBART-50 tokenizer; source and target language are both set to German (de_DE)
tokenizer = MBart50TokenizerFast.from_pretrained(train_model_checkpoints, src_lang="de_DE", tgt_lang="de_DE")

In [None]:
def add_prompt(source_list, target_list):
  """Prefix every source sentence with a length-control prompt.

  The prompt is chosen from the character-length ratio
  `len(target) / len(source)` of each pair:

    * ratio < 0.95           -> "paraphrase short"
    * 0.95 <= ratio <= 1.10  -> "paraphrase normal"
    * ratio > 1.10           -> "paraphrase long"

  An empty source has no defined ratio: it is treated as "normal" when the
  target is empty too, and as "long" otherwise, instead of raising
  ZeroDivisionError.

  Args:
    source_list: iterable of source strings.
    target_list: iterable of target strings, paired with `source_list` by
      position (pairs beyond the shorter iterable are ignored by `zip`).

  Returns:
    List of strings of the form "<prompt> <source>", in input order.
  """
  processed_input = []
  for source, target in zip(source_list, target_list):
    if len(source) == 0:
      ts_ratio = 1.0 if len(target) == 0 else float("inf")
    else:
      ts_ratio = len(target) / len(source)
    if ts_ratio < 0.95:
      prefix = "paraphrase short"
    elif ts_ratio <= 1.10:
      prefix = "paraphrase normal"
    else:
      prefix = "paraphrase long"
    processed_input.append(prefix + " " + source)
  return processed_input

In [None]:
# preprocess the paraphrase pairs (`input_text` -> `target_text`) for fine-tuning
max_input_length = 128
max_target_length = 128
def preprocess_function(examples):
    """Tokenize a batch of paraphrase pairs into model inputs and labels.

    Args:
        examples: batch mapping with "input_text" and "target_text" lists.

    Returns:
        The tokenized inputs (each source prefixed with its length prompt via
        `add_prompt` and truncated to `max_input_length`) with an additional
        "labels" entry holding the target token ids (truncated to
        `max_target_length`).
    """
    inputs = examples["input_text"]
    targets = examples["target_text"]
    inputs = add_prompt(inputs, targets)  # prepend the appropriate length prompt
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Tokenize the targets through `text_target`, which replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call keeps the
    # target-specific max length.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# sample_train = pd.read_csv("train.csv")
# sample_train = sample_train.sample(5000).reset_index(drop=True)
# sample_train.to_csv("sample_train.csv", index=False)
# sample_validation = pd.read_csv("validation.csv")
# sample_validation = sample_validation.sample(1000).reset_index(drop=True)
# sample_validation.to_csv("sample_validation.csv", index=False)

In [None]:
def _ensure_sample_csv(split_name, sample_size, seed=42):
  """Return the path of `sample_<split_name>.csv`, creating it if it is missing.

  A missing sample file is drawn from `<split_name>.csv` in `current_directory`
  with a fixed seed, so the notebook runs from a fresh kernel without a manual
  sampling step. The sample is capped at the number of available rows.
  An existing sample file is used as-is.
  """
  sample_path = os.path.join(current_directory, f"sample_{split_name}.csv")
  if not os.path.exists(sample_path):
    full_split = pd.read_csv(os.path.join(current_directory, f"{split_name}.csv"))
    subset = full_split.sample(min(sample_size, len(full_split)), random_state=seed)
    subset.reset_index(drop=True).to_csv(sample_path, index=False)
  return sample_path

# To train on the full data instead of the samples, use the commented paths.
# path_to_train_data = os.path.join(current_directory, "train.csv")
path_to_train_data = _ensure_sample_csv("train", 5000)
# path_to_validation_data = os.path.join(current_directory, "validation.csv")
path_to_validation_data = _ensure_sample_csv("validation", 1000)
raw_train_dataset = load_dataset('csv', data_files={"train": path_to_train_data})
raw_validation_dataset = load_dataset('csv', data_files={"validation": path_to_validation_data})

In [None]:
# tokenize raw data; `batched=True` hands `preprocess_function` batches of examples
tokenized_train_datasets = raw_train_dataset["train"].map(preprocess_function, batched=True)
tokenized_validation_datasets = raw_validation_dataset["validation"].map(preprocess_function, batched=True)

In [None]:
# training procedure: load the pretrained base model that will be fine-tuned
model = MBartForConditionalGeneration.from_pretrained(train_model_checkpoints)

In [None]:
# Hyper-parameters and run naming for the fine-tuning run.
batch_size = 2 # change batch-size according to GPU availability 
# last path component of the checkpoint id; used in the output directory name below
model_name = train_model_checkpoints.split("/")[-1]
epoch = 2
lang = "de"
# logging, checkpoint saving and evaluation all follow the same step-based schedule
strategy = "steps"
steps_ = 500
save_steps_ = 500

# define training model arguments
args = Seq2SeqTrainingArguments(
    # first positional argument: output directory of the run
    f"fewshot-learning-{model_name}-paraphrase-finetuned-for-{lang}",
    learning_rate=5e-5, 
    logging_strategy=strategy,
    logging_steps=steps_,
    # alternative schedule that was tried / kept for reference:
    # learning_rate=0.0003,
    # lr_scheduler_type="linear",
    # warmup_ratio=0.06,
    optim="adafactor",
    save_strategy=strategy,
    save_steps=save_steps_,
    evaluation_strategy=strategy,
    eval_steps=steps_,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    report_to="wandb",
    # keep a single checkpoint on disk
    save_total_limit=1,
    # evaluate on generated sequences so `compute_metrics` can decode them
    predict_with_generate=True    
)

# initialize data-collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# evaluation metrics used by `compute_metrics`: BLEU (sacrebleu) and METEOR
sacrebleu = load_metric("sacrebleu")
meteor = load_metric("meteor")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Each label is additionally wrapped in a one-element list, giving one
    list of references per prediction.

    Returns:
        Tuple `(stripped_predictions, wrapped_labels)`.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())
    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])
    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with BLEU and METEOR.

    Args:
        eval_preds: `(predictions, label_ids)` pair; `predictions` may itself
            be a tuple, in which case its first element holds the token ids.

    Returns:
        Dict with "bleu", "meteor" and "gen_len" (mean number of non-padding
        prediction tokens), each rounded to 4 decimals. The dict is also printed.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 marks ignored positions and cannot be decoded; map it to the pad token.
    # Predictions are masked as well because, depending on the library version,
    # generated sequences can also be padded with -100.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    sacrebleu_result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    # METEOR gets one plain reference string per prediction instead of the nested
    # lists built for sacrebleu.
    # NOTE(review): the `load_metric("meteor")` implementation appears to expect
    # string references; confirm against the installed version.
    meteor_references = [references[0] for references in decoded_labels]
    meteor_result = meteor.compute(predictions=decoded_preds, references=meteor_references)
    result = {
        "bleu": sacrebleu_result["score"],
        "meteor": meteor_result['meteor']
    }
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    print(result)
    return result

In [None]:
# initialize the trainer module
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_validation_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# train the model (logging targets and checkpoint schedule are configured in `args`)
trainer.train()

In [None]:
 # compress model checkpoint directory
# Run name, identical to the output directory passed to `Seq2SeqTrainingArguments`.
# It is rebuilt from the base checkpoint id rather than from the current
# `model_name`, so re-running this cell does not nest the name further.
model_name = f"fewshot-learning-{train_model_checkpoints.split('/')[-1]}-paraphrase-finetuned-for-{lang}"
# The checkpoint directory is exactly <cwd>/<run name>. Wrapping `model_name` in the
# name template a second time would point at a directory that was never created.
model_checkpoint_directory = os.path.join(current_directory, model_name)
print(model_checkpoint_directory)
# writes <model_checkpoint_directory>.zip containing the directory's contents
shutil.make_archive(model_checkpoint_directory, "zip", model_checkpoint_directory)

In [None]:
# Upload the zipped checkpoint to S3.
# Credentials are resolved by boto3's default credential chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the shared
# credentials file) instead of being hardcoded in the notebook.
# NOTE: any key pair that was previously committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
key = f"{epoch}_{model_name}"
filename = f"{model_checkpoint_directory}.zip"
print(key)
s3.meta.client.upload_file(Bucket='tsd2022', Key=key, Filename=filename)

In [None]:
# delete checkpoint directory
current_directory = os.getcwd()
# `model_checkpoint_directory` was itself built from the working directory above
path_to_directory = os.path.join(current_directory, model_checkpoint_directory)
shutil.rmtree(path_to_directory)

In [None]:
# delete zip file (`filename` is the archive path set in the upload cell)
current_directory = os.getcwd()
path_to_zip_file = os.path.join(current_directory, filename)
os.remove(path_to_zip_file)

----

In [None]:
# download MT predictions
# Credentials are resolved by boto3's default credential chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the shared
# credentials file) instead of being hardcoded in the notebook.
# NOTE: any key pair that was previously committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
key = "opus-mt-en-de-predictions"
filename = "opus-mt-en-de-predictions.csv"
s3.meta.client.download_file(Bucket='tsd2022', Key=key, Filename=filename)

In [None]:
# load the downloaded MT predictions; later cells read its "en", "de" and "mt_pred" columns
path_to_predfile = os.path.join(current_directory, "opus-mt-en-de-predictions.csv")
pred_df = pd.read_csv(path_to_predfile)

In [None]:
def check_for_verbosity(input_text, target_text):
  """Tell whether the target's length is outside the normal range of the input's.

  The character-length ratio `len(target_text) / len(input_text)` counts as
  normal when it lies in [0.90, 1.10].

  Args:
    input_text: source string.
    target_text: string whose length is compared with the source.

  Returns:
    True if the ratio is outside [0.90, 1.10], False otherwise. For an empty
    `input_text` the ratio is undefined: the result is False when
    `target_text` is empty too and True otherwise (no ZeroDivisionError).
  """
  if len(input_text) == 0:
    return len(target_text) != 0
  ts_ratio = len(target_text) / len(input_text)
  return not (0.90 <= ts_ratio <= 1.10)

def append_paraphrase_prompt(input_text, target_text):
  """Prefix `target_text` with the prompt that counteracts its length deviation.

  Despite the name, the prompt is prepended. It is chosen from the
  character-length ratio `len(target_text) / len(input_text)`:

    * ratio < 0.90 -> "paraphrase long"  (presumably to request a longer paraphrase)
    * ratio > 1.10 -> "paraphrase short" (presumably to request a shorter paraphrase)
    * otherwise    -> no prompt; `target_text` is returned unchanged
      (concatenating the missing prefix would raise TypeError)

  An empty `input_text` paired with a non-empty target is treated as
  ratio > 1.10; two empty strings are treated as in range.

  Args:
    input_text: source string.
    target_text: string to be prefixed.

  Returns:
    "<prompt> <target_text>", or `target_text` itself when no prompt applies.
  """
  if len(input_text) == 0:
    ts_ratio = 1.0 if len(target_text) == 0 else float("inf")
  else:
    ts_ratio = len(target_text) / len(input_text)
  if ts_ratio < 0.90:
    prefix = "paraphrase long"
  elif ts_ratio > 1.10:
    prefix = "paraphrase short"
  else:
    return target_text
  return prefix + " " + target_text

In [None]:
# Flag rows whose MT prediction length deviates from the English input.
# NOTE: despite its name, the "is_normal" column stores the result of
# `check_for_verbosity`, which is True for rows that are NOT within the normal
# length range; those rows are collected in `not_normal_seq_index`.
pred_df["is_normal"] = pred_df.progress_apply(
    lambda row: check_for_verbosity(row['en'], row['mt_pred']),
    axis=1
)
not_normal_seq_index = pred_df.index[pred_df['is_normal'] == True].to_list()

In [None]:
columns = ["en", "de", "mt_pred"]
# Split the predictions by the not-normal index. Explicit copies are taken so that
# the column assignment below writes to independent frames instead of slices of
# `pred_df` (which triggers pandas' SettingWithCopyWarning).
not_normal_mask = pred_df.index.isin(not_normal_seq_index)
pred_normal_df = pred_df.loc[~not_normal_mask, columns].copy()
pred_not_normal_df = pred_df.loc[not_normal_mask, columns].copy()

# apply paraphrase prompt 
pred_not_normal_df["mt_pred"] = pred_not_normal_df.progress_apply(
    lambda row: append_paraphrase_prompt(row['en'], row['mt_pred']),
    axis=1
)

# persist both parts; the not-normal file is the input of the paraphrasing model
path_to_not_normal_file = os.path.join(current_directory, "test_not_normal.csv")
path_to_normal_file = os.path.join(current_directory, "test_normal.csv")
pred_not_normal_df.to_csv(path_to_not_normal_file, index=False)
pred_normal_df.to_csv(path_to_normal_file, index=False)

In [None]:
# identifies the archived checkpoint to evaluate; the S3 key is "<epoch>_<model_name>"
epoch = 1
model_name = "mbart-large-50-paraphrase-finetuned-for-de"

In [None]:
# Download the zipped fine-tuned checkpoint from S3.
# Credentials are resolved by boto3's default credential chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the shared
# credentials file) instead of being hardcoded in the notebook.
# NOTE: any key pair that was previously committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
key = f"{epoch}_{model_name}"
filename = f"{model_name}.zip"
print(key)
s3.meta.client.download_file(Bucket='tsd2022', Key=key, Filename=filename)

In [None]:
# unpack the downloaded archive into <cwd>/<model_name>/
current_directory = os.getcwd()
path_to_zipfile = os.path.join(current_directory, f"{model_name}.zip")
path_to_output_directory = os.path.join(current_directory, f"{model_name}/")
shutil.unpack_archive(path_to_zipfile, path_to_output_directory)

In [None]:
# pre-trained model checkpoints
# `os.listdir` returns entries in arbitrary order, so taking element [0] is not
# deterministic. Prefer the "checkpoint-<step>" entry with the highest step
# (assumes the archive holds Trainer-style checkpoint directories; verify against
# the uploaded zip). Otherwise fall back to the first entry in sorted order.
archive_entries = sorted(os.listdir(path_to_output_directory))
checkpoint_entries = [
    entry for entry in archive_entries
    if entry.startswith("checkpoint-") and entry.rsplit("-", 1)[-1].isdigit()
]
if checkpoint_entries:
    selected_entry = max(checkpoint_entries, key=lambda entry: int(entry.rsplit("-", 1)[-1]))
else:
    selected_entry = archive_entries[0]
evaluation_model_checkpoint = os.path.join(path_to_output_directory, selected_entry)

In [None]:
# load the tokenizer stored with the fine-tuned checkpoint
tokenizer = AutoTokenizer.from_pretrained(evaluation_model_checkpoint)

In [None]:
# load the fine-tuned model from the unpacked checkpoint for inference
model = MBartForConditionalGeneration.from_pretrained(evaluation_model_checkpoint)

In [None]:
# the prompted, not-normal-length MT predictions are the test inputs
processed_raw_test_dataset = load_dataset('csv', data_files={"test": path_to_not_normal_file})

In [None]:
# one example per batch, loaded in the main process
test_dataloader = DataLoader(processed_raw_test_dataset["test"], batch_size=1, num_workers=0)

In [None]:
# generate model prediction
# Each batch of prompted MT predictions is tokenized, passed through
# `model.generate`, and decoded back to text without special tokens.
predictions = []
for batch in tqdm(test_dataloader):
  encoded_batch = tokenizer(batch['mt_pred'], return_tensors="pt", padding=True)
  translated = model.generate(**encoded_batch)
  for generated_ids in translated:
    predictions.append(tokenizer.decode(generated_ids, skip_special_tokens=True))

# replace the prompted inputs with the generated paraphrases, then restore the
# original row order by merging with the untouched normal-length rows
pred_not_normal_df["mt_pred"] = predictions
merged_frames = [pred_normal_df, pred_not_normal_df]
processed_pred_df = pd.concat(merged_frames).sort_index()
predfile_name = f"{model_name}-predictions.csv"
path_to_processed_predfile = os.path.join(current_directory, predfile_name)
processed_pred_df.to_csv(path_to_processed_predfile, index=False)

In [None]:
# Upload the processed predictions to S3.
# Credentials are resolved by boto3's default credential chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the shared
# credentials file) instead of being hardcoded in the notebook.
# NOTE: any key pair that was previously committed here should be rotated.
session = boto3.Session()
s3 = session.resource('s3')
# object key: the prediction file name up to its first "."
key = predfile_name.split('.')[0]
filename = path_to_processed_predfile
print(key)
s3.meta.client.upload_file(Bucket='tsd2022', Key=key, Filename=filename)

In [None]:
# remove the unpacked checkpoint directory
shutil.rmtree(path_to_output_directory)

In [None]:
# remove the downloaded archive and the intermediate / result CSV files
os.remove(path_to_zipfile)
os.remove(path_to_predfile)
os.remove(path_to_not_normal_file)
os.remove(path_to_normal_file)
os.remove(path_to_processed_predfile)

##### example 1
----
* en_text: "Heinz Kohut saw the grandiose self as a fixation on a normal childhood stage, while other post-Freudians examined the role of the fixation in aggression and delinquency."
* de_text: "Heinz Kohut sah das grandiose Selbst als Fixierung auf ein normales Kindheitstadium, während andere Post-Freudianer die Rolle der Fixierung bei Aggression und Kriminalität untersuchten."
* target to source ratio: 1.0946745562130178 
----
* short_text: "Das grandiose Selbst als Fixierung auf ein normales Kindheitstadium, während andere Post-Freudianer die Rolle der Fixierung bei Aggression und Kriminalität untersuchten"
* normal_text: "Heinz Kohut sah das grandiose Selbst als Fixierung auf ein normales Kindheitstadium, während andere Post-Freudianer die Rolle der Fixierung bei Aggression und Kriminalität untersuchten."
* long_text: "Heinz Kohut sah das grandiose Selbst als Fixierung auf ein normales Kindheitstadium, während andere Post-Freudianer die Rolle der Fixierung bei Aggression und Kriminalität untersuchten."
* target to source ratio: 
  * short: 0.9940828402366864
  * normal: 1.0946745562130178
  * long: 1.0946745562130178

##### example 2
----
* en_text: "In April 1942 Britten returned to England and shortly thereafter asked Montagu Slater to be his librettist for Peter Grimes."
* de_text: "Im April 1942 kehrte Britten zurück nach England, und kurz danach bat er Montagu Slater, sein Librettist für \"Peter Grimes\" zu werden."
* target to source ratio: 1.0806451612903225
----

* short_text: "In England kehrte er zurück, und kurz danach bat er Montagu Slater, sein Librettist für "Peter Grimes" zu werden."
* normal_text: "In April 1942 kehrte Britten zurück, und kurz danach bat er Montagu Slater, sein Librettist für "Peter Grimes" zu werden."
* long_text: "In England kehrte er zurück, und kurz danach bat er Montagu Slater, sein Librettist für "Peter Grimes" zu werden."
* target to source ratio: 
  * short: 0.9112903225806451, 
  * normal: 0.9758064516129032
  * long: 0.9112903225806451

##### example 3
----
* en_text: "Tell that to his father, Zac MacGuire (Charlie Clausen), and Evie right away."
* de_text: "Sag das sofort seinem Vater, Zac MacGuire (Charlie Clausen), und Evie ."
* target to source ratio: 0.922077922077922
----
* short_text: "Sag es seinem Vater, Zac MacGuire, und Evie"
* normal_text: "Sag es seinem Vater, Zac Guire (Charlie Clausen), und Evie."
* long_text: "Sag es seinem Vater, Zac Guire (Charlie Clausen), und Evie."
* target to source ratio: 
  * short: ≈ 0.5584
  * normal: ≈ 0.7662
  * long: ≈ 0.7662