# **Machine Translation**

In [1]:
!nvidia-smi

Wed Nov  1 16:22:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
pip install transformers --upgrade



# **Setting up the Environment**

In [3]:
from pathlib import Path
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount("/content/drive")
    !pip install datasets transformers evaluate wandb accelerate -U -qq

    base_folder = Path("/content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW6")


from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer,TrainingArguments
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, pipeline
from datasets import load_dataset, DatasetDict
import evaluate
from evaluate import evaluator


import wandb
import numpy as np
import pandas as pd
from transformers import GenerationConfig
import gc
from transformers import Seq2SeqTrainingArguments
import torch

!pip install sacrebleu
!pip install bert_score
!pip install "transformers[sentencepiece]"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Function to Load Dataset**

In [4]:
def load_dataset_from_hf(name_of_the_datacard, lang1, lang2):
  data = load_dataset(name_of_the_datacard, lang1=lang1, lang2=lang2)
  return data

# **Function to Split Dataset**

In [5]:
def split_dataset(data):
  test_val_splits = data['train'].train_test_split(test_size=0.4, seed=42)
  train_split= test_val_splits['train']
  test_val_splits = test_val_splits['test'].train_test_split(test_size=0.5, seed=42,)
  val_split = test_val_splits['train']
  test_split = test_val_splits['test']
  return train_split,val_split,test_split


# **Function to Create smaller subset**

In [6]:
def get_small_subset(train_split,val_split,test_split):
  # full test dataset
  test_dataset = test_split
  # combining full train and val
  train_val_dataset = DatasetDict({'train': train_split, 'val': val_split})

  #creating small subsets for all splits
  train_split_small = train_split.shuffle(seed=42).select(range(1000))
  val_split_small = val_split.shuffle(seed=42).select(range(500))
  test_split_small = test_split.shuffle(seed=42).select(range(500))
  # combine train, val splits into one dataset
  train_val_subset = DatasetDict({'train': train_split_small, 'val': val_split_small})

  # create test dataset from test split
  test_subset= DatasetDict({'test': test_split_small})
  return train_val_subset,train_val_dataset,test_dataset


# **Function for Tokenization**

In [7]:
def get_tokenized_dataset(checkpoint, dataset,max_length):

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_fn(batch):
    inputs = [example['en'] for example in batch['translation']]
    targets = [example['fr'] for example in batch['translation']]
    model_inputs = tokenizer(text = inputs, text_target=targets, truncation = True, max_length=max_length)
    return model_inputs


  tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset['train'].column_names)
  tokenized_dataset.set_format(type="torch")
  return tokenized_dataset

# **Function to Create Datasets**

In [8]:
def setup_dataset(name_of_the_datacard, lang1, lang2):
  #load the dataset
  dataset = load_dataset_from_hf(name_of_the_datacard,lang1,lang2)

  #split the dataset
  train_split,val_split,test_split = split_dataset(dataset)

  #create smaller subset
  train_val_subset,train_val_dataset,test_dataset = get_small_subset(train_split,val_split,test_split)

  return train_val_subset,train_val_dataset,test_dataset

# **Function to Initialize Model**

In [9]:
def initialize_model(checkpoint):
  config = AutoConfig.from_pretrained(checkpoint)
  generation_config = GenerationConfig.from_pretrained(checkpoint)
  model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint,config=config)
  return model



# **Function to Compute Metrics**

In [10]:
bleu_metric = evaluate.load("sacrebleu")
bert_metric = evaluate.load('bertscore')

def compute_metrics(preds_and_labels):
  # preds are not logits but token ids
    # api is inconsistent here
    # we are not simply using argmax bu use 'beam search'
    preds, labels = preds_and_labels

    # convert predictions into words
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # for any -100 label, replace with pad token id
    labels = np.where( labels != -100, labels, tokenizer.pad_token_id )

    # convert labels into words
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens= True)

    # get rid of extra whitespace
    # and also, put targets into lists

    decoded_preds_cleaned = [pred.strip() for pred in decoded_preds]
    decoded_labels_cleaned = [label.strip() for label in decoded_labels]

    bleu_score = bleu_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned)
    bert_score = bert_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned, lang='fr')

    return{'bleu_score:': bleu_score['score'], 'bert_score': np.mean(bert_score['f1'])}
    # return {'bleu_score:': bleu_score['score']}

# **Function to set Trainer**

In [11]:
def get_trainer(model, training_args, tokenized_dataset, compute_metrics, tokenizer, data_collator):
  # initialize trainer
  trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
  return trainer


# **Function to free memory**

In [12]:
def free_memory():
    """
    Attempts to free up memory by deleting variables and running Python's garbage collector.
    """
    gc.collect()
    for device_id in range(torch.cuda.device_count()):
        torch.cuda.set_device(device_id)
        torch.cuda.empty_cache()
    gc.collect()

# **Function to tokenize dataset and, train and eval models**

In [13]:
def tokenize_train_evaluate_log(training_args, checkpoint, base_folder, max_length,
                                train_val_subset, compute_metrics):
    # 1. Free memory
    free_memory()

    # 2. Setup wandb
    wandb.login()
    %env WANDB_PROJECT = nlp_course_fall_2023-HW6-Part-D-Colab


    # 3. Get Tokenized Dataset and Data Collator
    train_val_tokenized_dataset = get_tokenized_dataset(checkpoint, train_val_subset,max_length)

    # 4. Initialize Model and Tokenizer
    model = initialize_model(checkpoint)
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # 5. Initialize Trainer
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)
    trainer = get_trainer(model, training_args, train_val_tokenized_dataset,
                          compute_metrics, tokenizer, data_collator)

    # 6. Train and Evaluate
    trainer.train()
    trainer.evaluate(train_val_tokenized_dataset['val'])


    best_model_checkpoint_step = trainer.state.best_model_checkpoint.split('-')[-1]
    wandb.log({"best_model_checkpoint_step": best_model_checkpoint_step})
    print(f"The best model was saved at step {best_model_checkpoint_step}.")

    wandb.finish()

# **Initial Training Arguments**

In [14]:
def training_args_fn(checkpoint,base_folder):
  # Configure training parameters

  # Define the directory where model checkpoints will be saved
  model_folder = base_folder / "Models" / "nlp_fall_2023/kde4/opus-mt-en-fr"

  # Create the directory if it doesn't exist
  model_folder.mkdir(exist_ok=True, parents=True)

  training_args = Seq2SeqTrainingArguments(
      # Training-specific configurations
      num_train_epochs=1,  # Total number of training epochs
      weight_decay=0.01,  # Apply L2 regularization to prevent overfitting
      learning_rate=5e-5,  # Step size for the optimizer during training
      optim="adamw_torch",  # Optimizer,
      warmup_steps=10,
      predict_with_generate=True,
      generation_config=GenerationConfig.from_pretrained(checkpoint),
      # memory and speed related arguments
      # Number of samples per training batch for each device
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,  # Number of samples per eval batch for each device

      gradient_checkpointing=True,  # memory
      # fp16 = True, # Speed
      # bf16=True,
      # tf32=True, # speed
      # evaluation settings
      output_dir=str(model_folder),  # Directory to save model checkpoints
      evaluation_strategy="steps",  # Evaluate model at specified step intervals
      eval_steps=10,  # Perform evaluation every 10 training steps
      # Checkpoint settings
      save_strategy="steps",  # Save model checkpoint at specified step intervals
      save_steps=10,  # Save a model checkpoint every 10 training steps
      load_best_model_at_end=True,  # Reload the best model at the end of training
      save_total_limit=2,  # Retain only the best and the most recent model checkpoints
      # metric_for_best_model=,
      # greater_is_better=,
      # Experiment logging configurations (commented out in this example)
      logging_strategy="steps",
      logging_steps=10,
      report_to="wandb",  # Log metrics and results to Weights & Biases platform
      # Experiment name for Weights & Biases
      run_name="translation-exp1",
  )
  return training_args

# **Experiments**

# **Dataset hyperparameters**

In [15]:
name_of_the_datacard = 'kde4'
lang1 = 'en'
lang2 = 'fr'
train_val_subset, train_val_dataset, test_dataset = setup_dataset(name_of_the_datacard,lang1,lang2)

# **Experiment 1 : with model --> Helsinki-NLP/opus-mt-en-fr and Learning rate = 5e-5**

# **Trainer hyperparameters**

In [16]:
checkpoint = 'Helsinki-NLP/opus-mt-en-fr'
exp1 = 'helsinki-model'
max_length = 128
training_args = training_args_fn(checkpoint, base_folder)
training_args_dict = training_args.to_dict() # Convert TrainingArguments to dictionary

training_args_dict['run_name'] = f'{checkpoint}-{exp1}' # Update the run_name
new_training_args = Seq2SeqTrainingArguments(**training_args_dict)



In [17]:
tokenize_train_evaluate_log(training_args= training_args,
                            checkpoint=checkpoint, base_folder=base_folder, max_length = max_length,
                            train_val_subset=train_val_subset,compute_metrics=compute_metrics)

[34m[1mwandb[0m: Currently logged in as: [33mpxa210024[0m ([33mpooja_rocks[0m). Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=nlp_course_fall_2023-HW6-Part-D-Colab






Step,Training Loss,Validation Loss,Bleu Score:,Bert Score
10,1.9915,1.655252,39.680601,0.863847
20,1.7331,1.596057,39.234273,0.863176
30,1.672,1.556005,40.578745,0.86622
40,1.4423,1.537132,41.174389,0.867369
50,1.4397,1.526197,41.246359,0.867248
60,1.5122,1.520451,41.166549,0.86663




The best model was saved at step 60.


0,1
eval/bert_score,▂▁▆██▇▇
eval/bleu_score:,▃▁▆████
eval/loss,█▅▃▂▁▁▁
eval/runtime,▂█▁▂▄▅▄
eval/samples_per_second,▆▁█▇▄▃▄
eval/steps_per_second,▆▁█▇▄▃▄
train/epoch,▁▁▂▂▄▄▅▅▆▆████
train/global_step,▁▁▂▂▄▄▅▅▆▆█████
train/learning_rate,█▇▅▄▂▁
train/loss,█▅▄▁▁▂

0,1
best_model_checkpoint_step,60.0
eval/bert_score,0.86663
eval/bleu_score:,41.16655
eval/loss,1.52045
eval/runtime,92.2558
eval/samples_per_second,5.42
eval/steps_per_second,0.347
train/epoch,1.0
train/global_step,63.0
train/learning_rate,0.0


# **Experiment 2 : with model --> t5-small and Learning rate = 5e-5**

In [18]:
checkpoint = 't5-small'
exp1 = 't5-model'
max_length = 128
training_args = training_args_fn(checkpoint, base_folder)
training_args_dict = training_args.to_dict() # Convert TrainingArguments to dictionary

training_args_dict['run_name'] = f'{checkpoint}-{exp1}' # Update the run_name
new_training_args = Seq2SeqTrainingArguments(**training_args_dict)



In [19]:
tokenize_train_evaluate_log(training_args= training_args,
                            checkpoint=checkpoint, base_folder=base_folder, max_length = max_length,
                            train_val_subset=train_val_subset,compute_metrics=compute_metrics)

env: WANDB_PROJECT=nlp_course_fall_2023-HW6-Part-D-Colab


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Bleu Score:,Bert Score
10,3.9968,3.170202,3.056221,0.692575
20,3.6976,2.845025,3.648438,0.706263
30,3.3602,2.654505,4.295556,0.710954
40,2.9414,2.566919,5.012549,0.718981
50,2.8861,2.508355,5.495368,0.718796
60,2.7894,2.481039,5.952252,0.725476




The best model was saved at step 60.




0,1
eval/bert_score,▁▄▅▇▇██
eval/bleu_score:,▁▂▄▆▇██
eval/loss,█▅▃▂▁▁▁
eval/runtime,▁▇█▄▅▄▆
eval/samples_per_second,█▁▁▅▄▄▃
eval/steps_per_second,█▁▁▅▄▄▃
train/epoch,▁▁▂▂▄▄▅▅▆▆████
train/global_step,▁▁▂▂▄▄▅▅▆▆█████
train/learning_rate,█▇▅▄▂▁
train/loss,█▆▄▂▂▁

0,1
best_model_checkpoint_step,60.0
eval/bert_score,0.72548
eval/bleu_score:,5.95225
eval/loss,2.48104
eval/runtime,11.89
eval/samples_per_second,42.052
eval/steps_per_second,2.691
train/epoch,1.0
train/global_step,63.0
train/learning_rate,0.0


# **Experiment 3 : with model --> Helsinki-NLP/opus-mt-en-fr, Learning rate = 5e-4**

In [20]:
checkpoint = 'Helsinki-NLP/opus-mt-en-fr'
exp1 = 'helsinki-model'
max_length = 128
training_args = training_args_fn(checkpoint, base_folder)
training_args_dict = training_args.to_dict() # Convert TrainingArguments to dictionary
training_args_dict['learning_rate'] = 5e-4
training_args_dict['run_name'] = f'{checkpoint}-{exp1}' # Update the run_name
new_training_args = Seq2SeqTrainingArguments(**training_args_dict)



In [21]:
tokenize_train_evaluate_log(training_args= training_args,
                            checkpoint=checkpoint, base_folder=base_folder, max_length = max_length,
                            train_val_subset=train_val_subset,compute_metrics=compute_metrics)

env: WANDB_PROJECT=nlp_course_fall_2023-HW6-Part-D-Colab




Step,Training Loss,Validation Loss,Bleu Score:,Bert Score
10,1.9915,1.655252,39.680601,0.863847
20,1.7331,1.596057,39.234273,0.863176
30,1.672,1.556005,40.578745,0.86622
40,1.4423,1.537132,41.174389,0.867369
50,1.4397,1.526197,41.246359,0.867248
60,1.5122,1.520451,41.166549,0.86663




The best model was saved at step 60.


0,1
eval/bert_score,▂▁▆██▇▇
eval/bleu_score:,▃▁▆████
eval/loss,█▅▃▂▁▁▁
eval/runtime,▁█▁▂▅▅▅
eval/samples_per_second,█▁█▆▃▃▃
eval/steps_per_second,█▁█▆▃▃▃
train/epoch,▁▁▂▂▄▄▅▅▆▆████
train/global_step,▁▁▂▂▄▄▅▅▆▆█████
train/learning_rate,█▇▅▄▂▁
train/loss,█▅▄▁▁▂

0,1
best_model_checkpoint_step,60.0
eval/bert_score,0.86663
eval/bleu_score:,41.16655
eval/loss,1.52045
eval/runtime,93.4791
eval/samples_per_second,5.349
eval/steps_per_second,0.342
train/epoch,1.0
train/global_step,63.0
train/learning_rate,0.0


# **Conclusion:**
Compared to helsinki model, t5 model's bert score is relatively less. And for helsinki model changing the learning rate from 5e-5 to 5e-4 didn't make much difference in the model performance.

In [22]:
!sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  dvisvgm fonts-droid-fallback fonts-lato fonts-lmodern fonts-noto-mono
  fonts-texgyre fonts-urw-base35 libapache-pom-java libcommons-logging-java
  libcommons-parent-java libfontbox-java libfontenc1 libgs9 libgs9-common
  libidn12 libijs-0.35 libjbig2dec0 libkpathsea6 libpdfbox-java libptexenc1
  libruby3.0 libsynctex2 libteckit0 libtexlua53 libtexluajit2 libwoff1
  libzzip-0-13 lmodern poppler-data preview-latex-style rake ruby
  ruby-net-telnet ruby-rubygems ruby-webrick ruby-xmlrpc ruby3.0
  rubygems-integration t1utils teckit tex-common tex-gyre texlive-base
  texlive-binaries texlive-latex-base texlive-latex-extra
  texlive-latex-recommended texlive-pictures tipa xfonts-encodings
  xfonts-utils
Suggested packages:
  fonts-noto fonts-freefont-otf | fonts-freefont-ttf libavalon-framework-java
  libcommons-logging-java-doc libexcalibu

In [25]:
!jupyter nbconvert --to pdf /content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW6/Pooja_Akkaladevi_HW6d.ipynb

[NbConvertApp] Converting notebook /content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW6/Pooja_Akkaladevi_HW6d.ipynb to pdf
[NbConvertApp] Writing 116533 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 93759 bytes to /content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW6/Pooja_Akkaladevi_HW6d.pdf
