<a href="https://colab.research.google.com/github/Rana-Banerjee/mlpipelines/blob/main/zindi_t5_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only helper); the data and model
# paths used further down in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers[torch] datasets evaluate rouge_score sacrebleu bleu

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from abc import ABC, abstractmethod
import pandas as pd

class Dataset(ABC):
    """Abstract base for a named dataset backed by a pandas DataFrame.

    ``name`` and ``df`` are plain read/write properties over the private
    ``_name`` and ``_df`` attributes. Neither attribute has a default, so
    reading a property before it has been assigned raises ``AttributeError``.
    Concrete subclasses must implement :meth:`preprocess`.
    """

    _name: str
    _df: pd.DataFrame

    @abstractmethod
    def preprocess(self) -> None:
        """Transform the held data; the concrete behaviour is up to the subclass."""

    @property
    def df(self) -> pd.DataFrame:
        """The DataFrame currently held by this dataset."""
        return self._df

    @df.setter
    def df(self, new_df) -> None:
        self._df = new_df

    @property
    def name(self) -> str:
        """The dataset's name."""
        return self._name

    @name.setter
    def name(self, new_name) -> None:
        self._name = new_name

In [4]:
import pandas as pd
import ast
import os
import datasets

class Zindi_Dataset(Dataset):
    """The dyula to french translation dataset for zindi.

    Intended use is ``load()`` followed by ``preprocess()``. After that, ``df``
    holds exactly two columns -- the prompt-prefixed source text (``src_lang``)
    and the target text (``target_lang``), both lower-cased -- and ``ds`` holds
    the same data as a ``datasets.Dataset``.
    """
    def __init__(self, data_path:str):
        """Store the CSV location; nothing is read until ``load()`` is called."""
        self.data_path= data_path
        self.src_lang = 'dyu'
        self.target_lang = 'fr'
        self.df=None
        self.ds=None

    def load(self, seperator='|')-> None:
        """Read the CSV at ``data_path`` into ``self.df``.

        Args:
            seperator: Field delimiter handed to ``pandas.read_csv`` (the
                parameter's spelling is kept so existing keyword callers work).
        """
        self.df= pd.read_csv(self.data_path, sep=seperator)

    def preprocess(self)-> None:
        """Turn the raw ``translation`` column into source/target text columns.

        Each cell of ``translation`` is expected to be a Python dict literal
        keyed by language code. The source text gets the
        ``'translate dyula to french: '`` prompt prefix. All original columns
        are discarded.

        Raises:
            RuntimeError: if called before ``load()`` has populated ``self.df``.
            KeyError: if the frame has no ``translation`` column (for example
                when ``preprocess()`` is called a second time).
        """
        if self.df is None:
            raise RuntimeError('preprocess() called before load(): there is no dataframe to process')
        # Parse every row's dict literal once and reuse it for both output columns.
        pairs = self.df['translation'].apply(ast.literal_eval)
        prompts = pairs.apply(lambda pair: f'translate dyula to french: {pair[self.src_lang].lower()}')
        targets = pairs.apply(lambda pair: pair[self.target_lang].lower())
        # Build a fresh frame instead of adding/dropping columns in place; both
        # series share the loaded frame's index, so the row order is unchanged.
        self.df = pd.DataFrame({self.src_lang: prompts, self.target_lang: targets})
        self.ds = datasets.Dataset.from_pandas(self.df)

In [5]:
import datasets
# Folder on the mounted Drive that contains the train/validation CSV files.
DATA_DIR = '/content/drive/MyDrive/zindi_opt/t5/data'


def _load_split(file_name: str) -> Zindi_Dataset:
    """Load and preprocess the CSV called ``file_name`` inside ``DATA_DIR``."""
    split = Zindi_Dataset(f'{DATA_DIR}/{file_name}')
    split.load()
    split.preprocess()
    return split


train_ds = _load_split('train.csv')
# The held-out split is read from validation.csv; the ``test_*`` names are kept
# because the cells below refer to them.
test_ds = _load_split('validation.csv')

train_dataset = train_ds.ds
test_dataset = test_ds.ds


In [73]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoConfig

# Checkpoint name shared by the config, the model weights and the tokenizer.
model_checkpoint= 'google-t5/t5-small'
# Start from the checkpoint's own config and override fields before the model is built.
configuration = AutoConfig.from_pretrained(model_checkpoint)
# configuration.classifier_dropout = 0.001
# NOTE(review): the saved cell output shows "dropout_rate": 0.125, so that output
# predates this value -- re-run the cell to refresh it.
configuration.dropout_rate = 0.13
# NOTE(review): max_length is a generation setting; keeping it on the model config
# appears to be what triggers the "Non-default generation parameters:
# {'max_length': 128}" messages in the training log -- confirm before moving it.
configuration.max_length=128

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,config = configuration)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Bare expression: shows the effective model config as the cell's output.
model.config



T5Config {
  "_name_or_path": "google-t5/t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.125,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "max_length": 128,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": tru

In [74]:
def batch_tokenize_fn(examples, max_source_length=128, max_target_length=128):
    """Tokenize a batch of source prompts and target sentences for seq2seq training.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``'dyu'`` entry (source texts) and a
            ``'fr'`` entry (target texts).
        max_source_length: Sources are truncated to this many tokens.
        max_target_length: Targets are truncated to this many tokens.

    Returns:
        The tokenizer output for the sources, with the target token ids added
        under the ``'labels'`` key.
    """
    model_inputs = tokenizer(examples['dyu'], max_length=max_source_length, truncation=True)
    # ``text_target=`` is the supported replacement for the deprecated
    # ``tokenizer.as_target_tokenizer()`` context manager.
    labels = tokenizer(text_target=examples['fr'], max_length=max_target_length, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize both splits batch-wise; every original column is removed so that only
# the fields returned by batch_tokenize_fn remain.
train_tokenized_ds = train_dataset.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=train_dataset.column_names,
)
test_tokenized_ds = test_dataset.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=test_dataset.column_names,
)

Map:   0%|          | 0/8065 [00:00<?, ? examples/s]



Map:   0%|          | 0/1471 [00:00<?, ? examples/s]

In [75]:
from transformers import DataCollatorForSeq2Seq
from dataclasses import dataclass
import torch
import numpy as np
import random
import os

@dataclass
class Config:
    """Hyper-parameters and runtime settings for the fine-tuning run.

    Creating an instance also seeds Python's ``random``, NumPy and PyTorch
    (CPU generator and all CUDA devices) with ``seed``.
    """

    # Data handling
    batch_size: int = 32
    num_workers: int = 4
    seed: int = 42
    max_source_length: int = 128
    max_target_length: int = 128

    # Optimisation
    lr: float = 1e-4
    weight_decay: float = 0.01
    epochs: int = 5000
    # The default is evaluated once, when the class body is executed.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __post_init__(self):
        """Seed every random number generator used in the notebook from ``self.seed``."""
        seeders = (
            random.seed,
            np.random.seed,
            torch.manual_seed,
            torch.cuda.manual_seed_all,
        )
        for apply_seed in seeders:
            apply_seed(self.seed)

# Instantiating Config also seeds the RNGs (see Config.__post_init__).
config = Config()
# Batch collator built from the notebook-level tokenizer and model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



In [76]:
from transformers import Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
    # --- checkpoints ---
    output_dir='/content/drive/MyDrive/zindi_opt/t5/model',
    save_total_limit=2,
    # --- optimisation ---
    do_train=True,
    num_train_epochs=config.epochs,
    learning_rate=config.lr,
    weight_decay=config.weight_decay,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    # Gradients are accumulated over 8 batches before each optimizer step.
    gradient_accumulation_steps=8,
    # Careful when training T5 models with fp16 mixed precision: the model was
    # pre-trained with bfloat16 mixed precision, and mixing the two precision
    # types might result in nan loss.
    # https://discuss.huggingface.co/t/mixed-precision-for-bfloat16-pretrained-models/5315
    # NOTE(review): fp16 is enabled here despite that warning -- watch the loss.
    fp16=True,
    dataloader_num_workers=2,
    # --- evaluation and best-model selection ---
    evaluation_strategy="steps",
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    # logging_steps=500
)



In [77]:
import evaluate
# Metric objects loaded through the `evaluate` library.
rouge_score = evaluate.load("rouge")
# NOTE(review): bleu_score is loaded but not used by any active code in the cells shown here.
bleu_score = evaluate.load("bleu")
sacrebleu_score = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    """
    Compute ROUGE and sacreBLEU for the translations generated by the seq2seq model.

    tip: we can run trainer.predict on our eval/test dataset to see what a sample
    eval_pred object would look like when implementing custom compute metrics function

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        dict with ``rouge1``, ``rouge2``, ``rougeL`` and ``sacrebleu`` values,
        each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Predictions can arrive as a tuple whose first element holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks padded/ignored positions and cannot be decoded, so swap it for the
    # pad token id in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Turn token ids back into text: generated translations and their references.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"]
    )
    # sacreBLEU takes a list of references per prediction; there is one each here.
    score = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels]
    )
    result["sacrebleu"] = score["score"]
    return {k: round(v, 4) for k, v in result.items()}

In [78]:
from transformers import Seq2SeqTrainer,EarlyStoppingCallback
# Collator for the trainer, built from the current tokenizer and model objects.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_ds,
    eval_dataset=test_tokenized_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Patience of 1 is the callback's default, spelled out here for readers.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

In [None]:
# Start fine-tuning; the recorded output reports loss and metrics every 500 steps.
train_output = trainer.train()

  self.pid = os.fork()


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Sacrebleu
500,2.9922,2.574626,0.1186,0.0192,0.1139,0.9663
1000,2.6035,2.46702,0.1421,0.031,0.1363,1.4847
1500,2.3839,2.415823,0.1573,0.0403,0.1512,2.2694
2000,2.2118,2.388616,0.1764,0.0527,0.1709,2.7832
2500,2.0617,2.373447,0.1893,0.0622,0.1836,3.6723
3000,1.9305,2.377022,0.1917,0.0646,0.1863,3.9439


  self.pid = os.fork()
Non-default generation parameters: {'max_length': 128}
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 128}
  self.pid = os.fork()
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 128}
  self.pid = os.fork()
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 128}
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 128}
  self.pid = os.fork()
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 128}
  self.pid = os.fork()
