In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import random

In [None]:
# Pin the `evaluate` package to version 0.4.0 for this notebook.
# NOTE(review): `evaluate` is already imported in the import cell above, so on a
# fresh kernel this install has to have run before that import can succeed.
%pip install  evaluate==0.4.0



In [None]:
# Load the "knkarthick/dialogsum" dataset. The DatasetDict printed later in this
# notebook shows train/validation/test splits with id, dialogue, summary and
# topic columns.
data_set = load_dataset("knkarthick/dialogsum")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# One checkpoint id shared by the model and its tokenizer so the two always match.
MODEL_NAME = "google/flan-t5-base"

# The weights are requested in bfloat16 via torch_dtype.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
)
tokernizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def trainable_layer(model):
  """Summarise how many of a model's parameters are trainable.

  Args:
    model: any object exposing ``named_parameters()`` that yields
      ``(name, parameter)`` pairs, where each parameter provides ``numel()``
      and a ``requires_grad`` flag.

  Returns:
    A two-line string with the trainable element count and the total
    element count.
  """
  # Walk the parameters once, recording (element count, trainable?) per tensor.
  sizes = [
    (tensor.numel(), tensor.requires_grad)
    for _name, tensor in model.named_parameters()
  ]
  all_params = sum(count for count, _ in sizes)
  trainable_params = sum(count for count, needs_grad in sizes if needs_grad)

  return f"Trainable pramerters : {trainable_params} \nTotal parameters: {all_params}"

In [None]:
# Report the trainable vs. total parameter counts of the freshly loaded base model.
print(trainable_layer(original_model))

Trainable pramerters : 247577856 
Total parameters: 247577856


In [None]:
# Pick a random test example; randint is inclusive, so indices 0..200 are possible.
index = random.randint(0, 200)

# Fetch the row once and pull out the dialogue and its human-written summary.
sample = data_set['test'][index]
dialogue = sample['dialogue']
summary = sample['summary']

# Zero-shot prompt: an instruction, the dialogue, then a cue for the summary.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize, generate up to 200 new tokens, then decode the first returned sequence.
inputs = tokernizer(prompt, return_tensors='pt')
generated_ids = original_model.generate(
    inputs["input_ids"],
    max_new_tokens=200,
)
outputs = tokernizer.decode(generated_ids[0], skip_special_tokens=True)

dash = "-" * 100

# One print call with newline separators produces the same text as five prints.
print(
    f'INPUT PROMPT:\n{prompt}',
    dash,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    dash,
    f'MODEL GENERATION - ZERO SHOT:\n{outputs}',
    sep="\n",
)

INPUT PROMPT:

Summarize the following conversation.

#Person1#: Is anybody in?
#Person2#: How can I help you?
#Person1#: I have a headache.
#Person2#: Let me take your temperature with a thermometer.
#Person1#: OK.
#Person2#: I think you have a small fever.
#Person1#: I thought so. I felt dizzy this morning.
#Person2#: You should've called in sick! Next time, have either of your parents call the school office.

Summary:

----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person2# checks #Person1#'s physical condition and finds #Person1# has a fever.

----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
#Person1#: I have a headache. #Person2#: I think you have a small fever.


# **Full Model Tuning - catastrophic loss**

In [None]:
 def tokenize_function(example):
   """Turn a batch of dialogues and summaries into model inputs and labels.

   Intended for ``Dataset.map(..., batched=True)``: ``example`` is a batch
   dict whose 'dialogue' and 'summary' entries are lists of strings.

   Each dialogue is wrapped in the summarisation prompt and tokenized into
   'input_ids'. Each summary is tokenized into 'labels'. Both are padded to
   the tokenizer's maximum length and truncated if longer.

   Padding positions in 'labels' are set to -100 so the loss ignores them;
   otherwise the model would also be trained to predict pad tokens.

   Returns:
     The same batch dict with 'input_ids' and 'labels' added.
   """
   start_prompt = "Summarize the following conversation.\n\n"
   end_prompt = '\n\nSummary:  '

   prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
   example['input_ids'] = tokernizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

   labels = tokernizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
   # -100 is the label value the cross-entropy loss skips; mask out the padding.
   labels[labels == tokernizer.pad_token_id] = -100
   example['labels'] = labels

   return example

In [None]:
# Tokenize every split in batches, then show the resulting schema.
tokenized_dataset = data_set.map(tokenize_function, batched=True)
print(tokenized_dataset)

# Drop the original text/metadata columns, leaving only input_ids and labels.
raw_text_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_dataset = tokenized_dataset.remove_columns(raw_text_columns)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 1500
    })
})


In [None]:
# Display the DatasetDict to confirm which columns remain after the clean-up.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [None]:
# Keep only every 100th row of each split so the demo trains on a small subset.
# NOTE: the result is assigned back to the same name, so re-running this cell
# subsamples the already-subsampled data again.
KEEP_EVERY_NTH = 100
tokenized_dataset = tokenized_dataset.filter(
    lambda _row, position: position % KEEP_EVERY_NTH == 0,
    with_indices=True,
)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Print the (rows, columns) shape of the splits used for training and evaluation.
for label, split in (("Training", "train"), ("Validation", "validation")):
    print(f"{label}: {tokenized_dataset[split].shape}")

Training: (125, 2)
Validation: (5, 2)


Fine-Tuning the Model

In [None]:
# A Unix-timestamp suffix gives each run its own output directory.
run_stamp = int(time.time())
logs = f"./dialogue-summary-traiing-{run_stamp}"

# Full fine-tuning configuration. Per the TrainingArguments documentation a
# positive max_steps takes precedence over num_train_epochs, so this run is
# limited to one optimisation step.
training_args = TrainingArguments(
    output_dir=logs,
    max_steps=1,
    num_train_epochs=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=1,
)

# Trainer over the base model itself, using the subsampled train/validation splits.
trainer = Trainer(
    args=training_args,
    model=original_model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [None]:
# Full fine-tuning run. The author notes that this step crashes because
# training the complete model needs more RAM than is available.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mrandrothstein17[0m ([33mrandrothstein17-ascendion[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




# Using LoRA

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for a sequence-to-sequence model: rank 32, alpha 32,
# 5% adapter dropout, no bias training, applied to the modules named "q" and "v"
# (presumably the attention query/value projections - confirm against the model).
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# A Unix-timestamp suffix gives each run its own output directory.
output_dir = f'./peft_training_log-{str(int(time.time()))}'

# Attach the LoRA adapters described by lora_config to the base model. Without
# this step the Trainer below would fine-tune every weight of original_model
# and the LoRA configuration would never be used.
# NOTE: get_peft_model injects the adapter layers into the base model's modules,
# so run this cell once per kernel session rather than wrapping repeatedly.
peft_model = get_peft_model(original_model, lora_config)

# Per the TrainingArguments documentation a positive max_steps takes precedence
# over num_train_epochs, so this run is limited to one optimisation step.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

# Train the adapter-wrapped model, not the raw base model.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation']
)

In [None]:
# Launch the run configured by peft_training_args (max_steps=1 there).
peft_trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mrandrothstein17[0m ([33mrandrothstein17-ascendion[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




# Evaluate Model

In [None]:
%pip install rouge_score
rouge = evaluate.load('rouge')