In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the DialogSum dataset by its Hugging Face repository id.
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
# Bare last expression so the notebook displays the available splits and their sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
# Show the feature names and row count of the training split only.
print(dataset['train'])

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})


In [5]:
# Load the FLAN-T5 base checkpoint with bfloat16 weights, plus the tokenizer
# published under the same model name.
model_name = 'google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype = torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [6]:
def print_no_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` flag (e.g. a ``torch.nn.Module``).

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share expressed as a percentage.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count
    # The trainable share is trainable / all (not the inverse). A model with no
    # parameters at all reports 0% instead of dividing by zero.
    if all_model_params:
        percentage_of_trainable_params = (trainable_model_params / all_model_params) * 100
    else:
        percentage_of_trainable_params = 0.0
    return f"Trainable model paramaeters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model params: {percentage_of_trainable_params}% "

# Report the parameter counts of the base model loaded above, before any fine-tuning.
print(print_no_of_trainable_model_parameters(original_model))

Trainable model paramaeters: 247577856
all model parameters: 247577856
percentage of trainable model params: 100.0% 


# Test the model with zero-shot inference

In [7]:
# Pick one test example and keep its human-written summary as the baseline.
index = 200
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

# Zero-shot prompt: the instruction plus the dialogue, with no worked examples.
# The instruction is spelled "Summarize" so it matches the training prompt
# template described in the preprocessing section of this notebook.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize the prompt, generate up to 200 new tokens, and decode the first
# (only) returned sequence back to text without special tokens.
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True,
)

# Separator of 99 dashes (the same string the former join over 100 empty
# strings produced).
dash_line = '-' * 99
print(dash_line)
print(f'🚩INPUT PROMPT: \n{prompt}')
print(dash_line)
print(f'😎 BASELINE HUMAN SUMMERY: \n{summary}\n')
print(dash_line)
print(f'✅ MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
🚩INPUT PROMPT: 

Summerize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-----------------------------------------------------------------

# 2. Perform Full Fine-Tuning

<a name='2.1'></a>
### 2.1 - Preprocess the Dialog-Summary Dataset

You need to convert the dialog-summary (prompt-response) pairs into explicit instructions for the LLM. Prepend an instruction to the start of the dialog with `Summarize the following conversation` and to the start of the summary with `Summary` as follows:

Training prompt (dialogue):
```
Summarize the following conversation.

    Chris: This is his part of the conversation.
    Antje: This is her part of the conversation.
    
Summary: 
```

Training response (summary):
```
Both Chris and Antje participated in the conversation.
```

Then preprocess the prompt-response dataset into tokens and pull out their `input_ids` (1 per token).

In [8]:
def tokenize_function(example):
    """Tokenize a batch of dialogue/summary pairs into model inputs and labels.

    Written for ``dataset.map(..., batched=True)``: ``example['dialogue']`` is
    iterated as a list of strings and ``example['summary']`` is tokenized as a
    batch.

    Args:
        example: Batch mapping with ``'dialogue'`` and ``'summary'`` entries.

    Returns:
        The same mapping with two entries added: ``'input_ids'`` (token ids of
        the instruction-wrapped dialogue) and ``'labels'`` (token ids of the
        summary, with padding positions set to -100).
    """
    start_prompt = 'Summarize the following conversation. \n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

    labels = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    # Padding carries no target text. -100 is the label value the loss ignores,
    # so the model is not trained to predict pad tokens after each summary.
    labels[labels == tokenizer.pad_token_id] = -100
    example['labels'] = labels

    return example

# Tokenize every split in batches, then drop the original text/metadata
# columns so only the columns added by tokenize_function remain.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['id', 'topic', 'dialogue', 'summary'])
)

Map: 100%|██████████| 1500/1500 [00:01<00:00, 1127.62 examples/s]


In [9]:
# Subsample each split: keep only the rows whose index is a multiple of 100.
# NOTE: this rebinds `tokenized_datasets`, so running the cell a second time
# subsamples the already-reduced data again.
tokenized_datasets = tokenized_datasets.filter(
    lambda _example, row_index: row_index % 100 == 0,
    with_indices=True,
)

Filter: 100%|██████████| 1500/1500 [00:01<00:00, 1096.38 examples/s]


In [10]:
# Report the (rows, columns) shape of each split, then the full DatasetDict.
# The f-strings use double quotes outside and single quotes inside: reusing the
# same quote character inside a replacement field only parses on Python 3.12+.
print('Shapes of the datastes: ')
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datastes: 
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


### 2.2 - Fine-Tune the Model with the Preprocessed Dataset

In [11]:
# Output directory name carries the current Unix timestamp (whole seconds).
output_dir = f'./dialogue-summary-training-{int(time.time())}'

# max_steps is set alongside num_train_epochs; the library reports that
# max_steps overrides the epoch count, so this configuration runs one step.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
)

# Full fine-tuning: the trainer is handed the base model itself together with
# the tokenized train and validation splits.
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

max_steps is given, it will override any value given in num_train_epochs


Training a fully fine-tuned version of the model would take a few hours on a GPU. To save time, download a checkpoint of the fully fine-tuned model to use in the rest of this notebook. This fully fine-tuned model will also be referred to as the **instruct model** in this lab.

### trainer.train()