In [2]:
!pip install --upgrade pip
!pip install datasets



In [4]:
!pip install --upgrade pip
!pip install torch torchaudio torchvision torchdata transformers datasets evaluate rouge_score loralib peft --quiet





In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("knkarthick/dialogsum")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
model_name = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [5]:
index = 200
dialogue = ds["test"][index]["dialogue"]
summary = ds["test"][index]["summary"]
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors="pt")
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)
dashline = '-'.join('' for x in range(100))
print(dashline)
print(f"INPUT PROMPT:\n{prompt}")
print(dashline)
print(f"BASELINE HUMAN SUMMARY:\n{summary}\n")
print(dashline)
print(f"MODEL GENERATION - ZERO SHOT:\n{output}")

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [14]:
prompt = "Cars are a popular mode of transportation that come in various types and styles, including sedans, SUVs, trucks, and sports cars. They provide a convenient way to travel long distances quickly and comfortably. Modern cars are equipped with advanced technology such as GPS navigation, infotainment systems, and safety features like airbags and anti-lock braking systems. The automotive industry is continuously evolving, with a growing focus on electric vehicles and sustainable technologies to reduce environmental impact."
inputs = tokenizer(prompt, return_tensors="pt")
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)
print(output)

The automotive industry is continuously evolving with a growing focus on electric vehicles and sustainable technologies to reduce environmental impact


In [6]:
def tokenized_function(example):
  strat_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary: "
  prompt = [strat_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
  example["input_ids"] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
  example["labels"] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
  return example

tokenized_dataset = ds.map(tokenized_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["id", "topic", "dialogue", "summary",])

Map: 100%|██████████| 500/500 [00:00<00:00, 899.01 examples/s]


In [7]:
tokenized_dataset = tokenized_dataset.filter(lambda example, index: index % 100 == 0, with_indices=True)

Filter: 100%|██████████| 500/500 [00:00<00:00, 1219.11 examples/s]


In [8]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_dataset['train'].shape}")
print(f"Validation: {tokenized_dataset['validation'].shape}")
print(f"Test: {tokenized_dataset['test'].shape}")

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)


In [10]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation']
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
trainer.train()

100%|██████████| 1/1 [13:32<00:00, 812.03s/it]

{'loss': 47.5, 'grad_norm': 414.0, 'learning_rate': 0.0, 'epoch': 0.06}


100%|██████████| 1/1 [13:35<00:00, 815.14s/it]

{'train_runtime': 815.1218, 'train_samples_per_second': 0.01, 'train_steps_per_second': 0.001, 'train_loss': 47.5, 'epoch': 0.06}





TrainOutput(global_step=1, training_loss=47.5, metrics={'train_runtime': 815.1218, 'train_samples_per_second': 0.01, 'train_steps_per_second': 0.001, 'total_flos': 5478058819584.0, 'train_loss': 47.5, 'epoch': 0.0625})

In [13]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


('./dialogue-summary-training-1725456780\\tokenizer_config.json',
 './dialogue-summary-training-1725456780\\special_tokens_map.json',
 './dialogue-summary-training-1725456780\\tokenizer.json')

In [30]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

# Initialize tokenizer
#tokenizer = AutoTokenizer.from_pretrained("D:/LLM/dialogue-summary-training-1725456780")

# Load models
#original_model = AutoModelForSeq2SeqLM.from_pretrained("D:/LLM/dialogue-summary-training-1725456780", torch_dtype=torch.bfloat16)
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("D:/LLM/dialogue-summary-training-1725456780", torch_dtype=torch.bfloat16)

# Sample index
index = 200
dialogue = ds["test"][index]["dialogue"]
summary = ds["test"][index]["summary"]

# Create prompt
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Set generation configuration
gen_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# Generate summaries
original_model_output_ids = original_model.generate(input_ids=input_ids, generation_config=gen_config)[0]
original_model_text_output = tokenizer.decode(original_model_output_ids, skip_special_tokens=True)

instruct_model_output_ids = instruct_model.generate(input_ids=input_ids, generation_config=gen_config)[0]
instruct_model_text_output = tokenizer.decode(instruct_model_output_ids, skip_special_tokens=True)

# Print outputs
print("BASELINE HUMAN SUMMARY:\n", summary, "\n")
print("ORIGINAL MODEL:\n", original_model_text_output)
print("INSTRUCT MODEL:\n", instruct_model_text_output)


BASELINE HUMAN SUMMARY:
 #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system. 

ORIGINAL MODEL:
 #Person1#: I'm thinking of upgrading my computer.
INSTRUCT MODEL:
 #Person1#: I'm thinking of upgrading my computer.


In [33]:
rouge = evaluate.load('rouge')

In [35]:
dialogues = ds["test"][0:10]['dialogue']
human_baseline_summaries = ds["test"][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_outputs = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_outputs)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_outputs = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_outputs)

zipped_summaries = list(zip(human_baseline_summaries,original_model_summaries,instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I need to take a dictation for you.,#Person1#: I need to take a dictation for you.
1,In order to prevent employees from wasting tim...,#Person1#: I need to take a dictation for you.,#Person1#: I need to take a dictation for you.
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1#: I need to take a dictation for you.,#Person1#: I need to take a dictation for you.
3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,The traffic jam at the Carrefour intersection ...,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,"#Person1#: Happy birthday, Brian. #Person2#: I...","#Person1#: Happy birthday, Brian. #Person2#: I..."
