In [1]:
!pip install transformers
!pip install datasets
!pip install rouge_score
!pip install rouge
!pip install nltk
!pip install peft
!pip install bitsandbytes

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b8b5f044b6f00c121df71346ca139a8eeb94f9c4c59ad3a644d8574625ed6c3a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, BitsAndBytesConfig
from rouge import Rouge
from itertools import islice
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

import numpy as np
import torch
import psutil

In [3]:
# Fetch the "knkarthick/dialogsum" dataset (downloaded on first use) and keep all of its splits in `dataset`.
# NOTE(review): trust_remote_code=True allows a loading script from the Hub to run locally;
# the download log for this cell lists only CSV files, so the flag may be unnecessary — confirm before removing.
dataset = load_dataset("knkarthick/dialogsum", trust_remote_code=True)

README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [4]:
# Show the training split's description followed by its row count, one per line.
print(dataset['train'], len(dataset['train']), sep='\n')

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})
12460


In [5]:
# Display the first training record, then its dialogue and its reference summary,
# with one blank line between the three pieces.
print(
    dataset['train'][0],
    dataset['train'][0]['dialogue'],
    dataset['train'][0]['summary'],
    sep='\n\n',
)
                

{'id': 'train_0', 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.", 'summary': "Mr. Smith's 

In [6]:
# Load the pretrained sequence-to-sequence checkpoint and its matching tokenizer.
model_name = 'google/flan-t5-large'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines
# without CUDA instead of failing when the model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage formatted to two
        decimals. Despite the function's name, nothing is printed here; the
        caller decides what to do with the string. A model without any
        parameters reports 0.00% instead of raising ZeroDivisionError.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper only builds the report string, so print it explicitly for the loaded model.
print(print_number_of_trainable_model_parameters(model))

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



trainable model parameters: 783150080
all model parameters: 783150080
percentage of trainable model parameters: 100.00%


In [7]:
import time
def get_memory_usage():
    """Return the current process's resident set size (RSS) in megabytes."""
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_mb

# Initialize timers and counters
batch_times = []  # the timed inference loop appends one wall-clock duration (seconds) per batch
start_time = time.time()  # reference point later subtracted from time.time() to obtain the total elapsed time

# Measure initial memory usage (baseline to compare against the post-inference readings)
initial_ram_usage = get_memory_usage()  # process RSS in MB
initial_gpu_memory = torch.cuda.memory_allocated() / (1024 ** 2)  # Convert bytes to MB

In [None]:
batched_summaries=[]  # generated summaries, appended in the order the examples are processed
example_indices = range(len(dataset['train']))  # one index per row of the training split
batch_size = 32  # number of dialogues sent through the model per generate() call

# Function to yield batches
def batched_indices(iterable, size):
    """Yield consecutive lists of up to `size` items drawn from `iterable`.

    Every list holds `size` items except possibly the last, which holds
    whatever remains. An empty iterable yields nothing.
    """
    source = iter(iterable)
    while True:
        try:
            head = next(source)
        except StopIteration:
            return
        # The head item is already consumed, so pull at most size - 1 more.
        chunk = [head]
        chunk.extend(islice(source, size - 1))
        yield chunk

# Summarize the raw dialogues (no instruction prompt), one batch at a time.
for batch_num, batch in enumerate(batched_indices(example_indices, batch_size)):
    print(batch_num + 1, end=" ")  # lightweight progress indicator

    # Read the whole batch with a single dataset access (a dict of column lists)
    # instead of performing two separate row lookups for every example.
    rows = dataset['train'][batch]
    dialogues = rows['dialogue']
    summaries = rows['summary']

    # Tokenize all dialogues together; padding makes the batch rectangular.
    inputs = tokenizer(dialogues, return_tensors='pt', padding=True, truncation=True).to(device)

    # Hand over the attention mask explicitly so padded positions are ignored
    # by construction rather than relying on generate() to infer a mask.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
    )

    # Decode the whole batch in one call; output order matches input order.
    batched_summaries.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Print every generated summary, one per line.
for generated_summary in batched_summaries:
    print(generated_summary)

In [None]:
# Scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Per-example F1 values, one list per metric
rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
score_lists = {'rouge1': rouge1_scores, 'rouge2': rouge2_scores, 'rougeL': rougeL_scores}

# Pair each prediction with the reference at the same position (zip stops at the
# shorter sequence); the reference is passed to the scorer first.
for pred, ref in zip(batched_summaries, dataset['train']['summary']):
    scores = scorer.score(ref, pred)
    for metric, values in score_lists.items():
        values.append(scores[metric].fmeasure)

# Mean F1 per metric over all scored pairs
avg_rouge1, avg_rouge2, avg_rougeL = (
    np.mean(values) for values in (rouge1_scores, rouge2_scores, rougeL_scores)
)

# Report the averages
for label, value in (
    ("Average ROUGE-1 F1 Score:", avg_rouge1),
    ("Average ROUGE-2 F1 Score:", avg_rouge2),
    ("Average ROUGE-L F1 Score:", avg_rougeL),
):
    print(label, value)


In [None]:
# Total number of examples and batch size
total_examples = 10 # small smoke-test subset; set to len(dataset['train']) to process the whole training split
batch_size = 8  # number of prompts sent through the model per generate() call
dash_line = '-' * 100  # separator string; not referenced anywhere else in this cell
zero_shot_summaries_batch = []  # generated summaries, appended in the order the examples are processed

# Indices of the first `total_examples` rows of the training split
example_indices = range(total_examples)

# Function to yield batches
def batched_indices(iterable, size):
    """Yield consecutive lists of up to `size` items drawn from `iterable`.

    Every list holds `size` items except possibly the last, which holds
    whatever remains. An empty iterable yields nothing.
    """
    source = iter(iterable)
    while True:
        try:
            head = next(source)
        except StopIteration:
            return
        # The head item is already consumed, so pull at most size - 1 more.
        chunk = [head]
        chunk.extend(islice(source, size - 1))
        yield chunk

# Zero-shot summarization: wrap each dialogue in an instruction prompt and generate, one batch at a time.
for batch_num, batch in enumerate(batched_indices(example_indices, batch_size)):
    print(f'{batch_num + 1}')  # progress indicator: 1-based batch number

    # Read the whole batch with a single dataset access (a dict of column lists)
    # instead of performing two separate row lookups for every example.
    rows = dataset['train'][batch]
    dialogues = rows['dialogue']
    summaries = rows['summary']

    # Construct prompts for each dialogue in the batch
    prompts = [f"""
      Summarize the following conversation.

      {dialogue}

      Summary:
    """ for dialogue in dialogues]

    # Tokenize all prompts together; padding makes the batch rectangular.
    # NOTE(review): truncation=True cuts over-long prompts from the end, which is
    # where the "Summary:" cue sits — confirm this is acceptable for long dialogues.
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).to(device)

    # Hand over the attention mask explicitly so padded positions are ignored
    # by construction rather than relying on generate() to infer a mask.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
    )

    # Decode the whole batch in one call; output order matches input order.
    zero_shot_summaries_batch.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Print every generated summary, one per line.
for generated_summary in zero_shot_summaries_batch:
    print(generated_summary)

In [8]:
# Total number of examples and batch size
total_examples = len(dataset['train']) # every row of the training split is processed
batch_size = 32  # number of prompts sent through the model per generate() call
dash_line = '-' * 100  # separator string; not referenced anywhere else in this cell
zero_shot_changed_summaries_batch = []  # generated summaries, appended in the order the examples are processed

# Generate example indices for the full dataset
example_indices = range(total_examples)

# Function to yield batches
def batched_indices(iterable, size):
    """Yield consecutive lists of up to `size` items drawn from `iterable`.

    Every list holds `size` items except possibly the last, which holds
    whatever remains. An empty iterable yields nothing.
    """
    source = iter(iterable)
    while True:
        try:
            head = next(source)
        except StopIteration:
            return
        # The head item is already consumed, so pull at most size - 1 more.
        chunk = [head]
        chunk.extend(islice(source, size - 1))
        yield chunk

# Reset the timing state at the start of the run. Without this, re-executing the
# cell keeps appending to the previous run's batch durations, and the total time
# also counts whatever idle time passed since the timers were first created.
batch_times = []
start_time = time.time()

# Process each batch of indices
for batch_num, batch in enumerate(batched_indices(example_indices, batch_size)):
    batch_start_time = time.time()
    print(batch_num + 1, end=" ")  # lightweight progress indicator

    # Read the whole batch with a single dataset access (a dict of column lists)
    # instead of performing two separate row lookups for every example.
    rows = dataset['train'][batch]
    dialogues = rows['dialogue']
    summaries = rows['summary']

    # Construct prompts for each dialogue in the batch
    prompts = [f"""
      Dialogue

      {dialogue}

      What was going on?
    """ for dialogue in dialogues]

    # Tokenize all prompts together; padding makes the batch rectangular.
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).to(device)

    # Hand over the attention mask explicitly so padded positions are ignored
    # by construction rather than relying on generate() to infer a mask.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,
    )

    # Decode the whole batch in one call; output order matches input order.
    zero_shot_changed_summaries_batch.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    # Record batch processing time (fetch + tokenize + generate + decode)
    batch_times.append(time.time() - batch_start_time)

# Measure final memory usage
final_ram_usage = get_memory_usage()
final_gpu_memory = torch.cuda.memory_allocated() / (1024 ** 2)  # Convert bytes to MB
total_time = time.time() - start_time

# Calculate metrics; an empty run reports 0.0 instead of dividing by zero
average_batch_time = sum(batch_times) / len(batch_times) if batch_times else 0.0

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 

In [9]:
# Emit the resource and timing report, one metric per line.
print(
    f"Initial RAM Usage (MB): {initial_ram_usage:.2f}",
    f"Final RAM Usage (MB): {final_ram_usage:.2f}",
    f"Initial GPU Memory Usage (MB): {initial_gpu_memory:.2f}",
    f"Final GPU Memory Usage (MB): {final_gpu_memory:.2f}",
    f"Average Inference Time per Batch (s): {average_batch_time:.2f}",
    f"Total Inference Time (s): {total_time:.2f}",
    sep="\n",
)

Initial RAM Usage (MB): 1306.48
Final RAM Usage (MB): 1731.53
Initial GPU Memory Usage (MB): 3132.48
Final GPU Memory Usage (MB): 3140.70
Average Inference Time per Batch (s): 5.61
Total Inference Time (s): 2225.87


In [10]:
# Scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Per-example F1 values, one list per metric
rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
score_lists = {'rouge1': rouge1_scores, 'rouge2': rouge2_scores, 'rougeL': rougeL_scores}

# Pair each prediction with the reference at the same position (zip stops at the
# shorter sequence); the reference is passed to the scorer first.
for pred, ref in zip(zero_shot_changed_summaries_batch, dataset['train']['summary']):
    scores = scorer.score(ref, pred)
    for metric, values in score_lists.items():
        values.append(scores[metric].fmeasure)

# Mean F1 per metric over all scored pairs
avg_rouge1, avg_rouge2, avg_rougeL = (
    np.mean(values) for values in (rouge1_scores, rouge2_scores, rougeL_scores)
)

# Report the averages
for label, value in (
    ("Average ROUGE-1 F1 Score:", avg_rouge1),
    ("Average ROUGE-2 F1 Score:", avg_rouge2),
    ("Average ROUGE-L F1 Score:", avg_rougeL),
):
    print(label, value)


Average ROUGE-1 F1 Score: 0.37189147733080397
Average ROUGE-2 F1 Score: 0.16438961586164394
Average ROUGE-L F1 Score: 0.3105265972874096
