In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
# importing the required data

train_data=pd.read_csv("/content/drive/MyDrive/conversation_summarization/samsum-train.csv")
test_data=pd.read_csv("/content/drive/MyDrive/conversation_summarization/samsum-test.csv")

train_data.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

In [6]:
print(train_data.shape)
print(test_data.shape)


(14732, 3)
(819, 3)


In [7]:
# lets say we want only 5000 samples for the training
train_data=train_data.sample(n=5000,random_state=42).reset_index(drop=True)

# consider lets say we need only 500
test_data=test_data.sample(n=500,random_state=42).reset_index(drop=True)

# new size of data
print(train_data.shape)
print(test_data.shape)

(5000, 3)
(500, 3)


In [8]:
# the dialogue is not clean, we need to text preprocessing
import re

def clean_text(text):
    # we can make lower case as well
    # to remove \r or \n etc.
    text = re.sub(r'\r\n',' ',text)
    # remove extra spaces
    text = re.sub(r'\s+',' ',text)
    # remove HTMl tags
    text = re.sub(r'<.*?>','',text)
    # tokens
    text = text.strip().lower()
    return text

# applying cleanign text for test and training data as well
train_data['dialogue'] = train_data['dialogue'].apply(clean_text)
train_data['summary'] = train_data['summary'].apply(clean_text)

test_data['dialogue'] = test_data['dialogue'].apply(clean_text)
test_data['summary'] = test_data['summary'].apply(clean_text)


Tokenizer

In [9]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
# dialogue and summary are in text convert into number
def tokenize_function(examples):
    inputs = tokenizer(examples['dialogue'],padding='max_length',truncation=True,max_length=512)
    targets = tokenizer(examples['summary'],padding='max_length',truncation=True, max_length=512)

# here inputs and targets have input_ids, attention_mask,we create new column 'lables' amd store inout_ids of target
    inputs['labels']=targets['input_ids']
    return inputs

# after applying preprocessing
train_dataset = train_data.apply(tokenize_function,axis=1)
test_dataset = test_data.apply(tokenize_function,axis=1)


In [13]:
print(train_dataset[0])

{'input_ids': [25208, 10, 7102, 55, 3, 23, 764, 640, 48, 403, 17, 77, 31, 7, 1108, 11, 3, 23, 816, 24, 25, 429, 253, 34, 1477, 25208, 10, 3, 7997, 15, 10, 7102, 55, 3, 10, 61, 2049, 6, 68, 3, 23, 31, 162, 641, 608, 34, 5, 3, 10, 61, 3, 7997, 15, 10, 68, 2049, 21, 1631, 81, 140, 3, 10, 61, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Fine Tuning Model

In [14]:
# load the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
training_args = TrainingArguments(
    output_dir="./results",             # where to save checkpoints
    num_train_epochs=3,                 # fewer epochs (adjust if needed)
    per_device_train_batch_size=8,      # batch size per device
    per_device_eval_batch_size=8,       # eval batch size
    logging_dir="./logs",               # logs directory
    logging_steps=100,                  # log every 100 steps
    save_strategy="epoch",              # save at end of each epoch        # evaluate at end of each epoch
    report_to="none"                    # ✅ disable wandb/logging issues
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,        # your dataset
    eval_dataset=test_dataset     # your validation dataset
)
# Train
trainer.train()


Step,Training Loss
100,2.7237
200,0.1676
300,0.1415
400,0.1271
500,0.1238
600,0.122
700,0.119
800,0.1195
900,0.1155
1000,0.1184


TrainOutput(global_step=1875, training_loss=0.25971559346516926, metrics={'train_runtime': 1249.8757, 'train_samples_per_second': 12.001, 'train_steps_per_second': 1.5, 'total_flos': 2030127022080000.0, 'train_loss': 0.25971559346516926, 'epoch': 3.0})

Saving and Loading the model

In [19]:
model.save_pretrained("./saved_summary_model")
tokenizer.save_pretrained("./saved_summary_model")

('./saved_summary_model/tokenizer_config.json',
 './saved_summary_model/special_tokens_map.json',
 './saved_summary_model/spiece.model',
 './saved_summary_model/added_tokens.json')

In [20]:
# loding tje modle and tokenizer

model= T5ForConditionalGeneration.from_pretrained("./saved_summary_model")
tokenizer=T5Tokenizer.from_pretrained("./saved_summary_model")

Summarization System

In [21]:
# Ensure the model is on the correct device(is GPU is available)
device=model.device

def summarize_dialogue(dialogue):
    dialogue=clean_text(dialogue) # first is cleaning the text
    # next setep is to toknize the text
    inputs=tokenizer(dialogue,return_tensors="pt",truncation=True,padding="max_length",max_length=512)
    # move tensors to same device as the model
    inputs = {key : value.to(device) for key, value in inputs.items()}
    # generate the summary
    outputs=model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=4,
        early_stopping=True
    )
    # Decode the generated summary
    summary=tokenizer.decode(outputs[0],skip_special_tokens=True)
    return summary






In [22]:

sample_dialogue = """
John: Hey Sarah, have you seen the latest tech gadget reviews? I found this new smartwatch that's supposed to have amazing health tracking features.
John: It tracks heart rate, blood oxygen levels, sleep patterns, and even stress levels! It sounds like something right up your alley.
Sarah: That sounds really interesting! But I’ve been trying to cut down on tech distractions. I’ve heard these devices can be really overwhelming sometimes.
Sarah: I do think it’s cool that they can track so many health metrics though. I’m curious how accurate they really are.
John: Yeah, me too! There are also some new smartphones coming out with even better cameras and longer battery life. The new flagship model from XYZ brand has some insane specs.
Sarah: Ooh, I haven’t kept up with phones recently, but I’ve heard the camera quality is getting ridiculously good. It’s almost like a professional camera in your pocket now!
Sarah: Still, I feel like I’m fine with my current phone for now. I don’t really feel the need to upgrade unless something really groundbreaking comes out.
John: Totally understand that. It’s the same with me. But I think the battery life improvements are enough to make me consider it. I hate running out of battery when I’m out and about.
Sarah: That’s fair! I’m always worried about battery life too. Honestly, I think phones should last at least two full days on a single charge by now.
John: I agree! It’s so annoying when your phone dies in the middle of the day. I wonder if we’ll ever get to a point where we don’t have to charge our phones every day.
Sarah: That would be amazing! I think as tech improves, battery tech might also catch up. Let’s hope the next generation of phones can last longer !"""

summary = summarize_dialogue(sample_dialogue)
print("Summary :",summary)


Summary : sarah has seen the latest tech gadget reviews. john found this smartwatch that's supposed to have amazing health tracking features. there are also some new smartphones coming out with even better cameras and longer battery life.


In [23]:
sample_dialogue="""Alex: Hey! Are you coming to the team lunch tomorrow?
Sam: I was planning to, but I have a massive report due at 3 PM. I don't think I can make it.
Alex: Oh, come on! It’s just for an hour. You can finish the report after.
Sam: I wish, but Sarah wants to review the final draft at 2. I might just grab something quick at my desk.
Alex: That's no fun. How about this—I’ll come to your desk around 11, we can brainstorm the conclusion for 15 minutes, and then you can take a real lunch break?
Sam: That actually helps a lot. If we finish the conclusion early, I might have time.
Alex: Perfect. See you at 11 then!
Sam: Thanks, Alex. See you. """

summary = summarize_dialogue(sample_dialogue)
print(summary)

alex will come to the team lunch tomorrow at 3 pm. sarah wants to review the final draft at 2.


In [24]:
import shutil
model_dir="./saved_summary_model"
output_zip_path='saved_summary_model.zip'

shutil.make_archive(base_name="saved_summary_model",format="zip",root_dir=model_dir)


'/content/saved_summary_model.zip'