In [1]:
import pandas as pd
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer 
from transformers import Trainer
from transformers import TrainingArguments

import re

In [2]:
df_train = r"/kaggle/input/samsum-dataset-text-summarization/archive/samsum-train.csv"
df_validation = r"/kaggle/input/samsum-dataset-text-summarization/archive/samsum-validation.csv"

df_train = pd.read_csv(df_train)
df_validation = pd.read_csv(df_validation)


df_train.head(10)

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."
5,13716343,"Neville: Hi there, does anyone remember what d...",Wyatt reminds Neville his wedding anniversary ...
6,13611672,John: Ave. Was there any homework for tomorrow...,John didn't show up for class due to some work...
7,13730463,Sarah: I found a song on youtube and I think y...,Sarah sends James an instrumental song he migh...
8,13809976,Noah: When and where are we meeting? :)\r\nMad...,"Noah wants to meet, he quit his job, because h..."
9,13809912,Matt: Do you want to go for date?\r\nAgnes: Wo...,Matt invites Agnes for a date to get to know e...


In [3]:
df_train.shape

(14732, 3)

In [4]:
df_validation.head(10)

Unnamed: 0,id,dialogue,summary
0,13817023,"A: Hi Tom, are you busy tomorrow’s afternoon?\...",A will go to the animal shelter tomorrow to ge...
1,13716628,Emma: I’ve just fallen in love with this adven...,Emma and Rob love the advent calendar. Lauren ...
2,13829420,Jackie: Madison is pregnant\r\nJackie: but she...,Madison is pregnant but she doesn't want to ta...
3,13819648,Marla: <file_photo>\r\nMarla: look what I foun...,Marla found a pair of boxers under her bed.
4,13728448,Robert: Hey give me the address of this music ...,Robert wants Fred to send him the address of t...
5,13814197,"Keith: Meg, pls buy some milk and cereals, I s...",Megan needn't buy milk and cereals. They're in...
6,13820419,Samantha: <file_video>\r\nEvelyn: LOL\r\nHolly...,Samantha and Evelyn after watching the video c...
7,13864382,Theresa: have you been at Tom's new place?\nLu...,Tom's new place is in Fiesole. Luis and Marion...
8,13729454,"Jane: Hello\r\nVegano Resto: Hello, how may I ...",Jane made a 9 PM reservation for 6 people toni...
9,13810148,"Nancy: Howdy, how y'all doin'?\r\nTina: Is tha...","Nancy's working in Texas, but the kids laugh a..."


In [5]:
df_validation.shape

(818, 3)

In [6]:
df_train = df_train.sample(n=5000 , random_state=42).reset_index(drop = True)
df_validation = df_validation.sample(n=500 , random_state=42).reset_index(drop = True)

In [7]:
df_train.shape

(5000, 3)

In [8]:
df_validation.shape

(500, 3)

# Data Preprocessing

In [9]:
def preprocessing(text) :
    text = re.sub(r"\r\n" , " " , text)
    text = re.sub(r"\s+" , " " , text)
    text = re.sub(r"<.*?>" , "" , text)
    text = text.strip().lower()
    return text

df_train["dialogue"] = df_train["dialogue"].apply(preprocessing)
df_train["summary"] = df_train["summary"].apply(preprocessing)


df_validation["dialogue"] = df_validation["dialogue"].apply(preprocessing)
df_validation["summary"] = df_validation["summary"].apply(preprocessing)



In [10]:
df_train.head(10)

Unnamed: 0,id,dialogue,summary
0,13811908,violet: hi! i came across this austin's articl...,violet sent claire austin's article.
1,13716431,pat: so does anyone know when the stream is go...,pat and lou are waiting for the stream but kev...
2,13810214,jane: jane: whaddya think? shona: this ur tin...,jane is updating her tinder profile tonight an...
3,13729823,"adam: do u have a map of paris? tom: yes, why?...",tom has a map of paris.
4,13681400,"frank: hi, how's the family? mike: great! sam'...","mike is happy, because sam's moved out. mike a..."
5,13716070,paul: lucky you! john: ? pete: our classess ha...,"john, pete and paul's classes have been cancel..."
6,13727976,jasper: i miss you so much already :( karen: i...,karen will be back on sunday. karen and jasper...
7,13681231,ken: how long do you need? jude: i think about...,ken will wait inside as jude needs 10 more min...
8,13862652,"victoria: hey, i am in the toilet...and.. skyl...",victoria is in a restaurant toilet and texts s...
9,13728508,sandra: do u need any help with the party tomo...,ronda does not need any help with the party to...


# Tokenization

In [12]:
tok = T5Tokenizer.from_pretrained("t5-small")

In [15]:
df_train["dialogue"].apply(lambda x: len(tok.encode(x))).max()

1224

In [16]:
def preprocessing_fun(examples) :
    inputs = tok(examples["dialogue"] , padding="max_length" , truncation=True , max_length=512)
    targets = tok(examples["summary"] , padding="max_length" , truncation=True , max_length=150)
    inputs["labels"] = targets["input_ids"]

    return inputs


df_train = df_train.apply(preprocessing_fun , axis=1)
df_validation = df_validation.apply(preprocessing_fun , axis=1)


In [17]:
df_train[0]

{'input_ids': [25208, 10, 7102, 55, 3, 23, 764, 640, 48, 403, 17, 77, 31, 7, 1108, 11, 3, 23, 816, 24, 25, 429, 253, 34, 1477, 25208, 10, 3, 7997, 15, 10, 7102, 55, 3, 10, 61, 2049, 6, 68, 3, 23, 31, 162, 641, 608, 34, 5, 3, 10, 61, 3, 7997, 15, 10, 68, 2049, 21, 1631, 81, 140, 3, 10, 61, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## Fine Tuning Model

In [22]:
tra_arg = TrainingArguments(
    output_dir="./results",
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    save_steps=500 , 
    eval_steps=50,
    evaluation_strategy="epoch"
)


trainer = Trainer(
    model=model , 
    args=tra_arg ,
    train_dataset=df_train , 
    eval_dataset=df_validation
)


trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111367910000101, max=1.0)…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.445256
2,3.480400,0.366183
3,3.480400,0.356645
4,0.389800,0.352384
5,0.370000,0.349549
6,0.370000,0.347588
7,0.357700,0.346919
8,0.354400,0.346749


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=2504, training_loss=0.9894023570485009, metrics={'train_runtime': 1286.8506, 'train_samples_per_second': 31.084, 'train_steps_per_second': 1.946, 'total_flos': 5413672058880000.0, 'train_loss': 0.9894023570485009, 'epoch': 8.0})

# Save and load model

In [23]:
model.save_pretrained("./saved_summary_model")
tok.save_pretrained("./saved_summary_model")

('./saved_summary_model/tokenizer_config.json',
 './saved_summary_model/special_tokens_map.json',
 './saved_summary_model/spiece.model',
 './saved_summary_model/added_tokens.json')

In [24]:
model = T5ForConditionalGeneration.from_pretrained("./saved_summary_model")
tok = T5Tokenizer.from_pretrained("./saved_summary_model")

## Summarization System

In [27]:
device = model.device

def summary_inputs(texts) :
    texts = preprocessing(texts)
    inputs = tok(texts , return_tensors="pt" , truncation=True , padding="max_length" , max_length=512)

    inputs = {k : v.to(device) for k , v in inputs.items()}

    outputs = model.generate(
        inputs["input_ids"] , 
        max_length=150 , 
        num_beams = 4 , 
        early_stopping = True
    )

    summary = tok.decode(outputs[0], skip_special_tokens=True)
    return summary

In [28]:
text = """
Violet: Hey Claire! I was reading an article about Austin and thought you might find it interesting! 
Violet: It's about the current trends in urban development and how cities are planning for the future.
Violet: Here, let me share the link: <file_other>
Claire: Oh wow, that sounds like an insightful read. But I've actually already read that one last week. 
Claire: It was really interesting though, especially the part about sustainable architecture in cities. 
Claire: You know, I've been following these urban planning discussions for a while now.
Violet: Oh, I didn’t know that! Well, I’ll look for something else then, maybe something about eco-friendly cities or tech innovations.
Claire: That would be awesome! Let me know if you find something cool.
Violet: Sure, I’ll keep you posted. Thanks for the feedback!
"""

summary = summary_inputs(texts=text)
print(summary)

violet was reading an article about austin and thought it might be interesting. he's already read that one last week. he's been following urban planning discussions for a while.


In [33]:
ss = """
John: Hey Sarah, have you seen the latest tech gadget reviews? I found this new smartwatch that's supposed to have amazing health tracking features.
John: It tracks heart rate, blood oxygen levels, sleep patterns, and even stress levels! It sounds like something right up your alley. 
Sarah: That sounds really interesting! But I’ve been trying to cut down on tech distractions. I’ve heard these devices can be really overwhelming sometimes.
Sarah: I do think it’s cool that they can track so many health metrics though. I’m curious how accurate they really are.
John: Yeah, me too! There are also some new smartphones coming out with even better cameras and longer battery life. The new flagship model from XYZ brand has some insane specs.
Sarah: Ooh, I haven’t kept up with phones recently, but I’ve heard the camera quality is getting ridiculously good. It’s almost like a professional camera in your pocket now!
Sarah: Still, I feel like I’m fine with my current phone for now. I don’t really feel the need to upgrade unless something really groundbreaking comes out.
John: Totally understand that. It’s the same with me. But I think the battery life improvements are enough to make me consider it. I hate running out of battery when I’m out and about.
Sarah: That’s fair! I’m always worried about battery life too. Honestly, I think phones should last at least two full days on a single charge by now.
John: I agree! It’s so annoying when your phone dies in the middle of the day. I wonder if we’ll ever get to a point where we don’t have to charge our phones every day.
Sarah: That would be amazing! I think as tech improves, battery tech might also catch up. Let’s hope the next generation of phones can last longer!
"""

summary = summary_inputs(texts=ss)
print(summary)

sarah has seen the latest tech gadget reviews. john found this smartwatch that's supposed to have amazing health tracking features. it tracks heart rate, blood oxygen levels, sleep patterns, and even stress levels. there are also new smartphones coming out with even better cameras and longer battery life.


In [34]:
cc = """
Reporter: In today's news, the latest climate change report reveals alarming global temperature rises. According to the Intergovernmental Panel on Climate Change (IPCC), the Earth’s temperature is on track to rise by 1.5°C within the next two decades.
Reporter: This is expected to lead to more frequent and severe heatwaves, flooding, and extreme weather events. Coastal cities are at particular risk due to rising sea levels.
Expert: The report emphasizes that immediate action is needed to prevent catastrophic consequences. We need to significantly reduce carbon emissions and transition to renewable energy sources.
Expert: If global temperatures increase by more than 1.5°C, we could face irreversible damage to ecosystems, agriculture, and water supply. It will have a devastating impact on biodiversity as well.
Reporter: The IPCC also stresses the importance of individual action. Governments must set stronger policies, but individuals can help by reducing waste, conserving water, and supporting green initiatives.
Expert: It's not just about the big changes; small actions like using public transportation, reducing meat consumption, and recycling can collectively make a significant difference.
Reporter: With the next UN Climate Summit coming up next month, world leaders will need to prioritize climate action. The stakes have never been higher for our planet’s future.
"""

summary = summary_inputs(texts=cc)
print(summary)

the latest climate change report reveals alarming global temperature rises. the earth’s temperature is on track to rise by 1.5°c within the next two decades. report stresses the importance of individual action.
