In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


# Import Tools

In [2]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load Datasets

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d nileshmalode1/samsum-dataset-text-summarization


Dataset URL: https://www.kaggle.com/datasets/nileshmalode1/samsum-dataset-text-summarization
License(s): unknown
Downloading samsum-dataset-text-summarization.zip to /content
  0% 0.00/7.99M [00:00<?, ?B/s]
100% 7.99M/7.99M [00:00<00:00, 1.00GB/s]


In [4]:
import zipfile
zip_ref = zipfile.ZipFile('/content/samsum-dataset-text-summarization.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [5]:
# Load dataset (example, adjust path as needed)
train_data = pd.read_csv("/content/samsum-train.csv")
validation_data = pd.read_csv("/content/samsum-validation.csv")
test_data = pd.read_csv("/content/samsum-test.csv")

# Display a sample
train_data.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [6]:
train_data.shape

(14732, 3)

In [7]:
validation_data.shape

(818, 3)

In [8]:
test_data.shape

(819, 3)

In [9]:
train_data = train_data.sample(n=4000,random_state=42).reset_index(drop=True)
validation_data = validation_data.sample(n=800, random_state=42).reset_index(drop=True)

# Data Preprocessing

In [10]:
# Clean the text by removing unwanted characters
import re

def clean_text(text):
    text = re.sub(r'\r\n', ' ', text)  # Remove carriage returns and line breaks
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'<.*?>', '', text)  # Remove any XML tags
    text = text.strip().lower()  # Strip and convert to lower case
    return text

# Apply cleaning to dialogue and summary columns
train_data['dialogue'] = train_data['dialogue'].astype('str').apply(clean_text)
train_data['summary'] = train_data['summary'].astype('str').apply(clean_text)

validation_data['dialogue'] = validation_data['dialogue'].astype('str').apply(clean_text)
validation_data['summary'] = validation_data['summary'].astype('str').apply(clean_text)


# Display a sample after cleaning
train_data

Unnamed: 0,id,dialogue,summary
0,13811908,violet: hi! i came across this austin's articl...,violet sent claire austin's article.
1,13716431,pat: so does anyone know when the stream is go...,pat and lou are waiting for the stream but kev...
2,13810214,jane: jane: whaddya think? shona: this ur tin...,jane is updating her tinder profile tonight an...
3,13729823,"adam: do u have a map of paris? tom: yes, why?...",tom has a map of paris.
4,13681400,"frank: hi, how's the family? mike: great! sam'...","mike is happy, because sam's moved out. mike a..."
...,...,...,...
5995,13716770,"sam: so, who's ur favourite superhero? lonnie:...","lonnie's favourite superhero is batman, randy'..."
5996,13715831,geri: where are you? i'm already at the theate...,"geri is already at the theater, amy will be th..."
5997,13816234,joe: do you feel like going to the cinema? kat...,joe and kathy will meet at the cinema at 6 pm ...
5998,13828416,andy: a photo lennie took this morning from ou...,andy took a bus to work and was nearly two hou...


# Tokenization

In [11]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
input_max_len = max(len(tokenizer.encode(text)) for text in train_data['dialogue'])
output_max_len = max(len(tokenizer.encode(text)) for text in train_data['summary'])

input_max_len, output_max_len

Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors


(1224, 103)

In [13]:
# Preprocessing function for tokenization
def preprocess_function(examples):
    # Tokenize the dialogue and summary
    inputs = tokenizer(examples["dialogue"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=200)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply the preprocessing
train_dataset = train_data.apply(preprocess_function, axis=1)
val_dataset = validation_data.apply(preprocess_function, axis=1)

In [14]:
train_dataset[0]

{'input_ids': [25208, 10, 7102, 55, 3, 23, 764, 640, 48, 403, 17, 77, 31, 7, 1108, 11, 3, 23, 816, 24, 25, 429, 253, 34, 1477, 25208, 10, 3, 7997, 15, 10, 7102, 55, 3, 10, 61, 2049, 6, 68, 3, 23, 31, 162, 641, 608, 34, 5, 3, 10, 61, 3, 7997, 15, 10, 68, 2049, 21, 1631, 81, 140, 3, 10, 61, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Fine Tuning Model

In [15]:
pip install --upgrade transformers



In [16]:
!pip install -U transformers --quiet


In [17]:
# Model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
pip install transformers==4.38.0

Collecting transformers==4.38.0
  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.38.0-py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers

## code for the TPU
to process lard dataset

In [19]:
!pip install tensorflow



In [31]:
import os
import tensorflow as tf
from transformers import Trainer, TrainingArguments
from evaluate import load
from transformers import DataCollatorWithPadding

# Ensure TPU runtime is enabled (Runtime > Change runtime type > TPU)
# Verify TPU availability
print("TPU devices:", tf.config.list_physical_devices('TPU'))

# Initialize TPU
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # Auto-detect TPU in Colab
except ValueError:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All TPU devices:", tf.config.list_logical_devices('TPU'))

# Define compute_metrics function for accuracy
def compute_metrics(eval_pred):
    metric = load("accuracy")
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Convert datasets to tf.data.Dataset for TPU compatibility
def convert_to_tf_dataset(dataset):
    def gen():
        for example in dataset:
            yield {
                'input_ids': example['input_ids'],
                'attention_mask': example['attention_mask'],
                'labels': example['labels']
            }
    return tf.data.Dataset.from_generator(
        gen,
        output_types={'input_ids': tf.int32, 'attention_mask': tf.int32, 'labels': tf.int32},
        output_shapes={'input_ids': [None], 'attention_mask': [None], 'labels': [None]}
    ).cache().prefetch(tf.data.AUTOTUNE)

# Assume tokenizer is defined (replace with your tokenizer)
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained('t5-base')  # Example
train_dataset = convert_to_tf_dataset(train_dataset)
val_dataset = convert_to_tf_dataset(val_dataset)

# Define data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments with TPU optimization
training_args = TrainingArguments(
    output_dir="./results",                 # Local storage in Colab
    num_train_epochs=10,                    # 10 epochs
    per_device_train_batch_size=8,          # TPU-friendly batch size
    per_device_eval_batch_size=8,           # Evaluation batch size
    eval_strategy="epoch",                  # Evaluate at the end of each epoch
    save_strategy="epoch",                  # Save at the end of each epoch
    save_total_limit=1,                     # Keep only the latest checkpoint
    load_best_model_at_end=True,            # Load the best model at the end
    metric_for_best_model="eval_accuracy",  # Correct metric name
    greater_is_better=True,                 # Higher accuracy is better
    logging_steps=10,                       # Log every 10 steps
    fp16=True,                              # Mixed precision with bfloat16 for TPUs
    gradient_accumulation_steps=4,          # Effective batch size of 32 (8×4)
    gradient_checkpointing=True,            # Save memory
    tpu_num_cores=8,                        # Use all 8 cores of TPU v2-8
)

# Trainer setup with TPUStrategy
strategy = tf.distribute.TPUStrategy(resolver)
with strategy.scope():
    # Ensure model is TPU-compatible (replace with your model)
    # from transformers import T5ForConditionalGeneration
    # model = T5ForConditionalGeneration.from_pretrained('t5-base')
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

# Train the model
trainer.train()

TPU devices: []


KeyError: 'COLAB_TPU_ADDR'

In [30]:
import tensorflow as tf
print("TPU devices:", tf.config.list_physical_devices('TPU'))

TPU devices: []


In [29]:
!pip install --upgrade tensorflow



# code for T4 GPU
can be usable for smaller dataset

In [20]:
import os
import torch
from transformers import Trainer, TrainingArguments
from evaluate import load

# Clear GPU memory before training
torch.cuda.empty_cache()

# Set environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Define compute_metrics function for accuracy
def compute_metrics(eval_pred):
    metric = load("accuracy")
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define training arguments with GPU optimization
training_args = TrainingArguments(
    output_dir="./results",                 # Directory to save model checkpoints
    num_train_epochs=10,                    # 10 epochs
    per_device_train_batch_size=4,          # Reduced batch size to avoid OOM
    per_device_eval_batch_size=4,           # Reduced batch size for evaluation
    eval_strategy="epoch",                  # Evaluate at the end of each epoch
    save_strategy="epoch",                  # Save at the end of each epoch
    save_total_limit=1,                     # Keep only the latest checkpoint
    load_best_model_at_end=True,            # Load the best model at the end
    metric_for_best_model="eval_accuracy",  # Correct metric name
    greater_is_better=True,                 # Higher accuracy is better
    logging_steps=10,                       # Log every 10 steps
    fp16=True,                              # Enable mixed precision training
    tf32=False,                             # Disable TF32 (incompatible GPU)
    gradient_accumulation_steps=8,          # Accumulate gradients for effective batch size of 32 (4×8)
    gradient_checkpointing=True,            # Enable gradient checkpointing to save memory
)

# Trainer setup
trainer = Trainer(
    model=model,                            # Your model (e.g., BART, T5)
    args=training_args,
    train_dataset=train_dataset,            # Your training dataset
    eval_dataset=val_dataset,               # Your validation dataset
    compute_metrics=compute_metrics,        # Compute accuracy during evaluation
)

# Train the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrazakhanzada100[0m ([33mrazakhanzada100-mehran-university-of-engineering-and-tec[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.84 GiB. GPU 0 has a total capacity of 14.74 GiB of which 5.84 GiB is free. Process 2229 has 8.90 GiB memory in use. Of the allocated memory 6.77 GiB is allocated by PyTorch, and 1.99 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Save and load model

In [None]:
# Save the fine-tuned model
model.save_pretrained("./saved_summary_model")
tokenizer.save_pretrained("./saved_summary_model")

('./saved_summary_model/tokenizer_config.json',
 './saved_summary_model/special_tokens_map.json',
 './saved_summary_model/spiece.model',
 './saved_summary_model/added_tokens.json')

In [None]:
# Load the saved model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./saved_summary_model")
tokenizer = T5Tokenizer.from_pretrained("./saved_summary_model")

# Summarization System

In [None]:
# Ensure the model is on the correct device (GPU if available)
device = model.device  # Get the device the model is on

def summarize_dialogue(dialogue):
    dialogue = clean_text(dialogue)  # Assuming clean_text is defined
    inputs = tokenizer(dialogue, return_tensors="pt", truncation=True, padding="max_length", max_length=512)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate summary
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

In [None]:
# Test with a sample input
sample_dialogue = """
Violet: Hey Claire! I was reading an article about Austin and thought you might find it interesting!
Violet: It's about the current trends in urban development and how cities are planning for the future.
Violet: Here, let me share the link: <file_other>
Claire: Oh wow, that sounds like an insightful read. But I've actually already read that one last week.
Claire: It was really interesting though, especially the part about sustainable architecture in cities.
Claire: You know, I've been following these urban planning discussions for a while now.
Violet: Oh, I didn’t know that! Well, I’ll look for something else then, maybe something about eco-friendly cities or tech innovations.
Claire: That would be awesome! Let me know if you find something cool.
Violet: Sure, I’ll keep you posted. Thanks for the feedback!
"""

summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)


Summary: claire was reading an article about austin and thought she might find it interesting. violet has already read that one last week.


In [None]:
# Test with a dialogue on a different topic
sample_dialogue = """
John: Hey Sarah, have you seen the latest tech gadget reviews? I found this new smartwatch that's supposed to have amazing health tracking features.
John: It tracks heart rate, blood oxygen levels, sleep patterns, and even stress levels! It sounds like something right up your alley.
Sarah: That sounds really interesting! But I’ve been trying to cut down on tech distractions. I’ve heard these devices can be really overwhelming sometimes.
Sarah: I do think it’s cool that they can track so many health metrics though. I’m curious how accurate they really are.
John: Yeah, me too! There are also some new smartphones coming out with even better cameras and longer battery life. The new flagship model from XYZ brand has some insane specs.
Sarah: Ooh, I haven’t kept up with phones recently, but I’ve heard the camera quality is getting ridiculously good. It’s almost like a professional camera in your pocket now!
Sarah: Still, I feel like I’m fine with my current phone for now. I don’t really feel the need to upgrade unless something really groundbreaking comes out.
John: Totally understand that. It’s the same with me. But I think the battery life improvements are enough to make me consider it. I hate running out of battery when I’m out and about.
Sarah: That’s fair! I’m always worried about battery life too. Honestly, I think phones should last at least two full days on a single charge by now.
John: I agree! It’s so annoying when your phone dies in the middle of the day. I wonder if we’ll ever get to a point where we don’t have to charge our phones every day.
Sarah: That would be amazing! I think as tech improves, battery tech might also catch up. Let’s hope the next generation of phones can last longer!
"""

summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)


Summary: sarah has seen the latest tech gadget reviews. john found this smartwatch that's supposed to have amazing health tracking features. there are also new smartphones coming out with even better cameras and longer battery life.


In [None]:
# Test with a dialogue on a current news topic
sample_dialogue = """
Reporter: In today's news, the latest climate change report reveals alarming global temperature rises. According to the Intergovernmental Panel on Climate Change (IPCC), the Earth’s temperature is on track to rise by 1.5°C within the next two decades.
Reporter: This is expected to lead to more frequent and severe heatwaves, flooding, and extreme weather events. Coastal cities are at particular risk due to rising sea levels.
Expert: The report emphasizes that immediate action is needed to prevent catastrophic consequences. We need to significantly reduce carbon emissions and transition to renewable energy sources.
Expert: If global temperatures increase by more than 1.5°C, we could face irreversible damage to ecosystems, agriculture, and water supply. It will have a devastating impact on biodiversity as well.
Reporter: The IPCC also stresses the importance of individual action. Governments must set stronger policies, but individuals can help by reducing waste, conserving water, and supporting green initiatives.
Expert: It's not just about the big changes; small actions like using public transportation, reducing meat consumption, and recycling can collectively make a significant difference.
Reporter: With the next UN Climate Summit coming up next month, world leaders will need to prioritize climate action. The stakes have never been higher for our planet’s future.
"""

summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)


Summary: the latest climate change report reveals alarming global temperature rises. the earth’s temperature is on track to rise by 1.5°c within the next two decades. experts urges governments to set stronger policies, but individuals can help by reducing waste, conserving water, and supporting green initiatives.


# Download Mode To Your Machine

In [None]:
import shutil

# Path to the directory containing the fine-tuned model
model_dir = "./saved_summary_model"

# Output zip file path
output_zip_path = "saved_summary_model.zip"

# Create a zip archive
shutil.make_archive(base_name="saved_summary_model", format="zip", root_dir=model_dir)

'/kaggle/working/saved_summary_model.zip'

In [None]:
from IPython.display import FileLink

# Display a download link
FileLink(output_zip_path)