<a href="https://colab.research.google.com/github/SahilSuvarna1023/T5-BART-Summarizer/blob/main/Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install rouge_score
!pip install evaluate
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartForConditionalGeneration, BartTokenizer
import evaluate
from evaluate import load

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [3]:
def clean_dataset(df):
    """Return a cleaned copy of ``df``.

    String cells are stripped of leading/trailing whitespace first; rows
    containing missing values and exact duplicate rows are then removed.
    Trimming before de-duplicating means rows that differ only by
    surrounding whitespace collapse into a single row.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new DataFrame with trimmed strings, no missing values and no
        duplicate rows. The surviving rows keep their original index labels.
    """
    def _strip(value):
        # Only strings are trimmed; every other cell is passed through as-is.
        return value.strip() if isinstance(value, str) else value

    # DataFrame.map exists from pandas 2.1; older releases name it applymap.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    trimmed = elementwise(_strip)
    return trimmed.dropna().drop_duplicates()

In [4]:
def load_dataset(file_path, sample_size=50):
    """Read a CSV of articles, clean it and return a reproducible random sample.

    The CSV must contain an ``article`` column, which is renamed to ``text``.
    The frame is passed through ``clean_dataset`` and then sampled with a
    fixed random seed (42).

    Args:
        file_path: Path passed to ``pandas.read_csv``.
        sample_size: Maximum number of rows to return. When the cleaned data
            has fewer rows, all of them are returned (in shuffled order)
            instead of raising.

    Returns:
        A ``pandas.DataFrame`` with at most ``sample_size`` rows.

    Raises:
        KeyError: If the CSV has no ``article`` column.
    """
    df = pd.read_csv(file_path)
    print("Columns in dataset:", df.columns.tolist())  # Debugging line

    # Ensure correct column name
    if 'article' not in df.columns:
        raise KeyError("Expected column 'article' not found in dataset. Please check column names.")
    df = df.rename(columns={'article': 'text'})

    df = clean_dataset(df)

    # DataFrame.sample raises ValueError when n exceeds the row count (without
    # replacement), so cap the request at what is actually available.
    # None is passed through unchanged to keep pandas' own default behaviour.
    n_rows = sample_size if sample_size is None else min(sample_size, len(df))
    return df.sample(n=n_rows, random_state=42)

In [5]:
def summarize_text_t5(text, model, tokenizer, max_length=150):
    """Summarize ``text`` with the given model/tokenizer pair, using a T5-style prompt.

    The text is prefixed with ``"summarize: "`` before encoding. The tokenizer
    is asked for PyTorch tensors truncated to at most 512 tokens, and the
    model generates with 4 beams, a minimum length of 30 and a length penalty
    of 2.0.

    Args:
        text: Source text to summarize.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_length: Upper bound passed to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encode_options = {"return_tensors": "pt", "max_length": 512, "truncation": True}
    generation_options = {
        "max_length": max_length,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }

    prompt = "summarize: " + text
    token_ids = tokenizer.encode(prompt, **encode_options)
    generated = model.generate(token_ids, **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def summarize_text_bart(text, model, tokenizer, max_length=150):
    """Summarize ``text`` with the given model/tokenizer pair (no task prefix).

    The raw text is encoded as PyTorch tensors truncated to at most 512
    tokens, and the model generates with 4 beams, a minimum length of 30 and
    a length penalty of 2.0.

    Args:
        text: Source text to summarize.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_length: Upper bound passed to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encode_options = {"return_tensors": "pt", "max_length": 512, "truncation": True}
    generation_options = {
        "max_length": max_length,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }

    token_ids = tokenizer.encode(text, **encode_options)
    generated = model.generate(token_ids, **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def evaluate_summaries(references, predictions):
    """Score ``predictions`` against ``references`` with the "rouge" metric.

    The metric is obtained through ``load("rouge")`` on every call.

    Args:
        references: Reference texts, passed to ``compute`` as ``references``.
        predictions: Generated texts, passed to ``compute`` as ``predictions``.

    Returns:
        Whatever the metric's ``compute`` call returns.
    """
    return load("rouge").compute(references=references, predictions=predictions)


In [None]:
def main():
    """Run the end-to-end summarization comparison.

    Loads a 50-row sample from ``/content/test.csv``, generates a summary of
    every article with ``t5-small`` and ``facebook/bart-large-cnn``, prints
    ROUGE scores for both models and writes the sampled frame plus the two
    summary columns to ``/content/summarized_results.csv``.
    """
    # Load dataset (only 50 samples)
    dataset_path = "/content/test.csv"  # Update with correct column names
    df = load_dataset(dataset_path, sample_size=50)

    # Load models
    t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
    t5_tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)
    bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    # Summarize
    df['t5_summary'] = df['text'].apply(lambda x: summarize_text_t5(x, t5_model, t5_tokenizer))
    df['bart_summary'] = df['text'].apply(lambda x: summarize_text_bart(x, bart_model, bart_tokenizer))

    # ROUGE compares a generated summary with a *reference summary*. Scoring
    # against the full article (as was done before) mostly measures how much
    # text was copied, so use the 'highlights' column whenever it is present.
    if 'highlights' in df.columns:
        references = df['highlights'].tolist()
    else:
        print("Warning: column 'highlights' not found; scoring against the source text instead.")
        references = df['text'].tolist()

    # Evaluate both models; previously the BART summaries were never scored.
    for label, column in (("T5", 't5_summary'), ("BART", 'bart_summary')):
        rouge_scores = evaluate_summaries(references, df[column].tolist())
        print(f"{label} ROUGE Scores:", rouge_scores)

    # Save results
    df.to_csv("/content/summarized_results.csv", index=False)
    print("Summarization complete. Results saved.")

# Entry-point guard: call main() only when this code runs as the top-level
# module (executed directly), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Columns in dataset: ['id', 'article', 'highlights']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]