<a href="https://colab.research.google.com/github/SahilSuvarna1023/T5-BART-Summarizer/blob/main/Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install rouge_score
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartForConditionalGeneration, BartTokenizer
import evaluate
from evaluate import load



In [26]:
def clean_dataset(df):
    # Drop missing values
    df = df.dropna()
    # Remove duplicates
    df = df.drop_duplicates()
    # Trim whitespace
    df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
    return df

In [27]:
def load_dataset(file_path, sample_size=50):
    df = pd.read_csv(file_path)
    print("Columns in dataset:", df.columns.tolist())  # Debugging line

    # Ensure correct column name
    if 'article' in df.columns:
        df = df.rename(columns={'article': 'text'})
    else:
        raise KeyError("Expected column 'article' not found in dataset. Please check column names.")

    df = clean_dataset(df)
    df = df.sample(n=sample_size, random_state=42)  # Select 50 random samples
    return df

In [28]:
def summarize_text_t5(text, model, tokenizer, max_length=150):
    input_text = "summarize: " + text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def summarize_text_bart(text, model, tokenizer, max_length=150):
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def evaluate_summaries(references, predictions):
    rouge = load("rouge")
    results = rouge.compute(predictions=predictions, references=references)
    return results


In [29]:
def main():
    # Load dataset (only 50 samples)
    dataset_path = "/content/test.csv"  # Update with correct column names
    df = load_dataset(dataset_path, sample_size=50)

    # Load models
    t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
    t5_tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)
    bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    # Summarize and evaluate
    df['t5_summary'] = df['text'].apply(lambda x: summarize_text_t5(x, t5_model, t5_tokenizer))
    df['bart_summary'] = df['text'].apply(lambda x: summarize_text_bart(x, bart_model, bart_tokenizer))

    # Evaluate with Rouge
    rouge_scores = evaluate_summaries(df['text'].tolist(), df['t5_summary'].tolist())
    print("ROUGE Scores:", rouge_scores)

    # Save results
    df.to_csv("/content/summarized_results.csv", index=False)
    print("Summarization complete. Results saved.")

if __name__ == "__main__":
    main()

Columns in dataset: ['id', 'article', 'highlights']
ROUGE Scores: {'rouge1': 0.16387567273483772, 'rouge2': 0.1414180912330464, 'rougeL': 0.15038435459887795, 'rougeLsum': 0.15125620515092308}
Summarization complete. Results saved.
