# Text Summarization System
## Task 2: CNN/Daily Mail Dataset
**Objective**: Create extractive and abstractive summarization models

## 1. Setup Environment

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install required libraries
!pip install nltk spacy
!python -m spacy download en_core_web_sm  # Use small model for faster processing

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Fetch the NLTK data packages this notebook relies on
import nltk

for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)
# Kept as the final expression so its status is still shown as the cell output
nltk.download('punkt_tab')  # Fixes the "punkt_tab" error

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## 2. Load Dataset from Zip File

In [None]:
# Import libraries
import json
import os
import zipfile

import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
# Path to the dataset zip archive inside the mounted Google Drive
# (the archive is opened from this location by the extraction cell)
zip_path = '/content/drive/MyDrive/archive (6).zip'  # 👈 Update this path!

In [None]:
# Unpack the archive into a local working directory
extract_dir = '/content/cnndm_dataset'
with zipfile.ZipFile(zip_path) as archive:  # read mode is the default
    archive.extractall(extract_dir)

# List the top-level entries to confirm the extraction worked
print("Extracted Files:")
print(os.listdir(extract_dir))

Extracted Files:
['cnn_dailymail']


In [None]:
# Read the training split into a DataFrame and preview it
dataset_path = '/content/cnndm_dataset/cnn_dailymail/train.csv'  # 👈 Update if filenames differ
df = pd.read_csv(dataset_path)
print(f"Dataset Shape: {df.shape}")
df.head()

Dataset Shape: (287113, 3)


Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


## 3. Preprocess Textual Data (Fixed)

In [None]:
# Load the small spaCy model with the components below switched off (faster),
# then add a sentencizer for sentence splitting.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook —
# confirm it is still needed.
disabled_components = ['ner', 'parser', 'lemmatizer', 'tagger']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7a36e01c2f90>

In [None]:
# Preprocessing function
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on the first call only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Clean text for summarization.

    The text is split into sentences with NLTK. Sentences of 15 characters
    or fewer are dropped. In the remaining sentences every
    whitespace-separated word is lowercased, and words that are English stop
    words or are one or two characters long are removed. Punctuation attached
    to a word is left in place.

    Args:
        text: Raw text. Any value is converted with ``str()``; only the first
            100,000 characters are processed.

    Returns:
        The cleaned sentences joined by single spaces, or ``""`` if an
        exception occurred (the error message is printed).
    """
    try:
        # Limit text length to prevent memory issues
        text = str(text)[:100000]  # Process first 100k characters (reduce if needed)

        # Tokenize sentences using NLTK (faster than spaCy for large datasets)
        sentences = sent_tokenize(text)

        # Remove short sentences and stopwords
        min_length = 15
        # Reuse the cached set instead of rebuilding it for every document
        stop_words = _english_stop_words()
        cleaned = [
            ' '.join([word.lower() for word in sent.split()
                     if word.lower() not in stop_words and len(word) > 2])
            for sent in sentences if len(sent) > min_length
        ]
        return ' '.join(cleaned)
    except Exception as e:
        # Best effort: report the problem and keep the batch going
        print(f"Error processing text: {str(e)}")
        return ""

In [None]:
# Apply preprocessing in batches
def preprocess_batch(df, column, batch_size=1000):
    """Run ``preprocess_text`` over one DataFrame column, ``batch_size`` rows at a time.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to clean.
        batch_size: Number of rows handled per progress-bar step.

    Returns:
        A list with one cleaned string per row, in row order.
    """
    values = df[column]
    cleaned_texts = []
    for start in tqdm(range(0, len(df), batch_size)):
        stop = start + batch_size
        for raw_text in values.iloc[start:stop].tolist():
            cleaned_texts.append(preprocess_text(raw_text))
    return cleaned_texts

In [None]:
# Clean both text columns batch by batch
BATCH_SIZE = 1000  # Adjust based on Colab's memory
column_pairs = (
    ('article', 'cleaned_article'),
    ('highlights', 'cleaned_highlights'),
)
for raw_column, cleaned_column in column_pairs:
    df[cleaned_column] = preprocess_batch(df, raw_column, BATCH_SIZE)

# Persist the intermediate frame to Drive so the work is not lost
preprocessed_csv = '/content/drive/MyDrive/preprocessed_data.csv'
df.to_csv(preprocessed_csv, index=False)
print("Preprocessing completed and results saved!")

100%|██████████| 288/288 [05:42<00:00,  1.19s/it]
100%|██████████| 288/288 [00:55<00:00,  5.19it/s]


Preprocessing completed and results saved!


## 4. Extractive Summarization (TF-IDF Sentence Ranking)

In [None]:
def extractive_summary(text, max_sentences=3):
    """Generate summary using TF-IDF and cosine similarity.

    Every sentence is scored by the cosine similarity between its TF-IDF
    vector and the TF-IDF vector of the whole text. The ``max_sentences``
    best-scoring sentences are returned in their original order.

    Args:
        text: Text to summarize.
        max_sentences: Maximum number of sentences to keep. Zero or a
            negative value yields an empty summary.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when there
        is nothing to select or an exception occurred (the error message is
        printed).
    """
    # A slice of [-0:] would select *every* sentence and a negative limit would
    # slice arbitrarily, so non-positive limits are rejected up front.
    if max_sentences <= 0:
        return ""
    # Blank text has no sentences; skip tokenization entirely.
    if isinstance(text, str) and not text.strip():
        return ""
    try:
        # Tokenize sentences
        sentences = sent_tokenize(text)
        if len(sentences) == 0:
            return ""

        # Create TF-IDF vectors: one row per sentence, plus one for the full text
        vectorizer = TfidfVectorizer(stop_words='english')
        sentence_vectors = vectorizer.fit_transform(sentences)
        doc_vector = vectorizer.transform([' '.join(sentences)])

        # Calculate cosine similarity of each sentence to the whole document
        similarities = cosine_similarity(sentence_vectors, doc_vector).flatten()

        # Select top sentences, then restore document order for readability
        top_indices = similarities.argsort()[-max_sentences:][::-1]
        return ' '.join([sentences[i] for i in sorted(top_indices)])
    except Exception as e:
        # Best effort: report the problem and keep the batch going
        print(f"Error processing text: {str(e)}")
        return ""

# Summarize the cleaned articles batch by batch (keeps the progress bar responsive)
BATCH_SIZE = 100  # Adjust based on Colab's memory
cleaned_articles = df['cleaned_article']
extractive_summaries = []
for start in tqdm(range(0, len(df), BATCH_SIZE)):
    for article in cleaned_articles.iloc[start:start + BATCH_SIZE].tolist():
        extractive_summaries.append(extractive_summary(article))

df['extractive_summary'] = extractive_summaries

# Persist the intermediate frame to Drive so the work is not lost
extractive_csv = '/content/drive/MyDrive/extractive_summaries.csv'
df.to_csv(extractive_csv, index=False)
print("Extractive summarization completed and results saved!")

100%|██████████| 2872/2872 [21:33<00:00,  2.22it/s]


Extractive summarization completed and results saved!


## 5. Abstractive Summarization (Pegasus Transformer)

In [None]:
# Install required libraries
!pip install transformers[torch]

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cufft_cu12

In [None]:
# Import libraries
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from tqdm import tqdm
# Load the pretrained Pegasus tokenizer and model weights for the named checkpoint
# NOTE(review): the checkpoint name suggests XSum fine-tuning, while this notebook
# works on CNN/Daily Mail — confirm the choice is intended.
model_name = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
# Pick the compute device (GPU when one is available) and move the model onto it
import torch

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

In [None]:
def generate_summary(text):
    """Generate abstractive summary using Pegasus.

    Args:
        text: Article text. It is tokenized and truncated to 256 tokens
            before generation.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        ``text`` is not a string or contains only whitespace.
    """
    # Skip generation when there is no article content to summarize
    # (e.g. preprocessing returned an empty string for this row).
    if not isinstance(text, str) or not text.strip():
        return ""
    inputs = tokenizer(text, return_tensors='pt', max_length=256, truncation=True).to(device)
    outputs = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly so generation does not have to infer it
        attention_mask=inputs['attention_mask'],
        max_length=150,
        num_beams=2,  # Reduced beam width
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Process in chunks, appending every finished chunk to a checkpoint file so an
# interrupted session can resume instead of starting over.
CHUNK_SIZE = 100  # Increased chunk size
CHECKPOINT_PATH = '/content/drive/MyDrive/abstractive_summaries.partial.jsonl'

# Resume: the checkpoint holds one JSON-encoded summary per line, in row order.
# Delete the checkpoint file to force a fresh run (e.g. after changing the model or data).
abstractive_summaries = []
if os.path.exists(CHECKPOINT_PATH):
    with open(CHECKPOINT_PATH, 'r', encoding='utf-8') as checkpoint_file:
        for line in checkpoint_file:
            try:
                abstractive_summaries.append(json.loads(line))
            except json.JSONDecodeError:
                break  # truncated final line left by an interrupted write
    abstractive_summaries = abstractive_summaries[:len(df)]
    # Rewrite the file so it contains exactly the summaries being reused
    with open(CHECKPOINT_PATH, 'w', encoding='utf-8') as checkpoint_file:
        checkpoint_file.writelines(json.dumps(summary) + '\n' for summary in abstractive_summaries)

with open(CHECKPOINT_PATH, 'a', encoding='utf-8') as checkpoint_file:
    # Start after the rows already summarized by a previous run
    for i in tqdm(range(len(abstractive_summaries), len(df), CHUNK_SIZE)):
        chunk = df['cleaned_article'].iloc[i:i+CHUNK_SIZE].tolist()
        chunk_summaries = [generate_summary(text) for text in chunk]
        abstractive_summaries.extend(chunk_summaries)
        checkpoint_file.writelines(json.dumps(summary) + '\n' for summary in chunk_summaries)
        checkpoint_file.flush()  # push the chunk out before starting the next one

df['abstractive_summary'] = abstractive_summaries

# Save results
df.to_csv('/content/drive/MyDrive/abstractive_summaries.csv', index=False)
print("Abstractive summarization completed and results saved!")

  4%|▍         | 118/2872 [1:56:12<44:58:08, 58.78s/it]

## 6. Evaluation & Results

In [None]:
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def calculate_rouge(row):
    """Score one row's generated summaries against its reference highlights.

    Args:
        row: A DataFrame row providing 'cleaned_highlights',
            'extractive_summary' and 'abstractive_summary'.

    Returns:
        A dict with the ROUGE-L F-measure of each summary, keyed
        'extractive_rougeL' and 'abstractive_rougeL'.
    """
    reference = row['cleaned_highlights']
    # RougeScorer.score takes (target, prediction): the reference goes first.
    # The F-measure is symmetric, so the reported numbers stay the same.
    return {
        'extractive_rougeL': scorer.score(reference, row['extractive_summary'])['rougeL'].fmeasure,
        'abstractive_rougeL': scorer.score(reference, row['abstractive_summary'])['rougeL'].fmeasure
    }

# Compute the scores once, then write them column by column so that re-running
# this cell overwrites the score columns instead of appending duplicates.
rouge_scores = df.apply(calculate_rouge, axis=1, result_type='expand')
for metric_name in rouge_scores.columns:
    df[metric_name] = rouge_scores[metric_name]

# Visualization
plt.figure(figsize=(10,6))
# Identical bins over the full [0, 1] score range keep the two histograms comparable
hist_options = dict(bins=20, range=(0, 1), alpha=0.5)
plt.hist(df['extractive_rougeL'], label='Extractive', **hist_options)
plt.hist(df['abstractive_rougeL'], label='Abstractive', **hist_options)
plt.title('ROUGE-L Score Distribution')
plt.xlabel('ROUGE-L F-measure')
plt.ylabel('Number of articles')
plt.legend()
plt.show()

# Show one randomly chosen article next to its three summaries
sample = df.sample(1)
sample_row = sample.iloc[0]
article_excerpt = sample_row['cleaned_article'][:200]
print(f"\nArticle Excerpt: {article_excerpt}...")
print(f"\nExtractive Summary: {sample_row['extractive_summary']}")
print(f"\nAbstractive Summary: {sample_row['abstractive_summary']}")
print(f"\nReference Summary: {sample_row['cleaned_highlights']}")

## 7. Save Outputs

In [None]:
output_path = '/content/drive/MyDrive/results/summarization_output.csv'  # 👈 UPDATE
# to_csv does not create missing folders, so make sure the target directory exists
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")