# Multi-Document Summarization Test

This notebook tests multiple multi-document summarization methods on:
1. **Topic groups**: Articles belonging to the same BERTopic-discovered semantic category
2. **Event clusters**: Articles covering the same news event across multiple sources

Methods tested:
- **PRIMERA** (`allenai/primera`) - Designed for multi-doc summarization
- **LED** (`allenai/led-base-16384`) - Long-context transformer
- **LongT5** (`google/long-t5-tglobal-base`) - Long-context T5
- **OpenAI GPT-4o** (`gpt-4o`) - LLM-based
- **Gemini 2.0 Flash** (`gemini-2.0-flash`) - LLM-based

## Section 1: Setup & Data Loading

In [1]:
# Cell 1: Imports
import sys
sys.path.append('..')

from src.db import Database
from src.config import load_config
from src.multi_doc_summarization import (
    PRIMERASummarizer,
    LEDMultiDocSummarizer,
    LongT5MultiDocSummarizer,
    OpenAIMultiDocSummarizer,
    GeminiMultiDocSummarizer
)
from dashboard.components.source_mapping import SOURCE_NAMES

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter (no category/module given), so it
# also hides warnings from the summarization libraries — consider narrowing.
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)  # default figure size (width, height)

print("✓ Imports complete")

✓ Imports complete


In [2]:
# Cell: Cache configuration
import json
import os
from pathlib import Path
from datetime import datetime

# Cache directory for storing results (separate file per model).
# The path is relative, so it resolves against the kernel's current working
# directory; the directory is created as a side effect of running this cell.
CACHE_DIR = Path("cache/multi_doc_summarization")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# When False, load_cache() always returns None, so every generation cell
# recomputes its summaries. save_cache() does not consult this flag, so fresh
# results are still written to disk.
USE_CACHE = True  # Set to False to regenerate all summaries

# Methods to test. These keys are used as the cache file name prefix
# (see get_cache_path) and are iterated by the load_all_* / status helpers.
METHODS = ['primera', 'led', 'longt5', 'openai', 'gemini']

def get_cache_path(method: str, group_type: str) -> Path:
    """Return the JSON cache file location for one method / group-type pair.

    Args:
        method: Method key used as the file name prefix (e.g. 'primera').
        group_type: Group kind used as the file name suffix
            ('topics' or 'clusters').

    Returns:
        Path of the form ``CACHE_DIR / "<method>_<group_type>.json"``. The
        file is not required to exist.
    """
    file_name = f"{method}_{group_type}.json"
    return CACHE_DIR.joinpath(file_name)

def load_cache(method: str, group_type: str) -> "dict | None":
    """Load cached summaries for a specific method if available.

    Args:
        method: Method key (e.g. 'primera'); selects the cache file.
        group_type: 'topics' or 'clusters'; selects the cache file.

    Returns:
        The parsed cache payload (a dict with a 'results' entry), or None
        when caching is disabled via USE_CACHE, the cache file does not
        exist, or the file cannot be used (invalid JSON, or a payload
        without a 'results' entry). Callers treat None as "regenerate".
    """
    cache_path = get_cache_path(method, group_type)
    if not USE_CACHE or not cache_path.exists():
        return None

    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A truncated or corrupt cache file (e.g. from an interrupted write)
        # should trigger regeneration rather than abort the notebook.
        print(f"  ⚠️  Ignoring unreadable {method} {group_type} cache ({cache_path.name}): {exc}")
        return None

    # Every caller indexes data['results']; reject payloads that lack it here
    # so they fall back to regeneration instead of raising KeyError later.
    if not isinstance(data, dict) or 'results' not in data:
        print(f"  ⚠️  Ignoring malformed {method} {group_type} cache ({cache_path.name}): no 'results' entry")
        return None

    print(f"  ✓ Loaded {method} {group_type} from cache ({cache_path.name})")
    return data

def save_cache(method: str, group_type: str, results: list, 
               version_id: str, version_name: str, groups_tested: list):
    """Save summaries for a specific method to cache with metadata.

    The payload is written to a sibling temporary file first and then moved
    over the real cache file, so an interrupted write cannot leave a
    truncated JSON file behind for the next load_cache() call.

    Args:
        method: Method key (e.g. 'primera'); selects the cache file.
        group_type: 'topics' or 'clusters'; selects the cache file.
        results: List of per-group result records to store under 'results'.
        version_id: ID of the result version the summaries were built from.
        version_name: Name of that result version.
        groups_tested: Names of the groups that were summarized.
    """
    cache_path = get_cache_path(method, group_type)

    cache_data = {
        "metadata": {
            "method": method,
            "group_type": group_type,
            "version_id": version_id,
            "version_name": version_name,
            "groups_tested": groups_tested,
            "created_at": datetime.now().isoformat(),
            "result_count": len(results)
        },
        "results": results
    }

    # Write-then-rename: Path.replace() overwrites the destination in one
    # step. A leftover .tmp file from a failed attempt is harmless and is
    # overwritten by the next save.
    tmp_path = cache_path.with_name(cache_path.name + '.tmp')
    with open(tmp_path, 'w', encoding='utf-8') as f:
        # default=str stringifies values json cannot encode natively.
        json.dump(cache_data, f, indent=2, default=str)
    tmp_path.replace(cache_path)
    print(f"  ✓ Saved {method} {group_type} to cache ({cache_path.name})")

def _load_all_results(group_type: str) -> "dict | None":
    """Load cached results of every method in METHODS for one group type.

    Returns:
        {method: results list} when a usable cache exists for every method,
        otherwise None. A partial set is treated as unusable.
    """
    results = {}
    for method in METHODS:
        cache_data = load_cache(method, group_type)
        if cache_data:
            results[method] = cache_data['results']
    # All-or-nothing: only report success when no method is missing.
    return results if len(results) == len(METHODS) else None

def load_all_topic_results() -> "dict | None":
    """Load all cached topic results.

    Returns:
        {method: results list} if every method has a topics cache, else None.
    """
    return _load_all_results('topics')

def load_all_cluster_results() -> "dict | None":
    """Load all cached cluster results.

    Returns:
        {method: results list} if every method has a clusters cache, else None.
    """
    return _load_all_results('clusters')

def check_cache_status():
    """Print, per group type, whether each method's cache file is on disk.

    Only file existence is checked; the file contents are not read.
    """
    print("Cache status:")
    for group_type in ('topics', 'clusters'):
        print(f"\n  {group_type.capitalize()}:")
        for method in METHODS:
            if get_cache_path(method, group_type).exists():
                status = "✓"
            else:
                status = "✗"
            print(f"    {status} {method}")

# Report where the cache lives and whether it is in use, then list which
# per-method cache files already exist on disk.
print(f"Cache directory: {CACHE_DIR}")
print(f"Cache mode: {'ENABLED' if USE_CACHE else 'DISABLED'}\n")
check_cache_status()

Cache directory: cache/multi_doc_summarization
Cache mode: ENABLED

Cache status:

  Topics:
    ✗ primera
    ✗ led
    ✗ longt5
    ✗ openai
    ✗ gemini

  Clusters:
    ✗ primera
    ✗ led
    ✗ longt5
    ✗ openai
    ✗ gemini


In [3]:
# Cell 2: Verify API keys

from src.config import load_config
import os

config = load_config()

# OpenAI: the config file value wins; the environment variable is the fallback.
openai_key = config.get('openai', {}).get('api_key') or os.environ.get('OPENAI_API_KEY')
# An empty/missing key and the unedited placeholder string both count as unset.
if not openai_key or openai_key == 'YOUR_OPENAI_API_KEY_HERE':
    print('⚠️  OPENAI_API_KEY not set (OpenAI cells will fail)')
else:
    print('✓ OPENAI_API_KEY is configured')

# Gemini: same lookup order, using the GOOGLE_API_KEY environment variable.
gemini_key = config.get('gemini', {}).get('api_key') or os.environ.get('GOOGLE_API_KEY')
if not gemini_key or gemini_key == 'YOUR_GOOGLE_API_KEY_HERE':
    print('⚠️  GOOGLE_API_KEY not set (Gemini cells will fail)')
else:
    print('✓ GOOGLE_API_KEY is configured')

✓ OPENAI_API_KEY is configured
✓ GOOGLE_API_KEY is configured


In [4]:
# Cell 3: Load config and connect to database
# `config` is reloaded here so this cell does not depend on the API-key cell
# having been run; `db` is the handle every later data-loading cell uses.
config = load_config()
db = Database()
db.connect()

print("✓ Database connected")
# Schema name comes from the Database object's own config, not from `config`.
print(f"Schema: {db.config['schema']}")

✓ Database connected
Schema: media_bias


In [5]:
# Cell 4: Select versions for testing
# For each analysis type, pick the most recently created version whose
# pipeline stage is marked complete. The schema name is interpolated into the
# SQL text; it comes from the database config, not from user input.
schema = db.config['schema']

# Newest completed topic version built with google/embeddinggemma-300m.
latest_topic_version_sql = f"""
        SELECT id, name, description, configuration, pipeline_status
        FROM {schema}.result_versions
        WHERE analysis_type = 'topics'
          AND (pipeline_status->>'topics')::boolean = true
          AND configuration->'embeddings'->>'model' = 'google/embeddinggemma-300m'
        ORDER BY created_at DESC
        LIMIT 1
    """
with db.cursor() as cur:
    cur.execute(latest_topic_version_sql)
    topic_version = cur.fetchone()

# Fail fast: every later cell needs a topic version to work with.
if not topic_version:
    raise ValueError("No completed topic version found with google/embeddinggemma-300m")

topic_version_id = str(topic_version['id'])
print(f"Selected topic version: {topic_version['name']}")
print(f"  ID: {topic_version_id}")
print(f"  Description: {topic_version['description']}")

# Newest completed clustering version (no embedding-model restriction).
latest_clustering_version_sql = f"""
        SELECT id, name, description, configuration, pipeline_status
        FROM {schema}.result_versions
        WHERE analysis_type = 'clustering'
          AND (pipeline_status->>'clustering')::boolean = true
        ORDER BY created_at DESC
        LIMIT 1
    """
with db.cursor() as cur:
    cur.execute(latest_clustering_version_sql)
    clustering_version = cur.fetchone()

if not clustering_version:
    raise ValueError("No completed clustering version found")

clustering_version_id = str(clustering_version['id'])
print(f"\nSelected clustering version: {clustering_version['name']}")
print(f"  ID: {clustering_version_id}")
print(f"  Description: {clustering_version['description']}")

Selected topic version: v0130-1438
  ID: 5559335a-50ea-47b5-a7bc-69dc85b54f2c
  Description: 

Selected clustering version: v0128-2256
  ID: e6fdc5df-7453-4d17-b786-0894ea52150b
  Description: 


In [6]:
# Cell 5: Load sample topics (10-20 articles per topic)
# Size window and sample size are named once here instead of being repeated
# as literals in the query, the filter and the messages.
TOPIC_MIN_ARTICLES = 10   # smaller topics give too little material
TOPIC_MAX_ARTICLES = 20   # larger topics make the inputs very long
N_TOPICS_TO_TEST = 5

topics = db.get_all_topics_with_counts(topic_version_id, min_article_count=TOPIC_MIN_ARTICLES)
topics_all_df = pd.DataFrame(topics)

# An empty result has no 'article_count' column, so the filter below would
# fail with an unhelpful KeyError; report the real problem instead.
if topics_all_df.empty:
    raise ValueError(
        f"No topics with at least {TOPIC_MIN_ARTICLES} articles found for topic version {topic_version_id}"
    )

# Filter to topics with 10-20 articles (not too small, not too large)
topics_df = topics_all_df[
    topics_all_df['article_count'].between(TOPIC_MIN_ARTICLES, TOPIC_MAX_ARTICLES)
]

print(f"Found {len(topics_df)} topics with {TOPIC_MIN_ARTICLES}-{TOPIC_MAX_ARTICLES} articles")
if topics_df.empty:
    # Nothing to summarize: the generation cells would otherwise fail later
    # with a ZeroDivisionError when averaging over zero results.
    raise ValueError(
        f"No topics with {TOPIC_MIN_ARTICLES}-{TOPIC_MAX_ARTICLES} articles in topic version {topic_version_id}"
    )

print("\nTop 10 topics by article count:")
display(topics_df.head(10)[['name', 'article_count', 'keywords']])

# Select the first N topics for testing. No diversity sampling is performed:
# this simply takes the leading rows in the order the database returned them.
selected_topics = topics_df.head(N_TOPICS_TO_TEST)['id'].tolist()
print(f"\nSelected {len(selected_topics)} topics for testing")

Found 19 topics with 10-20 articles

Top 10 topics by article count:


Unnamed: 0,name,article_count,keywords
25,"Deputy, Imports, Recent Extreme Weather, Commo...",20,"[deputy, imports, recent extreme weather, comm..."
26,"Sector, Yields, Securities, Wealth, Engineering",20,"[sector, yields, securities, wealth, engineeri..."
27,"Incidents, Affairs, Disruptions, Regarding, In...",19,"[incidents, affairs, disruptions, regarding, i..."
28,"Flooding, Flood, Sector, Floodwater, Aftermath",19,"[flooding, flood, sector, floodwater, aftermat..."
29,"Recovery Efforts, Resilience, Economic Recover...",18,"[recovery efforts, resilience, economic recove..."
30,"Devastation, Recovery Efforts, Impacted, Const...",18,"[devastation, recovery efforts, impacted, cons..."
31,"Flooding, Flood, Cyclones, Flood Waters, Rescue",17,"[flooding, flood, cyclones, flood waters, resc..."
32,"Flooding, Flood, Telecommunication, Disruption...",17,"[flooding, flood, telecommunication, disruptio..."
33,"Devastation Caused, Devastation, External Affa...",15,"[devastation caused, devastation, external aff..."
34,"Diplomatic, Dialogue, Recovery Reconstruction,...",14,"[diplomatic, dialogue, recovery reconstruction..."



Selected 5 topics for testing


In [7]:
# Cell 6: Load sample event clusters (3-10 articles per cluster)
# Size window and sample size are named once here instead of being repeated
# as literals in the query, the filter and the messages.
CLUSTER_MIN_ARTICLES = 3
CLUSTER_MAX_ARTICLES = 10
N_CLUSTERS_TO_TEST = 5

clusters = db.get_all_clusters_with_counts(clustering_version_id, min_article_count=CLUSTER_MIN_ARTICLES)
clusters_all_df = pd.DataFrame(clusters)

# An empty result has no 'article_count' column, so the filter below would
# fail with an unhelpful KeyError; report the real problem instead.
if clusters_all_df.empty:
    raise ValueError(
        f"No clusters with at least {CLUSTER_MIN_ARTICLES} articles found for clustering version {clustering_version_id}"
    )

# Filter to clusters with 3-10 articles
clusters_df = clusters_all_df[
    clusters_all_df['article_count'].between(CLUSTER_MIN_ARTICLES, CLUSTER_MAX_ARTICLES)
]

print(f"Found {len(clusters_df)} clusters with {CLUSTER_MIN_ARTICLES}-{CLUSTER_MAX_ARTICLES} articles")
if clusters_df.empty:
    # Nothing to summarize: fail here with a clear message rather than later
    # when a generation cell tries to process zero groups.
    raise ValueError(
        f"No clusters with {CLUSTER_MIN_ARTICLES}-{CLUSTER_MAX_ARTICLES} articles in clustering version {clustering_version_id}"
    )

print("\nTop 10 clusters by article count:")
display(clusters_df.head(10)[['cluster_name', 'article_count', 'sources_count', 'date_start', 'date_end']])

# Select the first N clusters for testing. No diversity sampling is performed:
# this simply takes the leading rows in the order the database returned them.
selected_clusters = clusters_df.head(N_CLUSTERS_TO_TEST)['id'].tolist()
print(f"\nSelected {len(selected_clusters)} clusters for testing")

Found 180 clusters with 3-10 articles

Top 10 clusters by article count:


Unnamed: 0,cluster_name,article_count,sources_count,date_start,date_end
62,Sri Lankan economy still vulnerable: IMF,10,4,2025-12-22,2025-12-30
63,Two financial contributions received for the ‘...,10,4,2025-12-18,2025-12-29
64,Govt. announces robust post-disaster rebuildin...,10,3,2025-12-02,2025-12-10
65,The burden of neglect,10,4,2025-12-04,2025-12-16
66,World Bank to inject US$120 Mn in emergency su...,10,4,2025-12-08,2025-12-18
67,A harsh reflection of Sri Lanka’s early-warnin...,10,3,2025-12-14,2025-12-25
68,SL post-Ditwah: Top economists call for halt t...,10,4,2025-12-17,2025-12-28
69,Pakistan sends fresh relief cargo and elite re...,10,4,2025-12-03,2025-12-16
70,Flooding across Asia leaves 600 dead and hundr...,10,2,2025-11-28,2025-12-02
71,When the waters rose,9,4,2025-12-01,2025-12-07



Selected 5 clusters for testing


In [8]:
# Cell 7: Fetch articles for selected groups
# Build two lookups, {group name -> list of article records}, one for the
# selected topics and one for the selected clusters. Groups are keyed by
# display name, which is what the result records and cache metadata store.

# Topics
topic_groups = {}
for topic_id in selected_topics:
    articles = db.get_articles_by_topic(topic_id, topic_version_id)
    # Resolve the topic's display name from the row with the matching id.
    topic_name = topics_df.loc[topics_df['id'] == topic_id, 'name'].iloc[0]
    topic_groups[topic_name] = articles
    print(f"Topic '{topic_name}': {len(articles)} articles from {len(set(a['source_id'] for a in articles))} sources")

print()

# Clusters
cluster_groups = {}
for cluster_id in selected_clusters:
    articles = db.get_articles_by_cluster(cluster_id, clustering_version_id)
    cluster_name = clusters_df.loc[clusters_df['id'] == cluster_id, 'cluster_name'].iloc[0]
    cluster_groups[cluster_name] = articles
    print(f"Cluster '{cluster_name}': {len(articles)} articles from {len(set(a['source_id'] for a in articles))} sources")

Topic 'Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent': 20 articles from 3 sources
Topic 'Sector, Yields, Securities, Wealth, Engineering': 20 articles from 4 sources
Topic 'Incidents, Affairs, Disruptions, Regarding, Involved': 19 articles from 4 sources
Topic 'Flooding, Flood, Sector, Floodwater, Aftermath': 19 articles from 4 sources
Topic 'Recovery Efforts, Resilience, Economic Recovery, Recovery, Sector': 18 articles from 4 sources

Cluster 'Sri Lankan economy still vulnerable: IMF': 10 articles from 4 sources
Cluster 'Two financial contributions received for the ‘Rebuilding Sri Lanka’': 10 articles from 4 sources
Cluster 'Govt. announces robust post-disaster rebuilding fund': 10 articles from 3 sources
Cluster 'The burden of neglect': 10 articles from 4 sources
Cluster 'World Bank to inject US$120 Mn in emergency support to Sri Lanka': 10 articles from 4 sources


## Section 2: Method Comparison on Topics

In [9]:
# Cell: Load cached topic results or prepare for fresh generation
print("Loading topic results from cache...\n")

# (display label, cache key) for every method, in reporting order.
topic_method_labels = [
    ('PRIMERA', 'primera'),
    ('LED', 'led'),
    ('LongT5', 'longt5'),
    ('OpenAI', 'openai'),
    ('Gemini', 'gemini'),
]

# Try to load cached results for each method. load_cache() already returns
# None when USE_CACHE is off or the file is missing, so no extra guard is
# needed here.
topic_results_by_label = {}
for method_label, method_key in topic_method_labels:
    cache_entry = load_cache(method_key, 'topics')
    method_results = None
    if cache_entry:
        # save_cache() records which topic version the summaries came from.
        # Summaries of a different version describe different article groups,
        # so reusing them would silently corrupt the comparison: treat them
        # as missing. Caches without that metadata are accepted as-is.
        cached_version_id = cache_entry.get('metadata', {}).get('version_id')
        if cached_version_id is not None and cached_version_id != topic_version_id:
            print(f"  ⚠️  Ignoring {method_key} topics cache: built for version "
                  f"{cached_version_id}, current version is {topic_version_id}")
        else:
            method_results = cache_entry['results']
    topic_results_by_label[method_label] = method_results

# Publish under the names the generation and comparison cells expect.
# None means "not cached - the generation cell must run".
primera_topic_results = topic_results_by_label['PRIMERA']
led_topic_results = topic_results_by_label['LED']
longt5_topic_results = topic_results_by_label['LongT5']
openai_topic_results = topic_results_by_label['OpenAI']
gemini_topic_results = topic_results_by_label['Gemini']

# Summary
cached = [label for label, r in topic_results_by_label.items() if r is not None]
missing = [label for label, r in topic_results_by_label.items() if r is None]

print(f"\n✓ Cached: {', '.join(cached) if cached else 'None'}")
print(f"✗ Missing (will generate): {', '.join(missing) if missing else 'None'}")

Loading topic results from cache...


✓ Cached: None
✗ Missing (will generate): PRIMERA, LED, LongT5, OpenAI, Gemini


In [10]:
# Cell 8: Test PRIMERA on topics
# Summarize every topic group with PRIMERA unless results were already loaded
# from cache, timing each call and caching the complete result set.
if primera_topic_results is not None:
    print("✓ PRIMERA topic results already loaded from cache - skipping generation")
else:
    print("Testing PRIMERA on topic groups...\n")

    # Initialize PRIMERA summarizer
    primera_config = {
        'method': 'primera'
    }
    primera = PRIMERASummarizer(primera_config)

    # Collect into a scratch list and publish to `primera_topic_results` only
    # once every group has been summarized. Appending to the published name
    # directly would leave a partial, non-None list behind if a call raised
    # mid-loop, and re-running this cell would then wrongly take the
    # "already loaded from cache" branch above.
    primera_pending_results = []

    for topic_name, articles in topic_groups.items():
        print(f"\n{'='*80}")
        print(f"Topic: {topic_name}")
        print(f"Articles: {len(articles)}")
        print(f"{'='*80}")

        # Extract documents and sources (raw source id is the fallback label)
        documents = [a['content'] for a in articles]
        sources = [SOURCE_NAMES.get(a['source_id'], a['source_id']) for a in articles]

        # Show article titles
        print("\nArticles in group:")
        for i, article in enumerate(articles):
            source_name = SOURCE_NAMES.get(article['source_id'], article['source_id'])
            print(f"  {i+1}. [{source_name}] {article['title'][:70]}...")

        # Generate summary (wall-clock time of the summarize call only)
        start_time = time.time()
        summary = primera.summarize_multiple(documents, sources)
        processing_time = time.time() - start_time

        print(f"\nPRIMERA Summary ({processing_time:.2f}s):")
        print(summary)

        primera_pending_results.append({
            'group': topic_name,
            'article_count': len(articles),
            'summary': summary,
            'processing_time': processing_time,
            'word_count': primera.count_words(summary)
        })

    if primera_pending_results:
        # Publish before saving so a failed cache write cannot lose the results.
        primera_topic_results = primera_pending_results

        avg_time = sum(r['processing_time'] for r in primera_topic_results) / len(primera_topic_results)
        print(f"\n\nAverage processing time: {avg_time:.2f}s")

        # Save immediately after generation
        save_cache('primera', 'topics', primera_topic_results, 
                   topic_version_id, topic_version['name'], list(topic_groups.keys()))
    else:
        # No groups: skip the average (division by zero) and write no cache
        # file, so a later run with data still generates.
        print("⚠️  No topic groups to summarize - nothing generated or cached")

Testing PRIMERA on topic groups...



Token indices sequence length is longer than the specified maximum sequence length for this model (6067 > 4096). Running this sequence through the model will result in indexing errors


Loaded allenai/primera on CPU (max 4096 tokens)

Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent
Articles: 20

Articles in group:
  1. [Daily FT] Govt. assures steady food supply despite cyclone disruptions...
  2. [The Morning] Agri. Min.: All affected cultivators to be compensated...
  3. [The Morning] Expedited crop compensation launched for farmers affected by Cyclone D...
  4. [The Morning] Seasonal demand: Consumer group wants unaffected areas cultivated...
  5. [The Morning] Food security: Rice sufficient for few months...
  6. [The Morning] No critical shortage of essentials as of now...
  7. [The Morning] Agriculture: Over 200,000 hectares of crops destroyed...
  8. [The Morning] 25% reduction in vegetable harvest in Nuwara Eliya due to adverse weat...
  9. [The Morning] Livestock Min: No egg shortage, no imports...
  10. [The Morning] PHIs raise the alarm: Wet dried grains, thawed-refrozen meats...
  11. [Daily News] Farmers to be compensated for 

In [11]:
# Cell 9: Test LED on topics
# Summarize every topic group with LED unless results were already loaded
# from cache, timing each call and caching the complete result set.
if led_topic_results is not None:
    print("✓ LED topic results already loaded from cache - skipping generation")
else:
    print("Testing LED on topic groups...\n")

    # Initialize LED summarizer
    led_config = {
        'method': 'led'
    }
    led = LEDMultiDocSummarizer(led_config)

    # Collect into a scratch list and publish to `led_topic_results` only once
    # every group has been summarized. Appending to the published name
    # directly would leave a partial, non-None list behind if a call raised
    # mid-loop, and re-running this cell would then wrongly take the
    # "already loaded from cache" branch above.
    led_pending_results = []

    for topic_name, articles in topic_groups.items():
        print(f"\n{'='*80}")
        print(f"Topic: {topic_name}")
        print(f"{'='*80}")

        # Raw source id is the fallback label when no display name is mapped.
        documents = [a['content'] for a in articles]
        sources = [SOURCE_NAMES.get(a['source_id'], a['source_id']) for a in articles]

        # Wall-clock time of the summarize call only.
        start_time = time.time()
        summary = led.summarize_multiple(documents, sources)
        processing_time = time.time() - start_time

        print(f"LED Summary ({processing_time:.2f}s):")
        print(summary)

        led_pending_results.append({
            'group': topic_name,
            'article_count': len(articles),
            'summary': summary,
            'processing_time': processing_time,
            'word_count': led.count_words(summary)
        })

    if led_pending_results:
        # Publish before saving so a failed cache write cannot lose the results.
        led_topic_results = led_pending_results

        avg_time = sum(r['processing_time'] for r in led_topic_results) / len(led_topic_results)
        print(f"\n\nAverage processing time: {avg_time:.2f}s")

        # Save immediately after generation
        save_cache('led', 'topics', led_topic_results, 
                   topic_version_id, topic_version['name'], list(topic_groups.keys()))
    else:
        # No groups: skip the average (division by zero) and write no cache
        # file, so a later run with data still generates.
        print("⚠️  No topic groups to summarize - nothing generated or cached")

Testing LED on topic groups...



Input ids are automatically padded from 6149 to 7168 to be a multiple of `config.attention_window`: 1024


Loaded allenai/led-base-16384 on CPU (max 16384 tokens)

Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent


Input ids are automatically padded from 11653 to 12288 to be a multiple of `config.attention_window`: 1024


LED Summary (121.35s):
[Daily for shortage,000 farmers’ to be affected by drought, drought, floods, and cyclone’s to the extent of food supply, no shortage, no shortages, no stocks, no crops, no supplies, no food supply.[ Daily for shortage of crops, No shortage, food supply and no shortage of food, no lack of supplies, No shortages, any shortage, any shortages, No supply, any food supply or lack of supply. No shortage of vegetables and no shortages of food. No shortages of crops.No shortage of goods, no supply of crops or no shortage.No shortages of vegetables, no need of supplies. No supply of food to be expected, no deliveries, no delays, no disruptions, no floods, no grains, no disasters, no crop, no famine, no drought, no disaster, no shocks, no flooding, no destruction, no disturbances, no damage, no loss of crops and no crops to be caused by drought and no disasters.“No shortage” of crops” Minister of Agriculture, that he explained that there was no shortage in agriculture, no f

Input ids are automatically padded from 8008 to 8192 to be a multiple of `config.attention_window`: 1024


LED Summary (239.70s):
[Daily] Bank’s rate of interest rate on top of the previous day‘s market price of Rs50,000, but not as much as expected by the market. On the day of the same, Bank.27% of the market price on top. The Bank. 27% of interest rates on top the previous days’ market price, Rs50.00, but the market was still closed. The market was also closed to the day. The bank. 27.00% of market price was also close to the market, but still closed to market. The Market was closed to a day of trading, but was not closed to any market.The market was closed at the same time as the market is closed.The Bank was closing to the Day of the week, but it was not yet closed to business. The banks were closing to a market price.The bank was closed the market to the same day, but also closed the day to market with the market being closed to no market.On the day after the market closed, the banks were closed to trading, But the bank was closing. The main market of the day was closed by the same pri

Token indices sequence length is longer than the specified maximum sequence length for this model (17069 > 16384). Running this sequence through the model will result in indexing errors


LED Summary (154.90s):
”[The Ministry’s Department of Public Safety and Civil Protection] is concerned about the lack of information about the public safety and protection of children”, The Ministry of Public Security and Civil Defence is concerned with the following:”We’re concerned about lack of Information about the need to protect children,”The Ministry of public safety, Civil Defence and Civil Defense are concerned about a lack of info about the Public Safety, Civil Defense is concerned over the public Safety and Protection of children.”” We’d been concerned about what happened to children, we’ve been concerned that there was no information about any information about children, We are concerned over what happened, we are concerned that children are not safe, we have been concerned over how to protect the children. We have been worried about what happen to children‛, we were concerned about how to respond to any information that children were not safe. We are also concerned that Ch

Input ids are automatically padded from 8651 to 9216 to be a multiple of `config.attention_window`: 1024


LED Summary (358.26s):
ray health, health, social services, public health, hospitals, and public health departments, along with the rest of the country’s hospitals, social health, education, healthcare, and social health services, all of the above mentioned areas, along health, Health, social care, health and education, all the abovementioned areas, all over the world, along Health, Health and social services. On the other side of the nation, along the same areas, social Health, education and health services. Along health, healthcare and social needs, all around the country, all other areas, none of the aforementioned areas, All over the same as the following areas, Along health and health systems, along on the same side of country, Along the same region, Along Health, health services and socials, along to the same places, All around the same time, along as the same regions, Along with the other countries, along and the same with the following countries, Along to the other regions, alo

In [12]:
# Cell 10: Test LongT5 on topics
# Summarize every topic group with LongT5 unless results were already loaded
# from cache, timing each call and caching the complete result set.
if longt5_topic_results is not None:
    print("✓ LongT5 topic results already loaded from cache - skipping generation")
else:
    print("Testing LongT5 on topic groups...\n")

    # Initialize LongT5 summarizer
    longt5_config = {
        'method': 'longt5'
    }
    longt5 = LongT5MultiDocSummarizer(longt5_config)

    # Collect into a scratch list and publish to `longt5_topic_results` only
    # once every group has been summarized. Appending to the published name
    # directly would leave a partial, non-None list behind if a call raised
    # mid-loop, and re-running this cell would then wrongly take the
    # "already loaded from cache" branch above.
    longt5_pending_results = []

    for topic_name, articles in topic_groups.items():
        print(f"\n{'='*80}")
        print(f"Topic: {topic_name}")
        print(f"{'='*80}")

        # Raw source id is the fallback label when no display name is mapped.
        documents = [a['content'] for a in articles]
        sources = [SOURCE_NAMES.get(a['source_id'], a['source_id']) for a in articles]

        # Wall-clock time of the summarize call only.
        start_time = time.time()
        summary = longt5.summarize_multiple(documents, sources)
        processing_time = time.time() - start_time

        print(f"LongT5 Summary ({processing_time:.2f}s):")
        print(summary)

        longt5_pending_results.append({
            'group': topic_name,
            'article_count': len(articles),
            'summary': summary,
            'processing_time': processing_time,
            'word_count': longt5.count_words(summary)
        })

    if longt5_pending_results:
        # Publish before saving so a failed cache write cannot lose the results.
        longt5_topic_results = longt5_pending_results

        avg_time = sum(r['processing_time'] for r in longt5_topic_results) / len(longt5_topic_results)
        print(f"\n\nAverage processing time: {avg_time:.2f}s")

        # Save immediately after generation
        save_cache('longt5', 'topics', longt5_topic_results, 
                   topic_version_id, topic_version['name'], list(topic_groups.keys()))
    else:
        # No groups: skip the average (division by zero) and write no cache
        # file, so a later run with data still generates.
        print("⚠️  No topic groups to summarize - nothing generated or cached")

Testing LongT5 on topic groups...

Loaded google/long-t5-tglobal-base on CPU (max 4096 tokens)

Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent
   Input: 6,421 tokens (4,533 words)
   Limit: 4,096 tokens
   Lost: 2,325 tokens (~36.2% of content)
   Suggestion: Use LED (16K tokens) or Claude/Gemini for longer inputs
Loaded google/long-t5-tglobal-base on CPU (max 4096 tokens)

Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent
   Input: 6,421 tokens (4,533 words)
   Limit: 4,096 tokens
   Lost: 2,325 tokens (~36.2% of content)
   Suggestion: Use LED (16K tokens) or Claude/Gemini for longer inputs
LongT5 Summary (139.99s):

Topic: Sector, Yields, Securities, Wealth, Engineering
   Input: 11,857 tokens (7,694 words)
   Limit: 4,096 tokens
   Lost: 7,761 tokens (~65.5% of content)
   Suggestion: Use LED (16K tokens) or Claude/Gemini for longer inputs
LongT5 Summary (139.99s):

Topic: Sector, Yields, Securities, Wealth, Engineering
   In

In [13]:
# Cell 11: Test OpenAI on topics
# Summarize every topic group with GPT-4o unless results were already loaded
# from cache, timing each call and caching the complete result set.
if openai_topic_results is not None:
    print("✓ OpenAI topic results already loaded from cache - skipping generation")
else:
    print("Testing OpenAI GPT-4o on topic groups...\n")

    # Initialize OpenAI summarizer
    openai_config = {
        'method': 'openai',
        'llm_model': 'gpt-4o',
        'llm_temperature': 0.0
    }
    # NOTE: `openai` here is the summarizer instance, not the OpenAI SDK module.
    openai = OpenAIMultiDocSummarizer(openai_config)

    # Collect into a scratch list and publish to `openai_topic_results` only
    # once every group has been summarized. Appending to the published name
    # directly would leave a partial, non-None list behind if a call raised
    # mid-loop (e.g. an API error), and re-running this cell would then
    # wrongly take the "already loaded from cache" branch above.
    openai_pending_results = []

    for topic_name, articles in topic_groups.items():
        print(f"\n{'='*80}")
        print(f"Topic: {topic_name}")
        print(f"{'='*80}")

        # Raw source id is the fallback label when no display name is mapped.
        documents = [a['content'] for a in articles]
        sources = [SOURCE_NAMES.get(a['source_id'], a['source_id']) for a in articles]

        # Wall-clock time of the summarize call only.
        start_time = time.time()
        summary = openai.summarize_multiple(documents, sources)
        processing_time = time.time() - start_time

        print(f"OpenAI Summary ({processing_time:.2f}s):")
        print(summary)

        openai_pending_results.append({
            'group': topic_name,
            'article_count': len(articles),
            'summary': summary,
            'processing_time': processing_time,
            'word_count': openai.count_words(summary)
        })

    if openai_pending_results:
        # Publish before saving so a failed cache write cannot lose the results.
        openai_topic_results = openai_pending_results

        avg_time = sum(r['processing_time'] for r in openai_topic_results) / len(openai_topic_results)
        print(f"\n\nAverage processing time: {avg_time:.2f}s")

        # Save immediately after generation
        save_cache('openai', 'topics', openai_topic_results, 
                   topic_version_id, topic_version['name'], list(topic_groups.keys()))
    else:
        # No groups: skip the average (division by zero) and write no cache
        # file, so a later run with data still generates.
        print("⚠️  No topic groups to summarize - nothing generated or cached")

Testing OpenAI GPT-4o on topic groups...


Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent

Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent
OpenAI Summary (7.00s):
In the wake of Cyclone Ditwah, Sri Lanka's government has assured the public that there is no immediate shortage of essential commodities, despite significant agricultural damage. The Trade, Commerce, Food Security, and Cooperative Development Ministry emphasized that importation and distribution channels remain operational, urging the public to avoid panic buying. Meanwhile, the Agriculture Ministry has committed to compensating all affected farmers, extending beyond the usual six crops covered by regulations. The Morning highlighted the severe impact on paddy, maize, and vegetable crops, with over 200,000 hectares damaged, while the Daily News reported compensation efforts totaling over Rs. 381 million for affected farmers. Despite these challenges, Deputy Minister 

In [14]:
# Cell 12: Test Gemini on topics
# Summarize every topic group with Gemini unless results were already loaded
# from cache, timing each call and caching the complete result set.
if gemini_topic_results is not None:
    print("✓ Gemini topic results already loaded from cache - skipping generation")
else:
    print("Testing Gemini on topic groups...\n")

    # Initialize Gemini summarizer
    gemini_config = {
        'method': 'gemini',
        'llm_model': 'gemini-2.0-flash',
        'llm_temperature': 0.0
    }
    gemini = GeminiMultiDocSummarizer(gemini_config)

    # Collect into a scratch list and publish to `gemini_topic_results` only
    # once every group has been summarized. Appending to the published name
    # directly would leave a partial, non-None list behind if a call raised
    # mid-loop (e.g. an API error), and re-running this cell would then
    # wrongly take the "already loaded from cache" branch above.
    gemini_pending_results = []

    for topic_name, articles in topic_groups.items():
        print(f"\n{'='*80}")
        print(f"Topic: {topic_name}")
        print(f"{'='*80}")

        # Raw source id is the fallback label when no display name is mapped.
        documents = [a['content'] for a in articles]
        sources = [SOURCE_NAMES.get(a['source_id'], a['source_id']) for a in articles]

        # Wall-clock time of the summarize call only.
        start_time = time.time()
        summary = gemini.summarize_multiple(documents, sources)
        processing_time = time.time() - start_time

        print(f"Gemini Summary ({processing_time:.2f}s):")
        print(summary)

        gemini_pending_results.append({
            'group': topic_name,
            'article_count': len(articles),
            'summary': summary,
            'processing_time': processing_time,
            'word_count': gemini.count_words(summary)
        })

    if gemini_pending_results:
        # Publish before saving so a failed cache write cannot lose the results.
        gemini_topic_results = gemini_pending_results

        avg_time = sum(r['processing_time'] for r in gemini_topic_results) / len(gemini_topic_results)
        print(f"\n\nAverage processing time: {avg_time:.2f}s")

        # Save immediately after generation
        save_cache('gemini', 'topics', gemini_topic_results, 
                   topic_version_id, topic_version['name'], list(topic_groups.keys()))
    else:
        # No groups: skip the average (division by zero) and write no cache
        # file, so a later run with data still generates.
        print("⚠️  No topic groups to summarize - nothing generated or cached")

Testing Gemini on topic groups...




Topic: Deputy, Imports, Recent Extreme Weather, Commodities, Affected Recent
Gemini Summary (3.81s):
Following Cyclone Ditwah and subsequent flooding, Sri Lanka's agricultural sector has suffered significant damage, with initial estimates suggesting over 200,000 hectares of farmland affected, including substantial paddy field losses. The government, through the Agriculture and Trade Ministries, has assured the public that there is no critical shortage of essential goods, although vegetable prices have seen temporary spikes due to disrupted supply chains. To mitigate potential food shortages, the government is implementing compensation programs for affected farmers, extending support beyond the traditionally covered crops like paddy, maize, and potatoes. Compensation rates vary, with higher amounts allocated for vegetable crop damage to encourage replanting. While some officials express confidence in the ability to recover and replant damaged fields, particularly paddy, others warn of 

In [15]:
# Cell 13: Compare topic summaries
# Flatten the per-method result records into one table (one row per
# method/group pair) and print per-method averages.
print("\nTopic Summaries Comparison\n" + "="*80 + "\n")

MAX_GROUP_LABEL_CHARS = 30  # longer group names are truncated for display
COMPARISON_COLUMNS = ['Method', 'Group', 'Time (s)', 'Words', 'Articles']

topic_results_by_method = [
    ('PRIMERA', primera_topic_results),
    ('LED', led_topic_results),
    ('LongT5', longt5_topic_results),
    ('OpenAI', openai_topic_results),
    ('Gemini', gemini_topic_results)
]

# Create comparison dataframe
comparison_data = []
for method_name, results in topic_results_by_method:
    # A method's results stay None when its cache was missing and its
    # generation cell was skipped or failed; report it instead of crashing
    # on iteration so the remaining methods can still be compared.
    if results is None:
        print(f"⚠️  No {method_name} topic results available - run its generation cell first")
        continue
    for result in results:
        group_name = result['group']
        if len(group_name) > MAX_GROUP_LABEL_CHARS:
            group_label = group_name[:MAX_GROUP_LABEL_CHARS] + '...'
        else:
            group_label = group_name
        comparison_data.append({
            'Method': method_name,
            'Group': group_label,
            # Kept numeric (rounded to 2 decimals) so the column can be
            # averaged, sorted or plotted without converting from text.
            'Time (s)': round(result['processing_time'], 2),
            'Words': result['word_count'],
            'Articles': result['article_count']
        })

# Passing the column list keeps the columns present even with zero rows.
comparison_df = pd.DataFrame(comparison_data, columns=COMPARISON_COLUMNS)
display(comparison_df)

# Summary statistics
print("\n" + "="*80)
print("Summary Statistics")
print("="*80)
for method in ['PRIMERA', 'LED', 'LongT5', 'OpenAI', 'Gemini']:
    method_data = comparison_df[comparison_df['Method'] == method]
    if method_data.empty:
        print(f"{method:12s} - no results")
        continue
    avg_time = method_data['Time (s)'].mean()
    avg_words = method_data['Words'].mean()
    print(f"{method:12s} - Avg time: {avg_time:5.2f}s, Avg words: {avg_words:6.1f}")


Topic Summaries Comparison



Unnamed: 0,Method,Group,Time (s),Words,Articles
0,PRIMERA,"Deputy, Imports, Recent Extrem...",195.16,200,20
1,PRIMERA,"Sector, Yields, Securities, We...",197.85,318,20
2,PRIMERA,"Incidents, Affairs, Disruption...",194.34,417,19
3,PRIMERA,"Flooding, Flood, Sector, Flood...",196.23,130,19
4,PRIMERA,"Recovery Efforts, Resilience, ...",192.01,79,18
5,LED,"Deputy, Imports, Recent Extrem...",121.35,396,20
6,LED,"Sector, Yields, Securities, We...",239.7,393,20
7,LED,"Incidents, Affairs, Disruption...",154.9,407,19
8,LED,"Flooding, Flood, Sector, Flood...",358.26,364,19
9,LED,"Recovery Efforts, Resilience, ...",173.9,435,18



Summary Statistics
PRIMERA      - Avg time: 195.12s, Avg words:  228.8
LED          - Avg time: 209.62s, Avg words:  399.0
LongT5       - Avg time: 132.26s, Avg words:  252.8
OpenAI       - Avg time: 12.39s, Avg words:  195.6
Gemini       - Avg time:  3.36s, Avg words:  179.6


## Section 3: Method Comparison on Event Clusters

In [16]:
# Cell: Load cached cluster results or prepare for fresh generation
#
# Fills cluster_results_by_method with one entry per summarization method:
# the cached list of result dicts when a usable cache exists, otherwise None.
# The generation cell treats None as "this method still has to be run".

# Display label -> cache file prefix passed to load_cache.
CLUSTER_CACHE_KEYS = {
    'PRIMERA': 'primera',
    'LED': 'led',
    'LongT5': 'longt5',
    'OpenAI': 'openai',
    'Gemini': 'gemini',
}

# Initialize cluster results dictionary (None = nothing usable loaded)
cluster_results_by_method = {label: None for label in CLUSTER_CACHE_KEYS}

if USE_CACHE:
    print("Loading cluster results from cache...\n")
    for label, cache_key in CLUSTER_CACHE_KEYS.items():
        cache = load_cache(cache_key, 'clusters')
        # A cache without results (absent, missing key, or empty list) counts
        # as missing: accepting an empty list would skip generation and then
        # break the per-method averages computed from these lists later on.
        if cache and cache.get('results'):
            cluster_results_by_method[label] = cache['results']
else:
    # Do not claim to be loading from cache when caching is switched off.
    print("Cache disabled (USE_CACHE = False) - all cluster results will be regenerated\n")

# Summary
cached = [m for m, r in cluster_results_by_method.items() if r is not None]
missing = [m for m, r in cluster_results_by_method.items() if r is None]

print(f"\n✓ Cached: {', '.join(cached) if cached else 'None'}")
print(f"✗ Missing (will generate): {', '.join(missing) if missing else 'None'}")

Loading cluster results from cache...


✓ Cached: None
✗ Missing (will generate): PRIMERA, LED, LongT5, OpenAI, Gemini


In [17]:
# Cell 14: Test all methods on event clusters
#
# Runs every summarizer that has no cluster results yet over each event
# cluster, records the summary with its wall-clock time and word count, and
# saves the newly generated results per method with save_cache.

# Display label -> (cache file prefix, summarizer class, constructor config).
# One table drives both summarizer construction and cache naming, so the two
# cannot drift apart.
CLUSTER_SUMMARIZER_SPECS = {
    'PRIMERA': ('primera', PRIMERASummarizer, {'method': 'primera'}),
    'LED': ('led', LEDMultiDocSummarizer, {'method': 'led'}),
    'LongT5': ('longt5', LongT5MultiDocSummarizer, {'method': 'longt5'}),
    'OpenAI': ('openai', OpenAIMultiDocSummarizer,
               {'method': 'openai', 'llm_model': 'gpt-4o', 'llm_temperature': 0.0}),
    'Gemini': ('gemini', GeminiMultiDocSummarizer,
               {'method': 'gemini', 'llm_model': 'gemini-2.0-flash', 'llm_temperature': 0.0}),
}

# Check which methods need generation (None = no results loaded for it)
methods_to_run = [m for m, r in cluster_results_by_method.items() if r is None]

if not methods_to_run:
    print("✓ All cluster results already loaded from cache - skipping generation")
else:
    print(f"Generating cluster summaries for: {', '.join(methods_to_run)}\n")

    # Initialize summarizers only for methods that need generation,
    # in the fixed order of the spec table.
    summarizers = {
        method_name: summarizer_cls(summarizer_config)
        for method_name, (_, summarizer_cls, summarizer_config) in CLUSTER_SUMMARIZER_SPECS.items()
        if method_name in methods_to_run
    }

    # Accumulate into a scratch dict instead of cluster_results_by_method.
    # If a summarizer raises part-way through, the shared dict still holds
    # None for these methods, so re-running this cell regenerates them rather
    # than reporting partial, unsaved lists as "already loaded from cache".
    generated_results = {method_name: [] for method_name in summarizers}

    for cluster_name, articles in cluster_groups.items():
        print(f"\n{'='*80}")
        print(f"Cluster: {cluster_name}")
        print(f"Articles: {len(articles)} from {len(set(a['source_id'] for a in articles))} sources")
        print(f"{'='*80}\n")

        # Show article titles
        print("Articles in cluster:")
        for position, article in enumerate(articles, start=1):
            source_name = SOURCE_NAMES.get(article['source_id'], article['source_id'])
            title = article['title']
            # Mark a title with an ellipsis only when it was actually cut,
            # matching how group names are shortened in the comparison tables.
            shown_title = title[:70] + '...' if len(title) > 70 else title
            print(f"  {position}. [{source_name}] {shown_title}")
        print()

        documents = [a['content'] for a in articles]
        sources = [SOURCE_NAMES.get(a['source_id'], a['source_id']) for a in articles]

        # Test each method that needs generation
        for method_name, summarizer in summarizers.items():
            print(f"{method_name}:")
            start_time = time.time()
            summary = summarizer.summarize_multiple(documents, sources)
            processing_time = time.time() - start_time

            print(f"  ({processing_time:.2f}s) {summary}")
            print()

            generated_results[method_name].append({
                'group': cluster_name,
                'article_count': len(articles),
                'summary': summary,
                'processing_time': processing_time,
                'word_count': summarizer.count_words(summary)
            })

    # Every cluster finished for every method: publish the complete lists.
    cluster_results_by_method.update(generated_results)

    # Save results for each newly generated method
    groups_tested = list(cluster_groups.keys())

    print("\nSaving cluster results...")
    for method_name in methods_to_run:
        cache_key = CLUSTER_SUMMARIZER_SPECS[method_name][0]
        save_cache(cache_key, 'clusters', cluster_results_by_method[method_name],
                   clustering_version_id, clustering_version['name'], groups_tested)

    print("\n" + "="*80)
    print("Cluster testing complete")
    print("="*80)

Generating cluster summaries for: PRIMERA, LED, LongT5, OpenAI, Gemini

Loaded allenai/primera on CPU (max 4096 tokens)
Loaded allenai/primera on CPU (max 4096 tokens)
Loaded allenai/led-base-16384 on CPU (max 16384 tokens)
Loaded allenai/led-base-16384 on CPU (max 16384 tokens)
Loaded google/long-t5-tglobal-base on CPU (max 4096 tokens)

Cluster: Sri Lankan economy still vulnerable: IMF
Articles: 10 from 4 sources

Articles in cluster:
  1. [Daily News] The need for external financing...
  2. [Daily FT] IMF says hard-won economic gains vulnerable to fresh shocks...
  3. [The Morning] Budget 2026: Reflections on classical economic thinking...
  4. [Daily FT] IMF warns debt risks remain high as repayment capacity tightens...
  5. [The Island] Sri Lankan economy still vulnerable: IMF...
  6. [Daily News] A reasonable request...
  7. [Daily FT] Debt relief as disaster relief: A timely call by international experts...
  8. [The Morning] 2026: Recovery or relapse?...
  9. [Daily News] MSME 

Token indices sequence length is longer than the specified maximum sequence length for this model (12408 > 4096). Running this sequence through the model will result in indexing errors


   Input: 12,408 tokens (9,632 words)
   Limit: 4,096 tokens
   Lost: 8,312 tokens (~67.0% of content)
   Suggestion: Use LED (16K tokens) or Claude/Gemini for longer inputs

LED:

LED:


Input ids are automatically padded from 12450 to 13312 to be a multiple of `config.attention_window`: 1024


  (273.33s) “We’re not sure how much we’ll be able to recover from this crisis.” We are not sure whether we will be expected to recover. We’ve been told that we will not be required to rebuild the economy.We are not certain whether we can recover from the current crisis. We will be expecting to recover the economy from this year’s recession. We are expected to be needed to rebuild. We have been told to recover to the economy, but we are not expected to rebuild it.We will be assured that the economy will recover from these years’ recession.We have been assured that we would not be necessary to rebuild our economy. We would be expecting the economy to recover but we will still need to rebuild to the extent possible. We need to recover our economy from the recession. we will also be expected that the economic recovery will be gradual.We need not be expected the economic growth to recover, we need not to rebuild but we need to be ready to recover soon. We want to recover and recover our ec

Input ids are automatically padded from 2667 to 3072 to be a multiple of `config.attention_window`: 512


  (3.37s) Sri Lanka faces significant economic challenges following Cyclone Ditwah, which has exacerbated existing vulnerabilities despite recent macroeconomic stabilization. While GDP grew in 2024 and early 2025, projections indicate a slowdown in 2026 due to the cyclone's impact on key sectors like agriculture and tourism. The IMF has provided emergency financial support, but stresses the need for transparent and well-targeted spending, urging adherence to fiscal discipline. Several sources highlight the potential for increased inflation and a widening current account deficit. Some economists are calling for a suspension of Sri Lanka's debt repayments to provide crucial fiscal space for recovery efforts, while others emphasize the importance of continued structural reforms and private sector-led growth. The long-term economic impact of Ditwah is estimated to be substantial, potentially increasing poverty and requiring significant reconstruction costs. The MSME sector is requesting lo

Input ids are automatically padded from 2707 to 3072 to be a multiple of `config.attention_window`: 1024


  (155.51s) A group of businessmen has extended support to the “Rebuild Sri Lanka” programme by handing over their donations to the Minister of Agriculture, Livestock, Lands and Irrigation, K.D. Lalkantha, at a special event.¯¯¯¯.Kandy Spices Agro (Pvt) Ltd donated Rs. 4 million, while the Minindoru Association contributed Rs. 1 million towards the initiative. In addition, Orange donated electrical equipment valued at over Rs. 15 million, targeting the Badulla, Nuwara Eliya and Kandy districts.Addressing the gathering, Minister K. D.Lalkantha urged the public and the business community to continue supporting the Rebuild Sri Lanka programme, assuring that the government is utilizing these funds responsibly and transparently.“During the cyclone, the general public came forward voluntarily to help. This sense of solidarity is part of our DNA and is not commonly seen in some other countries. Today, we are gradually recovering and breathing a sigh of relief from this disaster,” the Minister

Input ids are automatically padded from 7191 to 8192 to be a multiple of `config.attention_window`: 1024


  (204.74s) The Management Committee of the ‘Rebuilding Sri Lanka’ Fund, established on the instructions of President Anura Kumara Dissanayake to assist the country’s recovery following Cyclone Ditwah, met yesterday morning at the Presidential Secretariat. The meeting was chaired by Labour Minister and Finance and Planning Deputy Minister Dr. Anil Jayantha Fernando. . The meeting further discussed the various ways in which State and private sector institutions, as well as individuals in Sri Lanka, could contribute to the national rebuilding process. ངསཁརོཤལའབགུཛྷདཧཨམཊཥ཈཯཮ཕཆཡཽཉཛིཞྲྀཚཀཙཅ཭ཋཀྵཇཱིཌྷ཰ཪླྀཌཱཿབྷཝདྷེགྷཫཬཎཟཻནཹཔཱུཾཷཐཏ�аໄ྄່໙ྤ໘ྨྲྙໜྦྼྣ໚྘໨࿄༁ໍ໲ໝྭྯྮྱྜ༄࿲༚༈༙༘༨༜�ྴ྿໴໤ྖྜྷ༢ྥྚ࿽ಲಿನತ�.ŽŁŽĀ໮༼༲࿴ಮ࿿ല্രীനලമප�, കാ

LED:
  (153.55s) , to Management in Sri Lanka’s public sector, and to the extent of the public sector and the Government of Sri Lanka, we’re not aware of the need for any of them. We’m not aware that the government of the Sri Lanka is not aware about the need of any of the government, we are not aware

Input ids are automatically padded from 13340 to 14336 to be a multiple of `config.attention_window`: 1024


  (302.94s) “We’re not sure how much we’ll be able to recover from this disaster, but we are not sure.” We’ve not been able to see how much they’d recover from the disaster. We are not aware of how many we will be unable to recover but we will not be sure. We have been unable to see that much we were not able to know. We were not sure of how much We are now able to assess the disaster, But we have not been sure of the need to recover. We will not know the extent of the damage caused by the floods.We have not seen that much We were unable to know that much of the disaster was caused by a lack of information, we are now unable to explain.We were not aware that any of us were able to explain the damage to the country, we were also unable to comprehend the extent to the damage that was caused. We had not seen any of the disasters, we was unable to account for any of them. We also were not ready to explain that much.We are not seen how much of this disaster is caused by any of our nation’s 

Input ids are automatically padded from 4868 to 5120 to be a multiple of `config.attention_window`: 1024


  (221.49s) The government has approved the activation of emergency response components within ongoing World Bank–funded projects to accelerate post-disaster reconstruction following the widespread floods and landslides caused by Cyclone Ditwah.*/(The disaster resulted in extensive damage to agricultural land, livelihoods, public infrastructure and housing, alongside significant displacement.SourceFileWith bilateral and multilateral partners already extending assistance, the Cabinet agreed that leveraging existing foreign-funded project mechanisms would expedite rebuilding efforts.¯¯¯¯The President, in his capacity as Minister of Finance, Plan Implementation and Economic Development, received approval to activate the relevant World Bank project components and obtain between $92 million and $112 million to implement emergency recovery activities. རངསོབཁགཤད཈མའལིཛྷཕུཨཊ཯཮ཞཆཡཚཽཉཥཀཙཌྷཛཅཋཧཀྵཇ཭ྲྀཌཪདྷླྀགྷཿཎཝབྷནཹེཻཫཾ཰ཱཟཏཬཱིཐཔཱུཷ྄ໍ໯໚ྈ໙໘ྙ྘ໜྨྲྤྼ໨࿽ಲ༁ໄ༈ྭྯྜໞ໲࿲ໝ྽ைನ༄ྞྦྌ໴྿໮ྮༀ໤།༲ಿಯ༚࿻ಜ༘༙༜࿒ಮ࿘ಚ༼ྜྷ༨ತ� ྦྷಘ

LED:
  (104.10

In [18]:
# Cell 15: Compare cluster summaries
# Builds one table row per (method, event cluster) from
# cluster_results_by_method, displays it, then prints per-method averages.
print("\nEvent Cluster Summaries Comparison\n" + "="*80 + "\n")

# Flatten the per-method result lists into records. Cluster names longer than
# 30 characters are cut and marked with an ellipsis; the time is stored as a
# 2-decimal string.
cluster_comparison_data = [
    {
        'Method': method_name,
        'Cluster': result['group'] if len(result['group']) <= 30 else result['group'][:30] + '...',
        'Time (s)': f"{result['processing_time']:.2f}",
        'Words': result['word_count'],
        'Articles': result['article_count'],
    }
    for method_name, method_results in cluster_results_by_method.items()
    for result in method_results
]

cluster_comparison_df = pd.DataFrame(cluster_comparison_data)
display(cluster_comparison_df)

# Per-method averages
for banner_line in ("\n" + "="*80, "Summary Statistics (Event Clusters)", "="*80):
    print(banner_line)

for method in ('PRIMERA', 'LED', 'LongT5', 'OpenAI', 'Gemini'):
    is_method = cluster_comparison_df['Method'] == method
    method_data = cluster_comparison_df.loc[is_method]
    # 'Time (s)' holds formatted strings, so convert back before averaging.
    avg_time = method_data['Time (s)'].astype(float).mean()
    avg_words = method_data['Words'].mean()
    print(f"{method:12s} - Avg time: {avg_time:5.2f}s, Avg words: {avg_words:6.1f}")


Event Cluster Summaries Comparison



Unnamed: 0,Method,Cluster,Time (s),Words,Articles
0,PRIMERA,Sri Lankan economy still vulne...,213.25,394,10
1,PRIMERA,Two financial contributions re...,155.51,160,10
2,PRIMERA,Govt. announces robust post-di...,204.74,81,10
3,PRIMERA,The burden of neglect,200.49,157,10
4,PRIMERA,World Bank to inject US$120 Mn...,221.49,103,10
5,LED,Sri Lankan economy still vulne...,273.33,427,10
6,LED,Two financial contributions re...,48.18,378,10
7,LED,Govt. announces robust post-di...,153.55,445,10
8,LED,The burden of neglect,302.94,440,10
9,LED,World Bank to inject US$120 Mn...,104.1,450,10



Summary Statistics (Event Clusters)
PRIMERA      - Avg time: 199.10s, Avg words:  179.0
LED          - Avg time: 176.42s, Avg words:  428.0
LongT5       - Avg time: 144.33s, Avg words:  351.4
OpenAI       - Avg time:  6.73s, Avg words:  202.0
Gemini       - Avg time:  3.26s, Avg words:  175.2


## Section 4: Performance Analysis

In [19]:
# Cell 16: Processing time comparison
# Two bar charts of the mean seconds per summary for each method:
# topic groups on the left, event clusters on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Topics: method label -> per-group processing times
topic_times = {
    method: [r['processing_time'] for r in results]
    for method, results in (
        ('PRIMERA', primera_topic_results),
        ('LED', led_topic_results),
        ('LongT5', longt5_topic_results),
        ('OpenAI', openai_topic_results),
        ('Gemini', gemini_topic_results),
    )
}

# Clusters: method label -> per-cluster processing times
cluster_times = {
    method: [r['processing_time'] for r in results]
    for method, results in cluster_results_by_method.items()
}

# Both panels are drawn the same way; only the data and the title differ.
panels = (
    (axes[0], topic_times, 'Average Processing Time - Topics'),
    (axes[1], cluster_times, 'Average Processing Time - Event Clusters'),
)
for ax, times_by_method, panel_title in panels:
    mean_times = [sum(times)/len(times) for times in times_by_method.values()]
    ax.bar(times_by_method.keys(), mean_times)
    ax.set_title(panel_title)
    ax.set_ylabel('Time (seconds)')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [20]:
# Cell 18: Average summary length per method
# Two bar charts of the mean summary word count for each method:
# topic groups on the left, event clusters on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
topics_ax, clusters_ax = axes

# Topics: method label -> word counts of that method's topic summaries
topic_word_counts = {
    method: [r['word_count'] for r in results]
    for method, results in (
        ('PRIMERA', primera_topic_results),
        ('LED', led_topic_results),
        ('LongT5', longt5_topic_results),
        ('OpenAI', openai_topic_results),
        ('Gemini', gemini_topic_results),
    )
}
topic_mean_words = [sum(counts)/len(counts) for counts in topic_word_counts.values()]

topics_ax.bar(topic_word_counts.keys(), topic_mean_words)
topics_ax.set(title='Average Summary Length - Topics', ylabel='Word Count')
topics_ax.tick_params(axis='x', rotation=45)

# Clusters: method label -> word counts of that method's cluster summaries
cluster_word_counts = {
    method: [r['word_count'] for r in results]
    for method, results in cluster_results_by_method.items()
}
cluster_mean_words = [sum(counts)/len(counts) for counts in cluster_word_counts.values()]

clusters_ax.bar(cluster_word_counts.keys(), cluster_mean_words)
clusters_ax.set(title='Average Summary Length - Event Clusters', ylabel='Word Count')
clusters_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [21]:
# Cell 21: Close database connection
# Final cell: close the `db` handle, then confirm that the notebook finished.
db.close()
print("✓ Database connection closed", "\nNotebook complete!", sep="\n")

✓ Database connection closed

Notebook complete!
