# Overall Corpus Topic Modelling

This notebook runs BERTopic on the **full Ditwah cyclone corpus** (all outlets combined),
identifies the 10 most discussed topics, and uses an LLM to label each topic with a short
aspect phrase and longer description.

## Approach
1. Load pre-computed embeddings for all articles across all outlets
2. Fit a single BERTopic model on the full corpus
3. Select the top 10 topics by article count
4. Extract keywords and representative articles per topic
5. Use an LLM to generate aspect labels and descriptions for each topic

## 1. Setup

In [None]:
import sys
import json
import warnings
import numpy as np
import pandas as pd

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore')
sys.path.append('..')

from src.db import get_db
from src.llm import get_llm
from dashboard.components.source_mapping import SOURCE_NAMES, SOURCE_COLORS

print("All imports successful")

In [2]:
# Configuration
EMBEDDING_MODEL = "google/embeddinggemma-300m"
N_TOPICS = 15
TOP_WORDS = 30
TOP_ARTICLES = 10

# BERTopic parameters for full corpus (~1600 articles)
MIN_TOPIC_SIZE = 10
MIN_DF = 5
UMAP_N_NEIGHBORS = 15
RANDOM_SEED = 42

## 2. Version Selection & Data Loading

In [None]:
# Load all embeddings by model name (shared across versions)
print(f"Loading embeddings for model '{EMBEDDING_MODEL}'...")
with get_db() as db:
    emb_count = db.get_embedding_count(embedding_model=EMBEDDING_MODEL)
    if emb_count == 0:
        raise RuntimeError(
            f"No embeddings found for model '{EMBEDDING_MODEL}'. "
            f"Generate them first:\n"
            f"  python3 scripts/embeddings/01_generate_embeddings.py --model {EMBEDDING_MODEL}"
        )
    all_data = db.get_all_embeddings(embedding_model=EMBEDDING_MODEL)

documents = [f"{d['title']}\n\n{d['content']}" for d in all_data]
embeddings = np.array([d['embedding'] for d in all_data])
article_ids = [d['article_id'] for d in all_data]
titles = [d['title'] for d in all_data]
contents = [d['content'] for d in all_data]
sources = [d['source_id'] for d in all_data]

print(f"Corpus size: {len(documents)} articles")
print(f"Embedding dimensions: {embeddings.shape[1]}")
for src_id, src_name in SOURCE_NAMES.items():
    count = sum(1 for s in sources if s == src_id)
    print(f"  {src_name}: {count} articles")

## 3. Fit BERTopic

In [4]:
import random
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load embedding model for topic representation
print(f"Loading SentenceTransformer model ({EMBEDDING_MODEL})...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

custom_stop_words = sorted(set(ENGLISH_STOP_WORDS) | {"sri", "lanka", "lankan", "lankans", "cyclone", "ditwah", "cyclone ditwah", "disaster", "affected"})

umap_model = UMAP(
    n_neighbors=UMAP_N_NEIGHBORS,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)

hdbscan_model = HDBSCAN(
    min_cluster_size=MIN_TOPIC_SIZE,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
    core_dist_n_jobs=1,
)

vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=custom_stop_words,
    min_df=MIN_DF,
)

model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    representation_model=KeyBERTInspired(),
    top_n_words=TOP_WORDS,
    verbose=False,
)

print("Fitting BERTopic on full corpus...")
topics, probs = model.fit_transform(documents, embeddings)

topic_info = model.get_topic_info()
n_topics = len(topic_info[topic_info['Topic'] != -1])
n_outliers = sum(1 for t in topics if t == -1)
print(f"\nTopics discovered: {n_topics}")
print(f"Outlier articles: {n_outliers}/{len(documents)} ({100*n_outliers/len(documents):.1f}%)")
print(f"Assigned articles: {len(documents) - n_outliers}")

Loading SentenceTransformer model (google/embeddinggemma-300m)...
Fitting BERTopic on full corpus...

Topics discovered: 44
Outlier articles: 396/1657 (23.9%)
Assigned articles: 1261


## 4. Top 10 Topics

In [5]:
# Get top 10 non-outlier topics by article count
top_topics_df = (
    topic_info[topic_info['Topic'] != -1]
    .sort_values('Count', ascending=False)
    .head(N_TOPICS)
    .reset_index(drop=True)
)

print(f"Top {N_TOPICS} topics by article count:")
print(f"{'='*80}")
for i, row in top_topics_df.iterrows():
    tid = row['Topic']
    words = model.get_topic(tid)
    keyword_str = ', '.join(w for w, _ in words) if words else ''
    print(f"  Topic {tid:3d} | {row['Count']:4d} articles | {keyword_str}")

total_in_top = top_topics_df['Count'].sum()
total_assigned = len(documents) - n_outliers
print(f"\nTop {N_TOPICS} topics cover {total_in_top}/{total_assigned} assigned articles ({100*total_in_top/total_assigned:.1f}%)")

display(top_topics_df[['Topic', 'Count', 'Name']])

Top 15 topics by article count:
  Topic   0 |   82 articles | country, government rebuilding, sanath, secretary president, financial, chief executive, secretary, chief, finance, corporate
  Topic   1 |   64 articles | ceylon, development ministry, disruptions, gov, government, ministry, sector, impacted, export, national
  Topic   2 |   63 articles | humanitarian assistance, indian naval, indian air force, naval, india, colombo, relief supplies, indian army, search rescue, operation sagar bandhu
  Topic   3 |   55 articles | rescue, landslide, flood, landslides, nuwara eliya, displaced, colombo, report, anuradhapura, ongoing
  Topic   4 |   54 articles | military, secretary, country, ministry, national, navy, defence, today, recovery, president
  Topic   5 |   52 articles | srilankan, nature, resilience, srilankan airlines, colombo, nuwara eliya, wildlife, natural, destination, risk
  Topic   6 |   51 articles | maldives, relief efforts, singapore, relief supplies, colombo, humanitaria

Unnamed: 0,Topic,Count,Name
0,0,82,0_country_government rebuilding_sanath_secreta...
1,1,64,1_ceylon_development ministry_disruptions_gov
2,2,63,2_humanitarian assistance_indian naval_indian ...
3,3,55,3_rescue_landslide_flood_landslides
4,4,54,4_military_secretary_country_ministry
5,5,52,5_srilankan_nature_resilience_srilankan airlines
6,6,51,6_maldives_relief efforts_singapore_relief sup...
7,7,42,7_foreign_infrastructure_rs trillion_internati...
8,8,42,8_cyclonic storm_heavy rainfall_heavy rain_cyc...
9,9,41,9_disasters_climate_resilience_foreign


## 5. Keywords & Representative Articles per Topic

In [6]:
topics_arr = np.array(topics)
probs_arr = np.array(probs) if probs is not None else None

topic_details = []

for _, row in top_topics_df.iterrows():
    tid = row['Topic']

    # Get keywords with scores
    words = model.get_topic(tid)
    keywords = [(w, float(s)) for w, s in words[:TOP_WORDS]] if words else []

    # Get articles assigned to this topic
    mask = topics_arr == tid
    indices = np.where(mask)[0]

    # Sort by probability if available, otherwise just take first N
    if probs_arr is not None and probs_arr.ndim == 2 and probs_arr.shape[1] > 0:
        # Multi-topic probabilities - find column for this topic
        topic_ids_sorted = sorted(set(topics) - {-1})
        if tid in topic_ids_sorted:
            col_idx = topic_ids_sorted.index(tid)
            topic_probs = probs_arr[indices, col_idx] if col_idx < probs_arr.shape[1] else np.ones(len(indices))
        else:
            topic_probs = np.ones(len(indices))
        sorted_order = np.argsort(-topic_probs)
    elif probs_arr is not None and probs_arr.ndim == 1:
        topic_probs = probs_arr[indices]
        sorted_order = np.argsort(-topic_probs)
    else:
        sorted_order = np.arange(len(indices))

    top_indices = indices[sorted_order[:TOP_ARTICLES]]

    rep_articles = []
    for idx in top_indices:
        rep_articles.append({
            'article_id': article_ids[idx],
            'title': titles[idx],
            'excerpt': contents[idx][:500],
            'source': SOURCE_NAMES.get(sources[idx], sources[idx]),
        })

    detail = {
        'topic_id': tid,
        'article_count': int(row['Count']),
        'keywords': keywords,
        'representative_articles': rep_articles,
    }
    topic_details.append(detail)

# Display summary
for d in topic_details:
    print(f"\n{'='*80}")
    print(f"Topic {d['topic_id']} ({d['article_count']} articles)")
    print(f"{'='*80}")
    kw_str = ', '.join(f"{w} ({s:.3f})" for w, s in d['keywords'])
    print(f"Keywords: {kw_str}")
    print(f"\nRepresentative articles:")
    for i, art in enumerate(d['representative_articles'], 1):
        print(f"  {i}. [{art['source']}] {art['title'][:80]}")


Topic 0 (82 articles)
Keywords: country (0.354), government rebuilding (0.352), sanath (0.343), secretary president (0.342), financial (0.338), chief executive (0.336), secretary (0.336), chief (0.335), finance (0.334), corporate (0.333)

Representative articles:
  1. [Daily FT] Rs. 20 m donated to Rebuilding Sri Lanka
  2. [The Island] Bandaranaike Memorial National Foundation donates Rs. 250 million to the Governm
  3. [Daily News] Naturub Group of Companies donates Rs. 100 Million to ‘Rebuilding Sri Lanka’ Fun
  4. [Daily News] BNMF donates Rs. 250 Mn for recovery efforts
  5. [Daily News] Thilakawardena Textiles donates to the ‘Rebuilding Sri Lanka’ Fund
  6. [Daily FT] Anunine Holdings donates Rs. 25 m to ‘Rebuilding Sri Lanka’ Fund
  7. [The Island] Lakbima Rice Mills (Pvt) Ltd donates Rs 100 million to the ‘Rebuilding Sri Lanka
  8. [Daily News] Union Chemicals donates Rs. 1 Mn to ‘Rebuilding Sri Lanka’ Fund
  9. [Daily FT] Yaden Laboratories donates Rs. 20 m to ’Rebuilding Sri

## 6. LLM Aspect Labelling

In [7]:
llm = get_llm()
print(f"LLM loaded: {llm.model} ({type(llm).__name__})")

SYSTEM_PROMPT = """You are an expert media analyst studying how Sri Lankan English newspapers covered \
Cyclone Ditwah (November 2025 - December 2025). The cyclone caused significant damage in Sri Lanka, \
triggering government response, international aid, and extensive media coverage across multiple outlets.

You are analysing topics discovered by BERTopic from the full corpus of ~1600 articles. \
For each topic, you will receive its top keywords (with importance scores) and representative \
article titles and excerpts.

Your task is to identify what specific aspect of the Ditwah cyclone event this topic captures."""

llm_results = []

for i, d in enumerate(topic_details):
    print(f"\nLabelling topic {d['topic_id']} ({i+1}/{len(topic_details)})...")

    # Build keyword section
    kw_lines = [f"  {w} (score: {s:.4f})" for w, s in d['keywords']]
    kw_section = "\n".join(kw_lines)

    # Build articles section
    art_lines = []
    for j, art in enumerate(d['representative_articles'], 1):
        art_lines.append(f"  Article {j} [{art['source']}]:")
        art_lines.append(f"    Title: {art['title']}")
        art_lines.append(f"    Excerpt: {art['excerpt'][:400]}")
    art_section = "\n".join(art_lines)

    prompt = f"""Below is a topic discovered from Sri Lankan newspaper coverage of Cyclone Ditwah.

TOPIC {d['topic_id']} ({d['article_count']} articles)

Top {TOP_WORDS} keywords (with BERTopic importance scores):
{kw_section}

Top {len(d['representative_articles'])} representative articles:
{art_section}

Based on the keywords and articles above, provide:
1. A short aspect phrase (2-5 words) that captures what this topic is about
2. A 1-2 sentence description explaining what this topic covers in the context of Cyclone Ditwah coverage

Respond with JSON in this exact format:
{{"aspect": "short phrase here", "description": "1-2 sentence description here"}}"""

    try:
        response = llm.generate(prompt, system_prompt=SYSTEM_PROMPT, json_mode=True)
        result = json.loads(response.content)
        aspect = result.get('aspect', 'Unknown')
        description = result.get('description', '')
        print(f"  -> {aspect}")
    except Exception as e:
        print(f"  LLM error: {e}")
        # Fallback: use top keywords as label
        aspect = ', '.join(w for w, _ in d['keywords'][:3])
        description = f"Topic characterised by keywords: {', '.join(w for w, _ in d['keywords'][:8])}"
        print(f"  -> Fallback: {aspect}")

    llm_results.append({
        'topic_id': d['topic_id'],
        'aspect': aspect,
        'description': description,
        'article_count': d['article_count'],
        'keywords': d['keywords'],
    })

print(f"\nAll {len(llm_results)} topics labelled.")

LLM loaded: gpt-4o (OpenAILLM)

Labelling topic 0 (1/15)...
  -> Corporate Donations for Rebuilding

Labelling topic 1 (2/15)...
  -> Industrial sector recovery

Labelling topic 2 (3/15)...
  -> India's Humanitarian Aid

Labelling topic 3 (4/15)...
  -> Casualties and Displacement

Labelling topic 4 (5/15)...
  -> Education sector recovery

Labelling topic 5 (6/15)...
  -> Tourism impact and recovery

Labelling topic 6 (7/15)...
  -> Chinese aid and support

Labelling topic 7 (8/15)...
  -> Fiscal response to cyclone

Labelling topic 8 (9/15)...
  -> Meteorological updates and forecasts

Labelling topic 9 (10/15)...
  -> Economic recovery and aid

Labelling topic 10 (11/15)...
  -> Infrastructure and Road Recovery

Labelling topic 11 (12/15)...
  -> IMF Emergency Aid Request

Labelling topic 12 (13/15)...
  -> Flood and irrigation restoration

Labelling topic 13 (14/15)...
  -> Disaster Preparedness Critique

Labelling topic 14 (15/15)...
  -> Christmas and Cyclone Aftermath

All 15 to

## 7. Results

In [10]:
# Build results table
results_rows = []
for r in llm_results:
    top_kw = ', '.join(w for w, _ in r['keywords'][:8])
    results_rows.append({
        'Topic ID': r['topic_id'],
        'Aspect': r['aspect'],
        'Description': r['description'],
        'Articles': r['article_count'],
        'Top Keywords': top_kw,
    })

results_df = pd.DataFrame(results_rows)
with pd.option_context('display.max_colwidth', None):
    display(results_df[['Aspect', 'Description']])

Unnamed: 0,Aspect,Description
0,Corporate Donations for Rebuilding,"This topic covers the financial contributions made by various corporations and organizations to the 'Rebuilding Sri Lanka' Fund, established by the government to aid in the recovery and rebuilding efforts following the devastation caused by Cyclone Ditwah. It highlights the involvement of key figures and entities in supporting national recovery initiatives."
1,Industrial sector recovery,"This topic covers the impact of Cyclone Ditwah on Sri Lanka's industrial sector, focusing on government efforts to collect data on affected industries and provide financial support for recovery. It highlights the Ministry of Industry's initiatives to assess damage and facilitate the restoration of businesses, including micro, small, and medium-sized enterprises."
2,India's Humanitarian Aid,"This topic covers India's extensive humanitarian assistance to Sri Lanka in the aftermath of Cyclone Ditwah, primarily through 'Operation Sagar Bandhu'. It highlights the deployment of Indian naval and air force resources to deliver relief supplies, conduct search and rescue operations, and provide medical support to the affected regions."
3,Casualties and Displacement,"This topic covers the human toll of Cyclone Ditwah, focusing on the number of deaths, missing persons, and the widespread displacement caused by the cyclone. It highlights the reports from the Disaster Management Centre on the impact of floods and landslides across various districts, particularly emphasizing the severe effects in areas like Kandy."
4,Education sector recovery,"This topic covers the impact of Cyclone Ditwah on the education sector in Sri Lanka, highlighting the damage to schools and the government's efforts to rebuild and relocate affected educational institutions. It also discusses the prioritization of children's psychosocial well-being and the involvement of various stakeholders in the recovery process."
5,Tourism impact and recovery,"This topic covers the impact of Cyclone Ditwah on Sri Lanka's tourism industry, highlighting the resilience and recovery efforts to assure safety and normalcy for tourists. It discusses the challenges faced by the sector, the response from tourism authorities, and the continued influx of tourists despite the cyclone's devastation."
6,Chinese aid and support,"This topic covers the extensive humanitarian aid and relief efforts provided by China to Sri Lanka in the aftermath of Cyclone Ditwah. It highlights financial donations, relief supplies, and technical assistance, such as drone expertise, to support recovery and rebuilding efforts in affected areas."
7,Fiscal response to cyclone,"This topic covers the Sri Lankan government's financial strategies and budgetary adjustments in response to Cyclone Ditwah. It highlights the allocation of funds for recovery and reconstruction, the impact on fiscal targets, and the government's efforts to maintain economic stability while addressing the cyclone's aftermath."
8,Meteorological updates and forecasts,"This topic covers the meteorological updates and forecasts related to Cyclone Ditwah, including its development, movement, and the expected weather conditions such as heavy rainfall and strong winds across Sri Lanka. It highlights the role of the Department of Meteorology and other meteorological centers in issuing warnings and alerts as the cyclone approached and moved away from the island."
9,Economic recovery and aid,"This topic covers the economic impact of Cyclone Ditwah on Sri Lanka, focusing on the country's resilience and recovery efforts. It highlights the role of international aid, foreign direct investment, and government measures in rebuilding infrastructure and managing the financial crisis triggered by the cyclone."


## 8. Selection Bias Analysis

For each of the 15 LLM-labelled topics, compute how much of each outlet's coverage is devoted to that topic. High variance across outlets signals **selection bias** — some outlets emphasise certain aspects of the cyclone while others downplay or ignore them.

In [11]:
import plotly.express as px

# Build aspect lookup from LLM results
aspect_lookup = {r['topic_id']: r['aspect'] for r in llm_results}
top_topic_ids = [r['topic_id'] for r in llm_results]

# Count total articles per outlet (full corpus, not just assigned)
outlet_totals = {}
for src_id, src_name in SOURCE_NAMES.items():
    outlet_totals[src_name] = sum(1 for s in sources if s == src_id)

# For each topic x outlet: count articles and compute proportion
rows = []
for tid in top_topic_ids:
    mask = topics_arr == tid
    topic_sources = [sources[i] for i in range(len(sources)) if mask[i]]

    outlet_counts = {}
    outlet_props = {}
    for src_id, src_name in SOURCE_NAMES.items():
        count = sum(1 for s in topic_sources if s == src_id)
        prop = count / outlet_totals[src_name] if outlet_totals[src_name] > 0 else 0.0
        outlet_counts[src_name] = count
        outlet_props[src_name] = prop

    props = list(outlet_props.values())
    variance = np.var(props)
    spread = max(props) - min(props)
    max_outlet = max(outlet_props, key=outlet_props.get)
    min_outlet = min(outlet_props, key=outlet_props.get)

    row = {
        'Topic ID': tid,
        'Aspect': aspect_lookup[tid],
        **{f'{name} (count)': outlet_counts[name] for name in outlet_totals},
        **{f'{name} (%)': outlet_props[name] * 100 for name in outlet_totals},
        'Variance': variance,
        'Spread': spread,
        'Max Coverage': max_outlet,
        'Min Coverage': min_outlet,
    }
    rows.append(row)

bias_df = pd.DataFrame(rows).sort_values('Variance', ascending=False).reset_index(drop=True)

# Display summary table
outlet_names = list(outlet_totals.keys())
display_cols = ['Aspect'] + [f'{n} (%)' for n in outlet_names] + ['Spread', 'Max Coverage', 'Min Coverage']
styled = bias_df[display_cols].copy()
for n in outlet_names:
    styled[f'{n} (%)'] = styled[f'{n} (%)'].map('{:.1f}%'.format)
styled['Spread'] = (bias_df['Spread'] * 100).map('{:.1f}pp'.format)

print("Per-outlet topic proportions (% of each outlet's articles), sorted by selection bias strength:\n")
with pd.option_context('display.max_colwidth', 40):
    display(styled)

Per-outlet topic proportions (% of each outlet's articles), sorted by selection bias strength:



Unnamed: 0,Aspect,Daily News (%),The Morning (%),Daily FT (%),The Island (%),Spread,Max Coverage,Min Coverage
0,Corporate Donations for Rebuilding,6.8%,1.6%,7.6%,3.9%,6.0pp,Daily FT,The Morning
1,Flood and irrigation restoration,1.8%,4.7%,0.5%,0.4%,4.3pp,The Morning,The Island
2,Education sector recovery,4.4%,4.9%,0.8%,1.8%,4.1pp,The Morning,Daily FT
3,Fiscal response to cyclone,2.6%,1.2%,5.3%,1.1%,4.2pp,Daily FT,The Island
4,Meteorological updates and forecasts,3.2%,2.0%,0.5%,4.9%,4.4pp,The Island,Daily FT
5,Infrastructure and Road Recovery,2.0%,4.5%,0.3%,2.1%,4.2pp,The Morning,Daily FT
6,Casualties and Displacement,3.4%,3.7%,1.3%,5.3%,4.0pp,The Island,Daily FT
7,Tourism impact and recovery,2.6%,2.0%,5.5%,2.8%,3.5pp,Daily FT,The Morning
8,Christmas and Cyclone Aftermath,1.4%,3.9%,0.5%,1.4%,3.3pp,The Morning,Daily FT
9,Economic recovery and aid,1.6%,3.5%,3.4%,1.1%,2.4pp,The Morning,The Island


In [12]:
# Proportion heatmap: topics (sorted by variance) x outlets
heatmap_data = bias_df[['Aspect'] + [f'{n} (%)' for n in outlet_names]].copy()
heatmap_data = heatmap_data.set_index('Aspect')
heatmap_data.columns = outlet_names

fig = px.imshow(
    heatmap_data.values,
    x=outlet_names,
    y=heatmap_data.index.tolist(),
    text_auto='.1f',
    color_continuous_scale='YlOrRd',
    labels={'x': 'Outlet', 'y': 'Topic', 'color': '% of outlet articles'},
    aspect='auto',
)
fig.update_layout(
    title='Coverage Proportion by Outlet (% of each outlet\'s articles)',
    height=600,
    width=800,
    yaxis=dict(dtick=1),
)
fig.show()

In [15]:
# Grouped bar chart: article counts per topic per outlet
bar_rows = []
for _, row in bias_df.iterrows():
    for name in outlet_names:
        bar_rows.append({
            'Aspect': row['Aspect'],
            'Outlet': name,
            'Articles': row[f'{name} (count)'],
        })

bar_df = pd.DataFrame(bar_rows)

fig = px.bar(
    bar_df,
    x='Aspect',
    y='Articles',
    color='Outlet',
    barmode='group',
    color_discrete_map=SOURCE_COLORS,
    title='Article Counts per Topic by Outlet',
)
fig.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000,
    xaxis_title='',
    legend_title='Outlet',
)
fig.show()

In [16]:
# Spread chart: max - min proportion per topic (selection bias signal strength)
spread_df = bias_df[['Aspect', 'Spread']].copy()
spread_df['Spread (pp)'] = spread_df['Spread'] * 100

fig = px.bar(
    spread_df,
    x='Aspect',
    y='Spread (pp)',
    color='Spread (pp)',
    color_continuous_scale='Reds',
    title='Selection Bias Signal: Coverage Spread (max - min outlet proportion)',
    labels={'Spread (pp)': 'Spread (percentage points)'},
    text_auto='.1f',
)
fig.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000,
    xaxis_title='',
    yaxis_title='Spread (percentage points)',
    showlegend=False,
)
fig.show()

In [17]:
# Selection bias summary
print("SELECTION BIAS RANKING")
print("=" * 80)
print(f"Topics ranked by coverage spread (max - min outlet proportion):\n")

for rank, (_, row) in enumerate(bias_df.iterrows(), 1):
    spread_pp = row['Spread'] * 100
    aspect = row['Aspect']
    max_out = row['Max Coverage']
    min_out = row['Min Coverage']

    per_outlet = ' | '.join(
        f"{n}: {row[f'{n} (%)']:.1f}%" for n in outlet_names
    )

    print(f"{rank:2d}. {aspect}")
    print(f"    Spread: {spread_pp:.1f}pp  |  Most: {max_out}  |  Least: {min_out}")
    print(f"    {per_outlet}")
    print()

SELECTION BIAS RANKING
Topics ranked by coverage spread (max - min outlet proportion):

 1. Corporate Donations for Rebuilding
    Spread: 6.0pp  |  Most: Daily FT  |  Least: The Morning
    Daily News: 6.8% | The Morning: 1.6% | Daily FT: 7.6% | The Island: 3.9%

 2. Flood and irrigation restoration
    Spread: 4.3pp  |  Most: The Morning  |  Least: The Island
    Daily News: 1.8% | The Morning: 4.7% | Daily FT: 0.5% | The Island: 0.4%

 3. Education sector recovery
    Spread: 4.1pp  |  Most: The Morning  |  Least: Daily FT
    Daily News: 4.4% | The Morning: 4.9% | Daily FT: 0.8% | The Island: 1.8%

 4. Fiscal response to cyclone
    Spread: 4.2pp  |  Most: Daily FT  |  Least: The Island
    Daily News: 2.6% | The Morning: 1.2% | Daily FT: 5.3% | The Island: 1.1%

 5. Meteorological updates and forecasts
    Spread: 4.4pp  |  Most: The Island  |  Least: Daily FT
    Daily News: 3.2% | The Morning: 2.0% | Daily FT: 0.5% | The Island: 4.9%

 6. Infrastructure and Road Recovery
    Spr