# Per-Outlet Topic Modeling & Cross-Outlet Comparison

This notebook runs BERTopic **independently per newspaper outlet**, then cross-identifies matching topics
and compares how outlets frame the same topics differently.

## Approach
1. Load pre-computed embeddings (same `google/embeddinggemma-300m` space across all outlets)
2. Fit separate BERTopic models per outlet with adjusted parameters for smaller corpora
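3. Match topics across outlets using a weighted 3-method ensemble (centroid similarity, keyword overlap, c-TF-IDF); with the weights set in the configuration cell below (`WEIGHT_CENTROID = 1.0`, the other two `0`), only centroid similarity contributes to the score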
4. Use the Hungarian algorithm for optimal 1-to-1 matching per outlet pair
5. Group matched topics transitively via Union-Find
6. Visualize coverage differences, keyword divergence, and selection bias signals

## 1. Setup & Data Loading

In [23]:
import sys
import json
import warnings
import numpy as np
import pandas as pd
from collections import defaultdict
from itertools import combinations

from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cosine

import plotly.graph_objects as go
import plotly.express as px

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer

# Silence every warning category for the rest of the notebook.
# NOTE(review): this blanket filter also hides numerical RuntimeWarnings (e.g. divisions
# by zero in similarity computations); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Add the parent directory to the import path so the `src` and `dashboard` imports
# below resolve — presumably the notebook lives one level below the project root.
sys.path.append('..')

from src.db import get_db
from src.llm import get_llm
from dashboard.components.source_mapping import SOURCE_NAMES, SOURCE_COLORS

print("All imports successful")

All imports successful


In [24]:
# Configuration
# Source ids used to split the loaded embeddings per outlet and to look up display names.
OUTLETS = ["dailynews_en", "themorning_en", "ft_en", "island_en"]
# Passed to UMAP as random_state when fitting each outlet's model.
RANDOM_SEED = 42
# Name of the embedding model whose stored vectors are loaded from the database.
EMBEDDING_MODEL = "google/embeddinggemma-300m"

# BERTopic parameters adjusted for smaller per-outlet corpora (roughly 300-500 articles each)
MIN_TOPIC_SIZE = 5       # HDBSCAN min_cluster_size
MIN_DF = 2               # CountVectorizer min_df
UMAP_N_NEIGHBORS = 10    # UMAP n_neighbors

# Cross-outlet matching parameters
# The ensemble score is WEIGHT_CENTROID * centroid + WEIGHT_CTFIDF * c-TF-IDF + WEIGHT_KEYWORD * Jaccard.
# With the values below only the centroid similarity contributes.
# NOTE(review): the weights are not normalised where they are applied, so MATCH_THRESHOLD
# is only comparable to a [0, 1] similarity when they sum to 1.
WEIGHT_CENTROID = 1.0
WEIGHT_CTFIDF = 0
WEIGHT_KEYWORD = 0
MATCH_THRESHOLD = 0.75   # minimum ensemble score for an assigned pair to be kept as a match
KEYWORD_TOP_N = 30       # keywords per topic used for Jaccard overlap and in the LLM prompt

# LLM aspect labelling
N_TOPICS = 15       # top topics per outlet to label
TOP_ARTICLES = 10   # representative articles per topic for LLM prompt

In [25]:
# Guard: everything below needs stored embeddings for EMBEDDING_MODEL
with get_db() as db:
    emb_count = db.get_embedding_count(embedding_model=EMBEDDING_MODEL)

if emb_count != 0:
    print(f"Found {emb_count} embeddings for model '{EMBEDDING_MODEL}'")
else:
    # Nothing to analyse: stop here and point at the generation script.
    raise RuntimeError(
        f"No embeddings found for model '{EMBEDDING_MODEL}'. "
        f"Generate them first:\n"
        f"  python3 scripts/embeddings/01_generate_embeddings.py --model {EMBEDDING_MODEL}"
    )

Found 1657 embeddings for model 'google/embeddinggemma-300m'


In [26]:
# Fetch every stored embedding for the model, then bucket the records per outlet
print("Loading embeddings from database...")
with get_db() as db:
    all_data = db.get_all_embeddings(embedding_model=EMBEDDING_MODEL)

print(f"Total articles loaded: {len(all_data)}")

outlet_data = {}
for outlet in OUTLETS:
    records = [d for d in all_data if d['source_id'] == outlet]

    # Titles and bodies are kept separately and also joined into one document per article.
    record_titles = [rec['title'] for rec in records]
    record_bodies = [rec['content'] for rec in records]
    joined_documents = [
        f"{title}\n\n{body}" for title, body in zip(record_titles, record_bodies)
    ]

    outlet_data[outlet] = {
        'documents': joined_documents,
        'contents': record_bodies,
        'embeddings': np.array([rec['embedding'] for rec in records]),
        'article_ids': [rec['article_id'] for rec in records],
        'titles': record_titles,
    }
    print(f"  {SOURCE_NAMES[outlet]}: {len(records)} articles")

# The per-outlet structures now hold everything needed; drop the combined list.
del all_data

Loading embeddings from database...
Total articles loaded: 1657
  Daily News: 502 articles
  The Morning: 491 articles
  Daily FT: 380 articles
  The Island: 284 articles


## 2. Per-Outlet BERTopic Modeling

In [27]:
# Load embedding model once, share across all fits
print(f"Loading SentenceTransformer model ({EMBEDDING_MODEL})...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# scikit-learn's English stop words plus country terms — presumably because these
# occur in nearly every article and would otherwise dominate topic keywords.
custom_stop_words = sorted(set(ENGLISH_STOP_WORDS) | {"sri", "lanka", "lankan", "lankans"})

# outlet -> fitted model, per-document topic assignments, and the inputs used for the fit
outlet_results = {}

for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    data = outlet_data[outlet]
    n_docs = len(data['documents'])
    print(f"\n{'='*60}")
    print(f"Fitting BERTopic for {name} ({n_docs} articles)")
    print(f"{'='*60}")

    # Fresh sub-models are built for every outlet so no fitted state is shared between fits.
    umap_model = UMAP(
        n_neighbors=UMAP_N_NEIGHBORS,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=RANDOM_SEED,
    )

    # NOTE(review): core_dist_n_jobs=1 is presumably chosen for reproducible runs — confirm.
    hdbscan_model = HDBSCAN(
        min_cluster_size=MIN_TOPIC_SIZE,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
        core_dist_n_jobs=1,
    )

    # Unigrams through trigrams; terms appearing in fewer than MIN_DF documents are dropped.
    vectorizer = CountVectorizer(
        ngram_range=(1, 3),
        stop_words=custom_stop_words,
        min_df=MIN_DF,
    )

    model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer,
        # Disabled alternative representation; remove or re-enable deliberately:
        # representation_model=KeyBERTInspired(),
        top_n_words=30,
        verbose=False,
    )

    # The pre-computed embeddings are passed in alongside the documents.
    topics, probs = model.fit_transform(data['documents'], data['embeddings'])
    topic_info = model.get_topic_info()

    # Topic -1 is the outlier bucket and is excluded from the topic count.
    n_topics = len(topic_info[topic_info['Topic'] != -1])
    n_outliers = sum(1 for t in topics if t == -1)
    print(f"  Topics: {n_topics}, Outliers: {n_outliers}/{n_docs}")

    outlet_results[outlet] = {
        'model': model,
        'topics': topics,
        'probs': probs,
        'topic_info': topic_info,
        'documents': data['documents'],
        'embeddings': data['embeddings'],
        'article_ids': data['article_ids'],
    }

print("\nAll models fitted successfully.")

Loading SentenceTransformer model (google/embeddinggemma-300m)...

Fitting BERTopic for Daily News (502 articles)
  Topics: 28, Outliers: 52/502

Fitting BERTopic for The Morning (491 articles)
  Topics: 24, Outliers: 75/491

Fitting BERTopic for Daily FT (380 articles)
  Topics: 16, Outliers: 43/380

Fitting BERTopic for The Island (284 articles)
  Topics: 17, Outliers: 66/284

All models fitted successfully.


In [28]:
# One summary row per outlet: corpus size, topic count and outlier share
summary_rows = []
for outlet in OUTLETS:
    r = outlet_results[outlet]
    ti = r['topic_info']
    article_total = len(r['documents'])
    # Topic -1 is the outlier bucket, so it is excluded from the topic count.
    n_topics = int((ti['Topic'] != -1).sum())
    n_outliers = sum(t == -1 for t in r['topics'])
    outlier_share = f"{100*n_outliers/article_total:.1f}%"
    summary_rows.append({
        'Outlet': SOURCE_NAMES[outlet],
        'Articles': article_total,
        'Topics': n_topics,
        'Outliers': n_outliers,
        'Outlier %': outlier_share,
    })

summary_df = pd.DataFrame(summary_rows)
print("Per-Outlet Topic Modeling Summary")
display(summary_df)

Per-Outlet Topic Modeling Summary


Unnamed: 0,Outlet,Articles,Topics,Outliers,Outlier %
0,Daily News,502,28,52,10.4%
1,The Morning,491,24,75,15.3%
2,Daily FT,380,16,43,11.3%
3,The Island,284,17,66,23.2%


## 3. Per-Outlet Topic Overview

In [29]:
# Print every non-outlier topic of each outlet, largest first, with its top keywords/ngrams
TOP_N_WORDS = 30

for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    r = outlet_results[outlet]
    ti = r['topic_info']
    ti_topics = ti[ti['Topic'] != -1].sort_values('Count', ascending=False)

    banner = '=' * 100
    print(f"\n{banner}")
    print(f"  {name} — {len(ti_topics)} topics, {len(r['documents'])} articles")
    print(f"{banner}")

    for _, row in ti_topics.iterrows():
        tid = row['Topic']
        count = row['Count']
        words = r['model'].get_topic(tid)
        # Fall back to a placeholder when the model has no keywords for the topic.
        keyword_str = (
            ', '.join(term for term, _score in words[:TOP_N_WORDS])
            if words
            else '(no keywords)'
        )
        print(f"\n  Topic {tid} ({count} articles)")
        print(f"    {keyword_str}")


  Daily News — 28 topics, 502 articles

  Topic 0 (58 articles)
    disaster, climate, people, disasters, time, government, ditwah, natural, country, need, public, lives, relief, media, systems, global, world, cyclone, countries, rescue, affected, economic, like, tsunami, change, floods, response, early, national, infrastructure

  Topic 1 (35 articles)
    fund, rebuilding fund, rebuilding, rs, million, group, plc, donates, kumanayake, nandika, nandika sanath, dr nandika sanath, nandika sanath kumanayake, dr nandika, sanath kumanayake, fund established, presidential secretariat, sanath, dr, rebuilding fund established, cheque, presidential, donates rs, government rebuilding, government rebuilding fund, established, handed, president, contribution, secretariat

  Topic 2 (33 articles)
    said, minister, government, parliament, rs, disaster, country, opposition, billion, president, cabinet, 000, taken, people, budget, management, committee, affected, development, yesterday, debate, ec

In [30]:
# Per-outlet heatmap of similarity between that outlet's own topics
for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    model = outlet_results[outlet]['model']
    # Count non-outlier topics from a single get_topic_info() call.
    current_info = model.get_topic_info()
    n_topics = int((current_info['Topic'] != -1).sum())
    heatmap_title = f"{name} — Inter-Topic Similarity"
    fig = model.visualize_heatmap(top_n_topics=n_topics, title=heatmap_title)
    fig.show()

In [31]:
# Build an article_id -> URL lookup from the database
print("Loading article URLs...")
with get_db() as db:
    articles_with_urls = {}
    for row in db.get_articles():
        # Prefer 'url', then 'link'; otherwise show the raw id.
        # NOTE(review): the accessor and field names are assumptions about the schema — confirm.
        id_placeholder = f"ID:{row['id']}"
        articles_with_urls[row['id']] = row.get('url', row.get('link', id_placeholder))

# Print every article (title + URL) grouped under its assigned topic
for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    r = outlet_results[outlet]
    ti = r['topic_info']
    ti_topics = ti[ti['Topic'] != -1].sort_values('Count', ascending=False)
    # Converted once per outlet; reused for every topic's membership mask.
    assignments = np.array(r['topics'])
    outlet_titles = outlet_data[outlet]['titles']

    rule = '=' * 100
    print(f"\n{rule}")
    print(f"  {name} — Articles by Topic")
    print(f"{rule}")

    for _, row in ti_topics.iterrows():
        tid = row['Topic']
        words = r['model'].get_topic(tid)
        if words:
            keyword_str = ', '.join(term for term, _score in words[:20])
        else:
            keyword_str = ''

        print(f"\n  Topic {tid}: {keyword_str}")

        article_indices = np.flatnonzero(assignments == tid)

        for idx in article_indices:
            article_id = r['article_ids'][idx]
            title = outlet_titles[idx]
            url = articles_with_urls.get(article_id, 'No URL')
            print(f"    • {title[:70]}")
            print(f"      {url}")

Loading article URLs...

  Daily News — Articles by Topic

  Topic 0: disaster, climate, people, disasters, time, government, ditwah, natural, country, need, public, lives, relief, media, systems, global, world, cyclone, countries, rescue
    • New paradigm for climate multilateralism in South Asia
      https://dailynews.lk/2025/11/28/features/903627/new-paradigm-for-climate-multilateralism-in-south-asia/
    • Climate Change at work
      https://dailynews.lk/2025/11/29/editorial/904474/climate-change-at-work/
    • The storm speaks!
      https://dailynews.lk/2025/11/29/sachitra-mahendra/904492/the-storm-speaks/
    • Double down to tackle Sri Lanka’s freshwater tsunami
      https://dailynews.lk/2025/12/01/features/905494/double-down-to-tackle-sri-lankas-freshwater-tsunami/
    • Vigilance, the best weapon
      https://dailynews.lk/2025/12/01/general-opinion/905430/vigilance-the-best-weapon-2/
    • Ditwah’s wake-up call
      https://dailynews.lk/2025/12/02/general-opinion/906117

## 3b. LLM Aspect Labelling

For each outlet, select the top `N_TOPICS` (15) topics by article count, extract their keywords and
most representative articles, then ask an LLM for a short aspect label (2-5 words) and a 1-2 sentence
description per topic. This makes cross-outlet comparisons and visualizations much more readable.

In [32]:
# Extract topic details per outlet for LLM labelling
# outlet -> list of dicts (topic_id, article_count, keywords, representative_articles),
# ordered by descending article count.
outlet_topic_details = {}

for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    r = outlet_results[outlet]
    ti = r['topic_info']

    # Select top N_TOPICS by article count (excluding outliers)
    top_topics = (
        ti[ti['Topic'] != -1]
        .sort_values('Count', ascending=False)
        .head(N_TOPICS)
    )

    topics_arr = np.array(r['topics'])
    probs_arr = np.array(r['probs']) if r['probs'] is not None else None

    details = []
    for _, row in top_topics.iterrows():
        tid = row['Topic']

        # Get keywords with scores
        words = r['model'].get_topic(tid)
        keywords = [(w, float(s)) for w, s in words[:KEYWORD_TOP_N]] if words else []

        # Get articles assigned to this topic, sorted by probability
        mask = topics_arr == tid
        indices = np.where(mask)[0]

        if probs_arr is not None and probs_arr.ndim == 2 and probs_arr.shape[1] > 0:
            # 2-D case: one column per topic. The column is picked by the position of
            # `tid` among the sorted non-outlier topic ids.
            # NOTE(review): assumes columns are ordered by ascending topic id — TODO confirm.
            topic_ids_sorted = sorted(set(r['topics']) - {-1})
            if tid in topic_ids_sorted:
                col_idx = topic_ids_sorted.index(tid)
                # Out-of-range column: fall back to equal weights (keeps original order).
                topic_probs = probs_arr[indices, col_idx] if col_idx < probs_arr.shape[1] else np.ones(len(indices))
            else:
                topic_probs = np.ones(len(indices))
            sorted_order = np.argsort(-topic_probs)
        elif probs_arr is not None and probs_arr.ndim == 1:
            # 1-D case: a single value per document; rank the topic's members by it.
            topic_probs = probs_arr[indices]
            sorted_order = np.argsort(-topic_probs)
        else:
            # No usable probabilities: keep the articles in their original order.
            sorted_order = np.arange(len(indices))

        top_indices = indices[sorted_order[:TOP_ARTICLES]]

        rep_articles = []
        for idx in top_indices:
            rep_articles.append({
                'article_id': r['article_ids'][idx],
                'title': outlet_data[outlet]['titles'][idx],
                # Only the first 500 characters of the body are kept as the excerpt.
                'excerpt': outlet_data[outlet]['contents'][idx][:500],
            })

        details.append({
            'topic_id': tid,
            'article_count': int(row['Count']),
            'keywords': keywords,
            'representative_articles': rep_articles,
        })

    outlet_topic_details[outlet] = details
    print(f"{name}: extracted details for {len(details)} topics")

print(f"\nTotal topics to label: {sum(len(d) for d in outlet_topic_details.values())}")

Daily News: extracted details for 15 topics
The Morning: extracted details for 15 topics
Daily FT: extracted details for 15 topics
The Island: extracted details for 15 topics

Total topics to label: 60


In [33]:
# LLM aspect labelling per outlet
# For each selected topic, build a prompt from its keywords and representative articles,
# ask the LLM for a JSON {"aspect", "description"} answer, and fall back to a
# keyword-based label if the call or the JSON parsing fails.
# NOTE(review): responses are not cached, so re-running this cell repeats every LLM call.
llm = get_llm()
print(f"LLM loaded: {llm.model} ({type(llm).__name__})")

# Shared system prompt giving the model the event and corpus context.
SYSTEM_PROMPT = """You are an expert media analyst studying how Sri Lankan English newspapers covered \
Cyclone Ditwah (November 2025 - December 2025). The cyclone caused significant damage in Sri Lanka, \
triggering government response, international aid, and extensive media coverage across multiple outlets.

You are analysing topics discovered by BERTopic from a single outlet's coverage (~300-500 articles). \
For each topic, you will receive its top keywords (with importance scores) and representative \
article titles and excerpts.

Your task is to identify what specific aspect of the Ditwah cyclone event this topic captures."""

outlet_llm_labels = {}   # outlet -> list of result dicts
aspect_lookup = {}       # (outlet, tid) -> aspect string

for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    details = outlet_topic_details[outlet]
    print(f"\n{'='*60}")
    print(f"  Labelling {name} ({len(details)} topics)")
    print(f"{'='*60}")

    results = []
    for i, d in enumerate(details):
        print(f"  Topic {d['topic_id']} ({i+1}/{len(details)})...", end=' ')

        # Build keyword section
        kw_lines = [f"  {w} (score: {s:.4f})" for w, s in d['keywords']]
        kw_section = "\n".join(kw_lines)

        # Build articles section (excerpts are trimmed further, to 400 characters)
        art_lines = []
        for j, art in enumerate(d['representative_articles'], 1):
            art_lines.append(f"  Article {j}:")
            art_lines.append(f"    Title: {art['title']}")
            art_lines.append(f"    Excerpt: {art['excerpt'][:400]}")
        art_section = "\n".join(art_lines)

        # NOTE(review): the heading below states KEYWORD_TOP_N keywords even when a topic
        # has fewer; len(d['keywords']) would mirror how the article count is reported.
        prompt = f"""Below is a topic discovered from {name}'s coverage of Cyclone Ditwah.

TOPIC {d['topic_id']} ({d['article_count']} articles)

Top {KEYWORD_TOP_N} keywords (with BERTopic importance scores):
{kw_section}

Top {len(d['representative_articles'])} representative articles:
{art_section}

Based on the keywords and articles above, provide:
1. A short aspect phrase (2-5 words) that captures what this topic is about
2. A 1-2 sentence description explaining what this topic covers in the context of Cyclone Ditwah coverage

Respond with JSON in this exact format:
{{"aspect": "short phrase here", "description": "1-2 sentence description here"}}"""

        try:
            response = llm.generate(prompt, system_prompt=SYSTEM_PROMPT, json_mode=True)
            result = json.loads(response.content)
            aspect = result.get('aspect', 'Unknown')
            description = result.get('description', '')
            print(f"-> {aspect}")
        except Exception as e:
            # Best-effort: any failure (request error, invalid JSON, unexpected payload shape)
            # is reported and replaced by a label built from the topic's top keywords.
            print(f"LLM error: {e}")
            aspect = ', '.join(w for w, _ in d['keywords'][:3])
            description = f"Topic characterised by keywords: {', '.join(w for w, _ in d['keywords'][:8])}"
            print(f"  Fallback: {aspect}")

        aspect_lookup[(outlet, d['topic_id'])] = aspect
        results.append({
            'topic_id': d['topic_id'],
            'aspect': aspect,
            'description': description,
            'article_count': d['article_count'],
            'keywords': d['keywords'],
        })

    outlet_llm_labels[outlet] = results

    # Display results table for this outlet
    rows = [{'Topic': r['topic_id'], 'Aspect': r['aspect'], 'Articles': r['article_count'],
             'Description': r['description']} for r in results]
    display(pd.DataFrame(rows))

print(f"\nAll outlets labelled. Total labels: {len(aspect_lookup)}")

LLM loaded: gpt-4o (OpenAILLM)

  Labelling Daily News (15 topics)
  Topic 0 (1/15)... 

-> Climate Impact and Response
  Topic 1 (2/15)... -> Rebuilding Sri Lanka Fund
  Topic 2 (3/15)... -> Government response and budget
  Topic 3 (4/15)... -> Casualties and Damage Assessment
  Topic 4 (5/15)... -> Industry Recovery Efforts
  Topic 5 (6/15)... -> Indian humanitarian assistance
  Topic 6 (7/15)... -> Agricultural damage and recovery
  Topic 7 (8/15)... -> Impact on education sector
  Topic 8 (9/15)... -> Road and Transport Restoration
  Topic 9 (10/15)... -> Weather impacts and forecasts
  Topic 10 (11/15)... -> Tourism and Recovery Efforts
  Topic 11 (12/15)... -> International financial aid
  Topic 12 (13/15)... -> Opposition's response to cyclone
  Topic 13 (14/15)... -> Regional impact in Southeast Asia
  Topic 14 (15/15)... -> International aid response


Unnamed: 0,Topic,Aspect,Articles,Description
0,0,Climate Impact and Response,58,This topic covers the broader implications of ...
1,1,Rebuilding Sri Lanka Fund,35,This topic covers the financial contributions ...
2,2,Government response and budget,33,This topic covers the Sri Lankan government's ...
3,3,Casualties and Damage Assessment,27,This topic covers the human toll and destructi...
4,4,Industry Recovery Efforts,26,This topic covers the initiatives and financia...
5,5,Indian humanitarian assistance,25,This topic covers India's extensive humanitari...
6,6,Agricultural damage and recovery,17,This topic covers the extensive damage Cyclone...
7,7,Impact on education sector,17,This topic covers the disruption and recovery ...
8,8,Road and Transport Restoration,17,This topic covers the restoration and reopenin...
9,9,Weather impacts and forecasts,16,This topic covers the meteorological updates a...



  Labelling The Morning (15 topics)
  Topic 0 (1/15)... -> International relief efforts
  Topic 1 (2/15)... -> Impact on districts
  Topic 2 (3/15)... -> Disaster resilience and management
  Topic 3 (4/15)... -> Education Disruptions and Recovery
  Topic 4 (5/15)... -> IMF Emergency Financing
  Topic 5 (6/15)... -> Rebuilding Sri Lanka Fund
  Topic 6 (7/15)... -> Transportation Infrastructure Recovery
  Topic 7 (8/15)... -> Post-cyclone housing resettlement
  Topic 8 (9/15)... -> Agricultural impact and food security
  Topic 9 (10/15)... -> Emergency regulations and accountability
  Topic 10 (11/15)... -> Economic impact and recovery
  Topic 11 (12/15)... -> Political implications of disaster
  Topic 12 (13/15)... -> Festive season resilience
  Topic 13 (14/15)... -> Business recovery efforts
  Topic 14 (15/15)... -> Healthcare system impact


Unnamed: 0,Topic,Aspect,Articles,Description
0,0,International relief efforts,93,This topic covers the international humanitari...
1,1,Impact on districts,31,This topic covers the extensive impact of Cycl...
2,2,Disaster resilience and management,23,This topic covers the challenges and shortcomi...
3,3,Education Disruptions and Recovery,22,This topic covers the impact of Cyclone Ditwah...
4,4,IMF Emergency Financing,22,This topic covers the International Monetary F...
5,5,Rebuilding Sri Lanka Fund,21,This topic covers the financial contributions ...
6,6,Transportation Infrastructure Recovery,19,This topic covers the impact of Cyclone Ditwah...
7,7,Post-cyclone housing resettlement,17,This topic covers the government's efforts to ...
8,8,Agricultural impact and food security,16,This topic covers the impact of Cyclone Ditwah...
9,9,Emergency regulations and accountability,16,This topic covers the political and legal disc...



  Labelling Daily FT (15 topics)
  Topic 0 (1/15)... -> Government disaster response
  Topic 1 (2/15)... -> Rebuilding and Relief Efforts
  Topic 2 (3/15)... -> International humanitarian aid
  Topic 3 (4/15)... -> Tourism resilience post-cyclone
  Topic 4 (5/15)... -> IMF Financial Assistance
  Topic 5 (6/15)... -> Fiscal Impact and Response
  Topic 6 (7/15)... -> Presidential response and recovery
  Topic 7 (8/15)... -> Financial aid and recovery
  Topic 8 (9/15)... -> Indian humanitarian aid
  Topic 9 (10/15)... -> Industrial impact and recovery
  Topic 10 (11/15)... -> Financial market impact
  Topic 11 (12/15)... -> Impact and Casualties
  Topic 12 (13/15)... -> Japanese aid response
  Topic 13 (14/15)... -> India's aid to Sri Lanka
  Topic 14 (15/15)... -> Health and Humanitarian Aid


Unnamed: 0,Topic,Aspect,Articles,Description
0,0,Government disaster response,100,This topic covers the Sri Lankan government's ...
1,1,Rebuilding and Relief Efforts,58,This topic covers the financial contributions ...
2,2,International humanitarian aid,22,This topic covers the international humanitari...
3,3,Tourism resilience post-cyclone,20,This topic covers the impact of Cyclone Ditwah...
4,4,IMF Financial Assistance,18,This topic covers the International Monetary F...
5,5,Fiscal Impact and Response,16,This topic covers the financial implications o...
6,6,Presidential response and recovery,15,This topic covers President Anura Kumara Dissa...
7,7,Financial aid and recovery,14,This topic covers the financial aid and recove...
8,8,Indian humanitarian aid,13,This topic covers India's extensive humanitari...
9,9,Industrial impact and recovery,13,This topic covers the impact of Cyclone Ditwah...



  Labelling The Island (15 topics)
  Topic 0 (1/15)... -> Climate Change and Disaster Management
  Topic 1 (2/15)... -> International aid response
  Topic 2 (3/15)... -> Financial Contributions to Relief
  Topic 4 (5/15)... -> International relief efforts
  Topic 5 (6/15)... -> International financial aid
  Topic 6 (7/15)... -> Casualties and Missing Persons
  Topic 7 (8/15)... -> Government response criticism
  Topic 8 (9/15)... -> Severe flooding impact
  Topic 9 (10/15)... -> Government leadership response
  Topic 10 (11/15)... -> Tourism and Economic Resilience
  Topic 11 (12/15)... -> International diplomatic support
  Topic 12 (13/15)... -> Political tensions post-cyclone
  Topic 13 (14/15)... -> Infrastructure and Wildlife Damage
  Topic 14 (15/15)... -> Government and Relief Efforts


Unnamed: 0,Topic,Aspect,Articles,Description
0,0,Climate Change and Disaster Management,35,This topic covers the impact of Cyclone Ditwah...
1,1,International aid response,27,This topic covers the international humanitari...
2,2,Financial Contributions to Relief,25,This topic covers the financial contributions ...
3,3,Weather warnings and forecasts,14,This topic covers the issuance of weather warn...
4,4,International relief efforts,13,This topic covers the international assistance...
5,5,International financial aid,12,This topic covers the international financial ...
6,6,Casualties and Missing Persons,11,This topic covers the human toll of Cyclone Di...
7,7,Government response criticism,11,This topic covers the criticism and scrutiny f...
8,8,Severe flooding impact,10,This topic covers the severe flooding and its ...
9,9,Government leadership response,10,This topic covers the leadership and actions t...



All outlets labelled. Total labels: 60


## 4. Cross-Outlet Topic Matching (3 Methods)

In [34]:
def get_topic_ids(outlet):
    """Return the outlet's topic ids in ascending order, leaving out the outlier topic (-1)."""
    topic_column = outlet_results[outlet]['topic_info']['Topic']
    kept = topic_column[topic_column != -1]
    return sorted(kept.tolist())


def compute_topic_centroids(outlet):
    """Map each non-outlier topic id of `outlet` to the mean embedding of its documents.

    Topics that have no assigned document are left out of the returned dict.
    """
    fitted = outlet_results[outlet]
    doc_vectors = fitted['embeddings']
    # Converted once; each topic only needs a boolean membership mask over it.
    assignments = np.array(fitted['topics'])

    centroids = {}
    for tid in get_topic_ids(outlet):
        members = assignments == tid
        if members.any():
            centroids[tid] = doc_vectors[members].mean(axis=0)
    return centroids


# Method 1: Centroid embedding similarity
# Pre-compute {outlet: {topic_id: mean embedding}} once; centroid_similarity reads from it.
print("Computing topic centroids per outlet...")
outlet_centroids = {o: compute_topic_centroids(o) for o in OUTLETS}


def centroid_similarity(outlet_a, tid_a, outlet_b, tid_b):
    """Cosine similarity between two topic centroids.

    Args:
        outlet_a, outlet_b: outlet ids used as keys into `outlet_centroids`.
        tid_a, tid_b: topic ids within the respective outlets.

    Returns:
        The cosine similarity as a float, or 0.0 when either centroid is missing
        or is an all-zero vector (cosine is undefined there).
    """
    ca = outlet_centroids[outlet_a].get(tid_a)
    cb = outlet_centroids[outlet_b].get(tid_b)
    if ca is None or cb is None:
        return 0.0
    # A zero-norm centroid would yield NaN, which the assignment step downstream
    # cannot handle; treat it as "no similarity" like the other similarity methods.
    if not np.any(ca) or not np.any(cb):
        return 0.0
    return float(1.0 - cosine(ca, cb))


# Method 2: Keyword Jaccard similarity
def keyword_jaccard(outlet_a, tid_a, outlet_b, tid_b, top_n=KEYWORD_TOP_N):
    """Jaccard overlap between the top-N keyword sets of two topics.

    Returns 0.0 when either topic has no keywords or when both keyword sets are empty.
    """
    ranked_a = outlet_results[outlet_a]['model'].get_topic(tid_a)
    ranked_b = outlet_results[outlet_b]['model'].get_topic(tid_b)
    if not (ranked_a and ranked_b):
        return 0.0

    # Only the terms matter here; their scores are ignored.
    terms_a = {term for term, _score in ranked_a[:top_n]}
    terms_b = {term for term, _score in ranked_b[:top_n]}

    combined = terms_a | terms_b
    if not combined:
        return 0.0
    return len(terms_a & terms_b) / len(combined)


# Method 3: c-TF-IDF similarity in union vocabulary space
def ctfidf_similarity_matrix(outlet_a, outlet_b):
    """Cosine similarity between c-TF-IDF vectors in a shared vocabulary space.

    Each outlet's model has its own vocabulary, so both c-TF-IDF matrices are first
    projected onto the sorted union of the two vocabularies.

    Args:
        outlet_a, outlet_b: outlet ids used as keys into `outlet_results`.

    Returns:
        (sim_matrix, tids_a, tids_b): `sim_matrix[i, j]` is the cosine similarity between
        topic `tids_a[i]` of outlet_a and topic `tids_b[j]` of outlet_b; the id lists hold
        the non-outlier topic ids in ascending order. Pairs involving an all-zero vector
        get similarity 0.
    """
    model_a = outlet_results[outlet_a]['model']
    model_b = outlet_results[outlet_b]['model']

    vocab_a = model_a.vectorizer_model.get_feature_names_out()
    vocab_b = model_b.vectorizer_model.get_feature_names_out()

    union_vocab = sorted(set(vocab_a) | set(vocab_b))
    word_to_idx = {w: i for i, w in enumerate(union_vocab)}

    def project_ctfidf(model, vocab):
        """Project a model's c-TF-IDF rows into the union vocabulary space."""
        ctfidf = model.c_tf_idf_
        # Row position = rank of the topic id among ALL ids, outlier topic included when
        # present. A fixed "+1" offset is wrong for a model that produced no outlier topic.
        # NOTE(review): assumes c_tf_idf_ rows follow ascending topic id, as after a plain
        # fit — confirm for models modified after fitting.
        all_tids = sorted(model.topic_labels_.keys())
        row_of = {tid: pos for pos, tid in enumerate(all_tids)}
        topic_ids = [t for t in all_tids if t != -1]

        # Column j of the model's matrix maps to the union-space column of vocab[j];
        # computed once per model instead of once per topic.
        n_terms = min(len(vocab), ctfidf.shape[1])
        target_cols = np.fromiter(
            (word_to_idx[w] for w in vocab[:n_terms]), dtype=np.intp, count=n_terms
        )

        projected = np.zeros((len(topic_ids), len(union_vocab)))
        for i, tid in enumerate(topic_ids):
            row_idx = row_of[tid]
            if row_idx >= ctfidf.shape[0]:
                continue  # no matrix row for this topic; leave its vector at zero
            row = ctfidf[row_idx]
            if hasattr(row, 'toarray'):
                row = row.toarray()
            row = np.asarray(row).ravel()
            projected[i, target_cols] = row[:n_terms]
        return projected, topic_ids

    proj_a, tids_a = project_ctfidf(model_a, vocab_a)
    proj_b, tids_b = project_ctfidf(model_b, vocab_b)

    # Pairwise cosine similarity; entries with a zero-norm vector on either side stay 0.
    norms_a = np.linalg.norm(proj_a, axis=1)
    norms_b = np.linalg.norm(proj_b, axis=1)
    valid = (norms_a[:, None] != 0) & (norms_b[None, :] != 0)
    sim_matrix = np.zeros((len(tids_a), len(tids_b)))
    np.divide(proj_a @ proj_b.T, np.outer(norms_a, norms_b), out=sim_matrix, where=valid)

    return sim_matrix, tids_a, tids_b


Computing topic centroids per outlet...


## 5. Topic Matching via Hungarian Algorithm

In [35]:
# Compute pairwise matches for all 6 outlet pairs
# For every unordered outlet pair: build the three similarity matrices, combine them with
# the configured weights, solve a 1-to-1 assignment, and keep assigned pairs whose
# ensemble score reaches MATCH_THRESHOLD.
outlet_pairs = list(combinations(OUTLETS, 2))
pairwise_matches = {}  # (outlet_a, outlet_b) -> list of (tid_a, tid_b, score)
pairwise_centroid_sims = {}  # for heatmap visualization

for outlet_a, outlet_b in outlet_pairs:
    name_a = SOURCE_NAMES[outlet_a]
    name_b = SOURCE_NAMES[outlet_b]
    print(f"\nMatching: {name_a} <-> {name_b}")

    tids_a = get_topic_ids(outlet_a)
    tids_b = get_topic_ids(outlet_b)

    if not tids_a or not tids_b:
        print("  Skipping: one outlet has no topics.")
        pairwise_matches[(outlet_a, outlet_b)] = []
        continue

    # Build ensemble score matrix
    n_a, n_b = len(tids_a), len(tids_b)

    # Method 1: Centroid similarity
    centroid_sim = np.zeros((n_a, n_b))
    for i, ta in enumerate(tids_a):
        for j, tb in enumerate(tids_b):
            centroid_sim[i, j] = centroid_similarity(outlet_a, ta, outlet_b, tb)

    pairwise_centroid_sims[(outlet_a, outlet_b)] = (centroid_sim, tids_a, tids_b)

    # Method 2: Keyword Jaccard
    # NOTE(review): methods 2 and 3 are computed even when their weight is 0.
    keyword_sim = np.zeros((n_a, n_b))
    for i, ta in enumerate(tids_a):
        for j, tb in enumerate(tids_b):
            keyword_sim[i, j] = keyword_jaccard(outlet_a, ta, outlet_b, tb)

    # Method 3: c-TF-IDF similarity
    ctfidf_sim, ctfidf_tids_a, ctfidf_tids_b = ctfidf_similarity_matrix(outlet_a, outlet_b)

    # Align c-TF-IDF matrix to match tids_a/tids_b ordering
    # (topics missing from the c-TF-IDF id lists keep a similarity of 0)
    ctfidf_aligned = np.zeros((n_a, n_b))
    tid_a_idx = {t: i for i, t in enumerate(ctfidf_tids_a)}
    tid_b_idx = {t: i for i, t in enumerate(ctfidf_tids_b)}
    for i, ta in enumerate(tids_a):
        for j, tb in enumerate(tids_b):
            if ta in tid_a_idx and tb in tid_b_idx:
                ctfidf_aligned[i, j] = ctfidf_sim[tid_a_idx[ta], tid_b_idx[tb]]

    # Weighted ensemble
    # NOTE(review): the weights are not normalised here; MATCH_THRESHOLD is only comparable
    # to a [0, 1] similarity when they sum to 1.
    ensemble = (
        WEIGHT_CENTROID * centroid_sim
        + WEIGHT_CTFIDF * ctfidf_aligned
        + WEIGHT_KEYWORD * keyword_sim
    )

    # Hungarian algorithm (minimize cost = maximize similarity)
    # The matrix may be rectangular; at most min(n_a, n_b) pairs are assigned.
    cost = 1.0 - ensemble
    row_ind, col_ind = linear_sum_assignment(cost)

    # The threshold is applied after the assignment: low-scoring assigned pairs are dropped,
    # they are not re-assigned to other partners.
    matches = []
    for r, c in zip(row_ind, col_ind):
        score = ensemble[r, c]
        if score >= MATCH_THRESHOLD:
            matches.append((tids_a[r], tids_b[c], float(score)))
            print(f"  Match: T{tids_a[r]} <-> T{tids_b[c]} (score: {score:.3f})")

    pairwise_matches[(outlet_a, outlet_b)] = matches
    print(f"  Total matches: {len(matches)} (threshold: {MATCH_THRESHOLD})")

# Show ensemble score distribution for threshold calibration
# NOTE(review): only kept matches are stored, so every plotted score is >= MATCH_THRESHOLD;
# calibrating the threshold would also need the scores of the rejected assignments.
all_scores = []
for matches in pairwise_matches.values():
    all_scores.extend([s for _, _, s in matches])

if all_scores:
    fig = px.histogram(
        x=all_scores, nbins=30,
        title='Ensemble Match Score Distribution (matched pairs only)',
        labels={'x': 'Ensemble Score', 'y': 'Count'},
    )
    fig.add_vline(x=MATCH_THRESHOLD, line_dash='dash', line_color='red',
                  annotation_text=f'Threshold={MATCH_THRESHOLD}')
    fig.update_layout(height=350)
    fig.show()


Matching: Daily News <-> The Morning
  Match: T0 <-> T2 (score: 0.971)
  Match: T1 <-> T5 (score: 0.978)
  Match: T2 <-> T23 (score: 0.935)
  Match: T3 <-> T1 (score: 0.978)
  Match: T4 <-> T13 (score: 0.981)
  Match: T5 <-> T0 (score: 0.970)
  Match: T6 <-> T8 (score: 0.942)
  Match: T7 <-> T3 (score: 0.983)
  Match: T8 <-> T6 (score: 0.969)
  Match: T9 <-> T15 (score: 0.983)
  Match: T10 <-> T17 (score: 0.904)
  Match: T11 <-> T4 (score: 0.987)
  Match: T12 <-> T9 (score: 0.948)
  Match: T14 <-> T22 (score: 0.860)
  Match: T16 <-> T14 (score: 0.855)
  Match: T17 <-> T16 (score: 0.891)
  Match: T18 <-> T12 (score: 0.832)
  Match: T20 <-> T19 (score: 0.872)
  Match: T21 <-> T7 (score: 0.859)
  Match: T22 <-> T18 (score: 0.768)
  Match: T24 <-> T11 (score: 0.788)
  Match: T25 <-> T10 (score: 0.862)
  Match: T26 <-> T20 (score: 0.820)
  Total matches: 23 (threshold: 0.75)

Matching: Daily News <-> Daily FT


  Match: T0 <-> T0 (score: 0.981)
  Match: T1 <-> T1 (score: 0.982)
  Match: T2 <-> T5 (score: 0.949)
  Match: T3 <-> T11 (score: 0.961)
  Match: T4 <-> T9 (score: 0.980)
  Match: T5 <-> T8 (score: 0.987)
  Match: T8 <-> T6 (score: 0.947)
  Match: T10 <-> T3 (score: 0.956)
  Match: T11 <-> T4 (score: 0.965)
  Match: T14 <-> T14 (score: 0.920)
  Match: T16 <-> T12 (score: 0.985)
  Match: T22 <-> T2 (score: 0.943)
  Match: T24 <-> T13 (score: 0.946)
  Match: T25 <-> T7 (score: 0.957)
  Match: T26 <-> T15 (score: 0.937)
  Total matches: 15 (threshold: 0.75)

Matching: Daily News <-> The Island
  Match: T0 <-> T0 (score: 0.974)
  Match: T1 <-> T2 (score: 0.977)
  Match: T2 <-> T16 (score: 0.850)
  Match: T3 <-> T6 (score: 0.974)
  Match: T5 <-> T4 (score: 0.985)
  Match: T6 <-> T7 (score: 0.883)
  Match: T7 <-> T9 (score: 0.918)
  Match: T8 <-> T13 (score: 0.948)
  Match: T9 <-> T3 (score: 0.958)
  Match: T10 <-> T10 (score: 0.954)
  Match: T11 <-> T5 (score: 0.982)
  Match: T12 <-> T12 (s

## 6. Transitive Topic Grouping (Union-Find)

In [36]:
# Union-Find data structure
class UnionFind:
    """Disjoint-set forest with path compression and union by rank.

    Elements may be any hashable value. An element is registered lazily, as
    its own singleton set, the first time it is passed to ``find`` (and hence
    to ``union``).
    """

    def __init__(self):
        self.parent = {}  # element -> parent element; a root maps to itself
        self.rank = {}    # element -> rank, consulted by union() to choose the surviving root

    def find(self, x):
        """Return the root of the set containing ``x``, registering ``x`` if unseen."""
        if x not in self.parent:
            self.parent[x] = x
            self.rank[x] = 0

        # Pass 1: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Pass 2: re-point every node on the walked path straight at the root.
        node = x
        while self.parent[node] != root:
            above = self.parent[node]
            self.parent[node] = root
            node = above

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``; does nothing if they already coincide."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x != root_y:
            # The higher-ranked root survives; on a tie the root of ``x`` survives.
            if self.rank[root_x] < self.rank[root_y]:
                keep, absorb = root_y, root_x
            else:
                keep, absorb = root_x, root_y
            self.parent[absorb] = keep
            if self.rank[keep] == self.rank[absorb]:
                self.rank[keep] += 1


# Build union-find from pairwise matches
# Keys are (outlet, topic_id) tuples
# Counter is needed by both labelling branches below and by the outlet-count
# distribution at the end of this cell, so it is imported unconditionally
# before first use instead of inside one branch of the loop.
from collections import Counter

uf = UnionFind()

# Register all topics
for outlet in OUTLETS:
    for tid in get_topic_ids(outlet):
        uf.find((outlet, tid))

# Union matched topics; merging is transitive, so a chain of pairwise matches
# collapses into a single set
for (outlet_a, outlet_b), matches in pairwise_matches.items():
    for tid_a, tid_b, score in matches:
        uf.union((outlet_a, tid_a), (outlet_b, tid_b))

# Extract groups: set root -> list of (outlet, topic_id) members
groups = defaultdict(list)
for outlet in OUTLETS:
    for tid in get_topic_ids(outlet):
        root = uf.find((outlet, tid))
        groups[root].append((outlet, tid))

# Classify groups, largest first (group_id 0 has the most member topics)
topic_groups = []
for group_id, (root, members) in enumerate(sorted(groups.items(), key=lambda x: -len(x[1]))):
    outlets_in_group = set(o for o, _ in members)

    # Build label from LLM aspect labels if available, else fall back to keywords
    member_aspects = [aspect_lookup[(o, tid)] for o, tid in members if (o, tid) in aspect_lookup]
    if member_aspects:
        # Join up to three distinct aspects, most frequent first
        aspect_counts = Counter(member_aspects)
        unique_aspects = [a for a, _ in aspect_counts.most_common()]
        label = ' / '.join(unique_aspects[:3])
    else:
        # Fallback: keyword-based label from the five most frequent words among
        # each member topic's top-5 keywords
        all_keywords = []
        for o, tid in members:
            words = outlet_results[o]['model'].get_topic(tid)
            if words:
                all_keywords.extend([w for w, _ in words[:5]])
        keyword_counts = Counter(all_keywords)
        top_keywords = [w for w, _ in keyword_counts.most_common(5)]
        label = ', '.join(top_keywords) if top_keywords else 'Unknown'

    topic_groups.append({
        'group_id': group_id,
        'label': label,
        'members': members,
        'n_outlets': len(outlets_in_group),
        'outlets': outlets_in_group,
        'is_shared': len(outlets_in_group) >= 2,
    })

# A group is "shared" when its members come from at least two outlets
shared_groups = [g for g in topic_groups if g['is_shared']]
unique_groups = [g for g in topic_groups if not g['is_shared']]

print(f"Total topic groups: {len(topic_groups)}")
print(f"  Shared (2+ outlets): {len(shared_groups)}")
print(f"  Unique (1 outlet):   {len(unique_groups)}")

print(f"\nShared Topic Groups:")
for g in shared_groups:
    outlet_names = [SOURCE_NAMES[o] for o in sorted(g['outlets'])]
    print(f"  Group {g['group_id']}: [{', '.join(outlet_names)}] - {g['label']}")

# Distribution by outlet count
dist = Counter(g['n_outlets'] for g in topic_groups)
print(f"\nGroups by outlet count:")
for n in sorted(dist.keys(), reverse=True):
    print(f"  In {n} outlet(s): {dist[n]} groups")

Total topic groups: 10
  Shared (2+ outlets): 5
  Unique (1 outlet):   5

Shared Topic Groups:
  Group 0: [Daily News, Daily FT, The Island, The Morning] - International financial aid / International aid response / Climate Impact and Response
  Group 1: [Daily News, Daily FT, The Island, The Morning] - Rebuilding Sri Lanka Fund / Rebuilding and Relief Efforts / Financial Contributions to Relief
  Group 2: [Daily News, Daily FT, The Island, The Morning] - International relief efforts / Indian humanitarian assistance / Indian humanitarian aid
  Group 3: [Daily News, Daily FT, The Island, The Morning] - Tourism and Recovery Efforts / Tourism resilience post-cyclone / Tourism and Economic Resilience
  Group 4: [Daily News, The Morning] - children, mental, navy, clean, programme

Groups by outlet count:
  In 4 outlet(s): 4 groups
  In 2 outlet(s): 1 groups
  In 1 outlet(s): 5 groups


## 7. Visualizations

### 7a. Cross-Outlet Similarity Heatmaps

In [37]:
# One heatmap per outlet pair showing centroid cosine similarity

def topic_label(outlet, tid):
    """Return the axis label for one topic.

    Uses ``T<id>: <aspect>`` when ``aspect_lookup`` holds a non-empty aspect
    for ``(outlet, tid)``; otherwise ``T<id>: <kw1, kw2, kw3>`` built from the
    first three entries of the outlet model's ``get_topic(tid)`` (empty keyword
    part if that call returns nothing).
    """
    aspect = aspect_lookup.get((outlet, tid))
    if aspect:
        return f"T{tid}: {aspect}"
    words = outlet_results[outlet]['model'].get_topic(tid)
    kw = ', '.join([w for w, _ in words[:3]]) if words else ''
    return f"T{tid}: {kw}"


# The helper above does not depend on the loop variables, so it is defined
# once here rather than being re-created for every outlet pair.
for (outlet_a, outlet_b), (sim_matrix, tids_a, tids_b) in pairwise_centroid_sims.items():
    name_a = SOURCE_NAMES[outlet_a]
    name_b = SOURCE_NAMES[outlet_b]

    # Rows are outlet_a's topics, columns are outlet_b's topics
    labels_a = [topic_label(outlet_a, t) for t in tids_a]
    labels_b = [topic_label(outlet_b, t) for t in tids_b]

    fig = px.imshow(
        sim_matrix,
        x=labels_b,
        y=labels_a,
        color_continuous_scale='RdYlGn',
        zmin=0, zmax=1,
        labels=dict(x=name_b, y=name_a, color='Cosine Sim'),
        title=f'Topic Centroid Similarity: {name_a} vs {name_b}',
    )
    fig.update_layout(height=500, width=800)
    fig.show()

### 7b. Keyword Divergence Analysis

In [38]:
# For shared topic groups, show shared vs distinctive keywords per outlet
for g in shared_groups[:8]:  # limit to first 8 groups
    group_label = g['label']
    members = g['members']

    # Collect keywords per outlet in this group. Because matches are merged
    # transitively, a group may contain several topics from the same outlet;
    # merge their keywords instead of letting the last topic overwrite the
    # earlier ones. A word seen in more than one topic keeps its highest score.
    outlet_keywords = {}
    for o, tid in members:
        words = outlet_results[o]['model'].get_topic(tid)
        if not words:
            continue
        merged = outlet_keywords.setdefault(SOURCE_NAMES[o], {})
        for w, s in words[:KEYWORD_TOP_N]:
            if w not in merged or s > merged[w]:
                merged[w] = s

    # After merging, keep at most KEYWORD_TOP_N highest-scoring keywords per outlet
    outlet_keywords = {
        oname: dict(sorted(kw.items(), key=lambda x: x[1], reverse=True)[:KEYWORD_TOP_N])
        for oname, kw in outlet_keywords.items()
    }

    # Divergence needs at least two outlets with keywords
    if len(outlet_keywords) < 2:
        continue

    # Keywords present for every outlet in the group
    shared_kws = set.intersection(*[set(kw.keys()) for kw in outlet_keywords.values()])

    # Build figure: side-by-side bars for each outlet
    fig = go.Figure()
    outlet_names = sorted(outlet_keywords.keys())

    for oname in outlet_names:
        kw_dict = outlet_keywords[oname]
        # Sort by score descending
        sorted_kw = sorted(kw_dict.items(), key=lambda x: x[1], reverse=True)
        words = [w for w, _ in sorted_kw]
        scores = [s for _, s in sorted_kw]
        # Grey for keywords shared by all outlets, outlet colour for the rest
        colors = [SOURCE_COLORS[oname] if w not in shared_kws else '#888888' for w in words]

        # Lists are reversed so the highest-scoring keyword is added last
        fig.add_trace(go.Bar(
            name=oname,
            y=words[::-1],
            x=scores[::-1],
            orientation='h',
            marker_color=colors[::-1],
        ))

    fig.update_layout(
        title=f'Keyword Divergence: Group "{group_label}"<br><sub>Grey = shared keywords, colored = distinctive</sub>',
        barmode='group',
        height=400,
        xaxis_title='Keyword Score',
    )
    fig.show()

### 7c. Coverage Volume Comparison

In [39]:
# Grouped bar chart: article counts per outlet per matched topic group
coverage_rows = []
for g in shared_groups:
    for outlet in OUTLETS:
        # Topic ids this outlet contributes to the group (possibly none)
        member_tids = [tid for o, tid in g['members'] if o == outlet]
        if member_tids:
            # Sum the documents assigned to any of those topic ids
            topics_arr = np.array(outlet_results[outlet]['topics'])
            count = sum(int((topics_arr == tid).sum()) for tid in member_tids)
        else:
            count = 0
        coverage_rows.append(dict(
            Group=f"G{g['group_id']}: {g['label'][:30]}",
            group_id=g['group_id'],
            Outlet=SOURCE_NAMES[outlet],
            Articles=count,
        ))

coverage_df = pd.DataFrame(coverage_rows)

# One cluster of bars per group, one bar per outlet
fig = px.bar(
    coverage_df,
    x='Group',
    y='Articles',
    color='Outlet',
    barmode='group',
    color_discrete_map=SOURCE_COLORS,
    title='Article Coverage per Outlet per Shared Topic Group',
)
fig.update_layout(height=500, xaxis_tickangle=-30)
fig.show()

### 7d. Topic Proportion Heatmap

In [40]:
# % of each outlet's coverage devoted to each topic group, sorted by variance
prop_rows = []
for g in shared_groups:
    row = {'Group': f"G{g['group_id']}: {g['label'][:35]}"}
    proportions = []
    for outlet in OUTLETS:
        member_tids = [tid for o, tid in g['members'] if o == outlet]
        # Denominator is every document of the outlet
        total_articles = len(outlet_results[outlet]['documents'])
        if member_tids:
            topics_arr = np.array(outlet_results[outlet]['topics'])
            count = sum(int((topics_arr == tid).sum()) for tid in member_tids)
        else:
            count = 0
        prop = count / total_articles if total_articles > 0 else 0
        row[SOURCE_NAMES[outlet]] = prop
        proportions.append(prop)
    # Spread of this group's share across the outlets
    row['variance'] = np.var(proportions)
    prop_rows.append(row)

# Most unevenly covered groups first
prop_df = pd.DataFrame(prop_rows).sort_values('variance', ascending=False)

outlet_names = [SOURCE_NAMES[o] for o in OUTLETS]
heat_data = prop_df[outlet_names].values

fig = px.imshow(
    heat_data,
    x=outlet_names,
    y=prop_df['Group'].tolist(),
    color_continuous_scale='YlOrRd',
    labels=dict(color='Proportion'),
    title='Topic Coverage Proportions per Outlet<br><sub>Sorted by variance (highest = strongest selection bias signal)</sub>',
    text_auto='.1%',
)
# Grow the figure with the number of rows, with a 400px floor
fig.update_layout(height=max(400, len(prop_df) * 35), width=700)
fig.show()

### 7e. Sankey Diagram

In [41]:
# Sankey diagram showing topic correspondences across outlets
# Nodes: each outlet's topics; Links: matched pairs

node_labels = []
node_colors = []
node_map = {}  # (outlet, tid) -> node index

# One node per (outlet, topic), coloured by outlet.
# NOTE(review): labels use "\n" as a separator; Plotly text normally expects
# "<br>" for a line break -- confirm the rendered labels look as intended.
for outlet in OUTLETS:
    color = SOURCE_COLORS[SOURCE_NAMES[outlet]]
    for tid in get_topic_ids(outlet):
        aspect = aspect_lookup.get((outlet, tid))
        if aspect:
            label = f"{SOURCE_NAMES[outlet]}\nT{tid}: {aspect}"
        else:
            words = outlet_results[outlet]['model'].get_topic(tid)
            kw = ', '.join([w for w, _ in words[:3]]) if words else f'T{tid}'
            label = f"{SOURCE_NAMES[outlet]}\nT{tid}: {kw}"
        node_map[(outlet, tid)] = len(node_labels)
        node_labels.append(label)
        node_colors.append(color)

# Links from matches
link_source = []
link_target = []
link_value = []
link_color = []

for (outlet_a, outlet_b), matches in pairwise_matches.items():
    # The source outlet's topic assignments are the same for every match of
    # this pair, so convert them to an array once per pair.
    topics_arr = np.array(outlet_results[outlet_a]['topics'])
    for tid_a, tid_b, score in matches:
        src = node_map.get((outlet_a, tid_a))
        tgt = node_map.get((outlet_b, tid_b))
        # Skip matches whose endpoints were not registered as nodes
        if src is not None and tgt is not None:
            link_source.append(src)
            link_target.append(tgt)
            # Value = article count in source topic (at least 1)
            val = max(1, int((topics_arr == tid_a).sum()))
            link_value.append(val)
            link_color.append('rgba(150,150,150,0.3)')

fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        label=node_labels,
        color=node_colors,
    ),
    link=dict(
        source=link_source,
        target=link_target,
        value=link_value,
        color=link_color,
    ),
))

fig.update_layout(
    title='Cross-Outlet Topic Correspondences',
    height=700,
    width=1000,
    font_size=10,
)
fig.show()

### 7f. Unique vs Shared Topics

In [42]:
# Stacked bar chart: shared vs unique topics per outlet
shared_unique_rows = []
for outlet in OUTLETS:
    name = SOURCE_NAMES[outlet]
    all_tids = set(get_topic_ids(outlet))

    # This outlet's topic ids that belong to any shared group
    shared_tids = {
        tid
        for g in shared_groups
        for o, tid in g['members']
        if o == outlet
    }

    # Everything else counts as unique to the outlet
    unique_tids = all_tids - shared_tids
    shared_unique_rows.extend([
        {'Outlet': name, 'Type': 'Shared', 'Count': len(shared_tids)},
        {'Outlet': name, 'Type': 'Unique', 'Count': len(unique_tids)},
    ])

su_df = pd.DataFrame(shared_unique_rows)

fig = px.bar(
    su_df,
    x='Outlet',
    y='Count',
    color='Type',
    barmode='stack',
    color_discrete_map={'Shared': '#4CAF50', 'Unique': '#FF9800'},
    title='Shared vs Unique Topics per Outlet',
)
fig.update_layout(height=400)
fig.show()

# List unique topics with LLM aspect labels (or keywords as fallback)
print("\nUnique Topics (only in one outlet):")
print("=" * 80)
for g in unique_groups:
    # Only the first member of the group is reported
    outlet, tid = g['members'][0]
    aspect = aspect_lookup.get((outlet, tid))
    words = outlet_results[outlet]['model'].get_topic(tid)
    kw = ', '.join([w for w, _ in words[:8]]) if words else 'N/A'
    topics_arr = np.array(outlet_results[outlet]['topics'])
    count = int((topics_arr == tid).sum())
    if not aspect:
        # No LLM label: the keywords take the label's place on the same line
        print(f"  {SOURCE_NAMES[outlet]:15s} | T{tid:2d} ({count:3d} articles) | {kw}")
        continue
    print(f"  {SOURCE_NAMES[outlet]:15s} | T{tid:2d} ({count:3d} articles) | {aspect}")
    print(f"  {'':15s} |     keywords: {kw}")


Unique Topics (only in one outlet):
  Daily News      | T15 ( 12 articles) | pakistan, relief, ndma, high, minister, prime minister, pakistani, prime
  Daily News      | T19 (  9 articles) | jaishankar, indian, minister, india, prime minister, prime, visit, dr jaishankar
  Daily News      | T23 (  8 articles) | condolences, russia, president, message, russian, saudi, friendly, egypt
  Daily News      | T27 (  6 articles) | rescue relief, clearing, gamage, tri forces, rescue, cleaning, tri, volunteers
  The Morning     | T21 (  6 articles) | police, public, personal, messages, individuals, information, media, fake


### 7g. Selection Bias Metric

In [43]:
# Rank topic groups by coverage proportion variance across outlets
bias_rows = []
for g in shared_groups:
    # Share of each outlet's documents that falls into this group, keyed by display name
    proportions = {}
    for outlet in OUTLETS:
        member_tids = [tid for o, tid in g['members'] if o == outlet]
        total_articles = len(outlet_results[outlet]['documents'])
        if member_tids:
            topics_arr = np.array(outlet_results[outlet]['topics'])
            count = sum(int((topics_arr == tid).sum()) for tid in member_tids)
        else:
            count = 0
        proportions[SOURCE_NAMES[outlet]] = count / total_articles if total_articles > 0 else 0

    variance = np.var(list(proportions.values()))
    # Outlets giving the group the largest / smallest share of their coverage
    max_outlet = max(proportions, key=proportions.get)
    min_outlet = min(proportions, key=proportions.get)
    top_share = proportions[max_outlet]
    low_share = proportions[min_outlet]

    record = {
        'Group': f"G{g['group_id']}",
        'Label': g['label'][:40],
        'Variance': variance,
        'Max Coverage': f"{max_outlet} ({top_share:.1%})",
        'Min Coverage': f"{min_outlet} ({low_share:.1%})",
        'Spread': top_share - low_share,
    }
    # One pre-formatted percentage column per outlet
    for k, v in proportions.items():
        record[f"{k} %"] = f"{v:.1%}"
    bias_rows.append(record)

bias_df = pd.DataFrame(bias_rows).sort_values('Variance', ascending=False)

print("Selection Bias Indicators (by coverage variance)")
print("=" * 80)
display(bias_df)

# Bar chart of spread
fig = px.bar(
    bias_df.head(15),
    x='Label',
    y='Spread',
    title='Topic Coverage Spread (max - min proportion) Across Outlets<br><sub>Higher spread = stronger selection bias signal</sub>',
    labels={'Spread': 'Max-Min Proportion Spread', 'Label': 'Topic Group'},
    color='Spread',
    color_continuous_scale='Reds',
)
fig.update_layout(height=450, xaxis_tickangle=-30)
fig.show()

Selection Bias Indicators (by coverage variance)


Unnamed: 0,Group,Label,Variance,Max Coverage,Min Coverage,Spread,Daily News %,The Morning %,Daily FT %,The Island %
2,G2,International relief efforts / Indian hu,0.004038,The Morning (18.9%),Daily FT (3.4%),0.155199,5.0%,18.9%,3.4%,4.6%
1,G1,Rebuilding Sri Lanka Fund / Rebuilding a,0.001639,Daily FT (15.3%),The Morning (4.3%),0.109862,7.0%,4.3%,15.3%,8.8%
0,G0,International financial aid / Internatio,0.001368,Daily News (65.7%),The Morning (56.6%),0.091179,65.7%,56.6%,64.7%,59.9%
3,G3,Tourism and Recovery Efforts / Tourism r,0.000134,Daily FT (5.3%),The Morning (2.0%),0.032265,3.2%,2.0%,5.3%,3.5%
4,G4,"children, mental, navy, clean, programme",7.4e-05,Daily News (1.8%),Daily FT (0.0%),0.017928,1.8%,1.6%,0.0%,0.0%


## 8. Summary

In [44]:
# Imported in this cell so the summary does not rely on an import performed
# inside another cell.
from collections import Counter

print("=" * 70)
print("PER-OUTLET TOPIC COMPARISON SUMMARY")
print("=" * 70)

# Per-outlet statistics
print("\n1. Per-Outlet Statistics")
print("-" * 50)
for outlet in OUTLETS:
    r = outlet_results[outlet]
    ti = r['topic_info']
    # Topic id -1 is excluded from the topic count and counted as outliers
    n_topics = len(ti[ti['Topic'] != -1])
    n_outliers = sum(1 for t in r['topics'] if t == -1)
    n_docs = len(r['documents'])
    # Guard against an outlet with no documents, matching the
    # `total_articles > 0` checks used when computing coverage proportions
    outlier_pct = 100 * n_outliers / n_docs if n_docs > 0 else 0.0
    print(f"  {SOURCE_NAMES[outlet]:15s}: {n_docs:4d} articles, {n_topics:3d} topics, "
          f"{n_outliers:3d} outliers ({outlier_pct:.1f}%)")

# Cross-outlet group distribution
print(f"\n2. Cross-Outlet Topic Group Distribution")
print("-" * 50)
dist = Counter(g['n_outlets'] for g in topic_groups)
for n in sorted(dist.keys(), reverse=True):
    label = 'outlet' if n == 1 else 'outlets'
    print(f"  In {n} {label}: {dist[n]} groups")

# Selection bias indicators
print(f"\n3. Selection Bias Indicators")
print("-" * 50)
if len(bias_df) > 0:
    # Per-outlet: how many unique topics (groups whose first member belongs to the outlet)
    for outlet in OUTLETS:
        unique_count = sum(1 for g in unique_groups
                          if g['members'][0][0] == outlet)
        print(f"  {SOURCE_NAMES[outlet]:15s}: {unique_count} unique topics (not matched to other outlets)")

# Most unevenly covered topics (using LLM labels)
print(f"\n4. Most Unevenly Covered Topics")
print("-" * 50)
for _, row in bias_df.head(5).iterrows():
    print(f"  {row['Group']:5s} | {row['Label']:40s} | Spread: {row['Spread']:.1%}")
    print(f"         Max: {row['Max Coverage']:30s} | Min: {row['Min Coverage']}")

print("\n" + "=" * 70)
print("Analysis complete.")

PER-OUTLET TOPIC COMPARISON SUMMARY

1. Per-Outlet Statistics
--------------------------------------------------
  Daily News     :  502 articles,  28 topics,  52 outliers (10.4%)
  The Morning    :  491 articles,  24 topics,  75 outliers (15.3%)
  Daily FT       :  380 articles,  16 topics,  43 outliers (11.3%)
  The Island     :  284 articles,  17 topics,  66 outliers (23.2%)

2. Cross-Outlet Topic Group Distribution
--------------------------------------------------
  In 4 outlets: 4 groups
  In 2 outlets: 1 groups
  In 1 outlet: 5 groups

3. Selection Bias Indicators
--------------------------------------------------
  Daily News     : 4 unique topics (not matched to other outlets)
  The Morning    : 1 unique topics (not matched to other outlets)
  Daily FT       : 0 unique topics (not matched to other outlets)
  The Island     : 0 unique topics (not matched to other outlets)

4. Most Unevenly Covered Topics
--------------------------------------------------
  G2    | International