In [18]:
# Setup 

import os
import json
import numpy as np
import pandas as pd

import psycopg2
from psycopg2.extras import RealDictCursor

import umap
import hdbscan

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
DB_CONFIG = {
    "host": os.getenv("PGHOST", "voz-publica2.postgres.database.azure.com"),
    "port": int(os.getenv("PGPORT", 5432)),
    "dbname": os.getenv("PGDATABASE", "postgres"),
    "user": os.getenv("PGUSER", "diegomancera"),
    "password": os.getenv("PGPASSWORD"),
    "sslmode": "require"
}

conn = psycopg2.connect(**DB_CONFIG)


In [22]:
# Fetch Embeddings and metadata - Last Quarter Only
sql = """
    SELECT
    s.doc_id,
    s.speaker_normalized,
    s.text,
    s.embedding,
    s.token_count,
    m.published_at
    FROM speech_turns s
    JOIN raw_transcripts_meta m
    ON s.doc_id = m.doc_id
    WHERE
    s.embedding IS NOT NULL
    AND s.text IS NOT NULL
    AND length(s.text) > 20
    AND s.token_count > 15
    AND m.published_at >= '2025-10-01'  -- Last quarter: Oct-Dec 2025
    ORDER BY m.published_at DESC;
"""

df = pd.read_sql(sql, conn)
print(f"Loaded {df.shape[0]} rows from last quarter")
print(f"Date range: {df['published_at'].min()} to {df['published_at'].max()}")
df.head()


  df = pd.read_sql(sql, conn)


Loaded 9622 rows from last quarter
Date range: 2025-10-01 14:59:00+00:00 to 2025-12-17 23:44:00+00:00


Unnamed: 0,doc_id,speaker_normalized,text,embedding,token_count,published_at
0,2025-12-17-conference,,La acompa√±an: La jefa de Gobierno de la Ciudad...,"[0.027068669,-0.007255755,0.01948871,0.0156713...",22,2025-12-17 23:44:00+00:00
1,2025-12-17-conference,Claudia Sheinbaum Pardo,"aci√≥n‚Äù. Y lo tercero, que es hermoso, es regal...","[0.012905889,0.032169674,-0.0027023687,0.00900...",138,2025-12-17 23:44:00+00:00
2,2025-12-17-conference,Claudia Sheinbaum Pardo,"viendo, en donde hac√≠a tres reflexiones que le...","[0.028016198,0.020888483,-0.019381993,0.028196...",300,2025-12-17 23:44:00+00:00
3,2025-12-17-conference,Claudia Sheinbaum Pardo,con los estudiantes en la explanada de Ciudad...,"[0.0053103263,-0.014485596,4.312076e-06,0.0177...",300,2025-12-17 23:44:00+00:00
4,2025-12-17-conference,Claudia Sheinbaum Pardo,Buenas tardes. Me da gusto estar con ustedes. ...,"[0.043162066,-0.018256737,-0.008310742,0.03373...",300,2025-12-17 23:44:00+00:00


## Convert embbedings to matrix 

In [23]:
def parse_embedding(e):
    if isinstance(e, list):
        return np.array(e, dtype=np.float32)
    if isinstance(e, str):
        return np.array(json.loads(e), dtype=np.float32)
    return np.array(e, dtype=np.float32)

embeddings = np.vstack(df["embedding"].apply(parse_embedding).values)
print(embeddings.shape)


(9622, 1536)


## Dimensionality Reduction with UMAP

In [24]:
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=25,
    metric="cosine",
    random_state=42
)

embeddings_reduced = reducer.fit_transform(embeddings)
print(embeddings_reduced.shape)


  warn(


(9622, 25)


## Clustering with HDBSCAN

In [25]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,
    min_samples=10,
    metric="euclidean",
    cluster_selection_method="eom"
)

labels = clusterer.fit_predict(embeddings_reduced)

df["topic_id"] = labels
df["topic_id"].value_counts().head(10)


topic_id
-1     2384
 18     792
 20     548
 16     520
 19     442
 12     405
 17     299
 24     269
 30     216
 37     207
Name: count, dtype: int64

In [26]:
# Topic filtering 
topics_df = df[df["topic_id"] != -1].copy()

print("Number of topics:", topics_df["topic_id"].nunique())


Number of topics: 63


In [27]:
# Extrtact representative sentences per topic
# We‚Äôll select sentences closest to the topic centroid.
topic_descriptors = []

for topic_id, group in topics_df.groupby("topic_id"):
    topic_embeddings = np.vstack(group["embedding"].apply(parse_embedding))
    centroid = topic_embeddings.mean(axis=0, keepdims=True)

    sims = cosine_similarity(topic_embeddings, centroid).flatten()
    top_idx = sims.argsort()[-5:][::-1]

    reps = group.iloc[top_idx][["text", "speaker_normalized", "published_at"]]

    topic_descriptors.append({
        "topic_id": topic_id,
        "size": len(group),
        "representative_sentences": reps.to_dict(orient="records")
    })


In [28]:
#Extract keywords per topic (TF-IDF)

# Common Spanish stop words
spanish_stopwords = [
    'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'ser', 'se', 'no', 'haber',
    'por', 'con', 'su', 'para', 'como', 'estar', 'tener', 'le', 'lo', 'todo',
    'pero', 'm√°s', 'hacer', 'o', 'poder', 'decir', 'este', 'ir', 'otro', 'ese',
    'la', 'si', 'me', 'ya', 'ver', 'porque', 'dar', 'cuando', '√©l', 'muy',
    'sin', 'vez', 'mucho', 'saber', 'qu√©', 'sobre', 'mi', 'alguno', 'mismo',
    'yo', 'tambi√©n', 'hasta', 'a√±o', 'dos', 'querer', 'entre', 'as√≠', 'primero',
    'desde', 'grande', 'eso', 'ni', 'nos', 'llegar', 'pasar', 'tiempo', 'ella',
    's√≠', 'd√≠a', 'uno', 'bien', 'poco', 'deber', 'entonces', 'poner', 'cosa',
    'tanto', 'hombre', 'parecer', 'nuestro', 'tan', 'donde', 'ahora', 'parte',
    'despu√©s', 'vida', 'quedar', 'siempre', 'creer', 'hablar', 'llevar', 'dejar',
    'nada', 'cada', 'seguir', 'menos', 'nuevo', 'encontrar', 'algo', 'solo',
    'decir', 'saber', 'sentir', 'tomar', 'mano', 'antes', 'mundo', 'aqu√≠',
    'sus', 'les', 'te', 'esta', 'del', 'al', 'los', 'las', 'unos', 'unas', 'd√≠as',
    'buenos', 'buen', 'est√°n', 'c√≥mo', 'tardes', 'buenas', 'permiso',
   'gracias', 'alegr√≠as', 'da√±os', 'todas', 'todos', 'estamos', 'chavas', 'chavos'
]

vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words=spanish_stopwords,
    ngram_range=(1, 2)
)

topic_keywords = {}

for topic_id, group in topics_df.groupby("topic_id"):
    texts = group["text"].tolist()
    tfidf = vectorizer.fit_transform(texts)
    scores = tfidf.mean(axis=0).A1
    terms = np.array(vectorizer.get_feature_names_out())

    top_terms = terms[scores.argsort()[-10:][::-1]]
    topic_keywords[topic_id] = top_terms.tolist()


In [30]:
# Final Topic Summary
final_topics = []

for t in topic_descriptors:
    tid = t["topic_id"]
    final_topics.append({
        "topic_id": tid,
        "size": t["size"],
        "keywords": topic_keywords.get(tid, []),
        "examples": t["representative_sentences"]
    })

final_topics[:2]


[{'topic_id': 0,
  'size': 34,
  'keywords': ['dirige presidenta',
   'mensaje dirige',
   'escuchemos mensaje',
   'escuchemos',
   'dirige',
   'mensaje',
   'mexicanos doctora',
   'constitucional',
   'presidenta constitucional',
   'constitucional estados'],
  'examples': [{'text': 'Estimado p√∫blico, escuchemos el mensaje que nos dirige la Presidenta Constitucional de los Estados Unidos Mexicanos, la Doctora Claudia Sheinbaum Pardo.',
    'speaker_normalized': None,
    'published_at': Timestamp('2025-12-13 19:33:00+0000', tz='UTC')},
   {'text': 'Estimado p√∫blico, escuchemos el mensaje que nos dirige la Presidenta Constitucional de los Estados Unidos Mexicanos, la Doctora Claudia Sheinbaum Pardo.',
    'speaker_normalized': None,
    'published_at': Timestamp('2025-12-14 01:44:00+0000', tz='UTC')},
   {'text': 'Estimado p√∫blico, escuchemos el mensaje que nos dirige la Presidenta Constitucional de los Estados Unidos Mexicanos, Doctora Claudia Sheinbaum Pardo.',
    'speaker_nor

## Improvements Needed

The current approach captures too much formulaic language. Let's implement:
1. **Higher token threshold** - Focus on substantive speeches (50+ tokens)
2. **Better stop words** - Add domain-specific terms
3. **Text preprocessing** - Remove formulaic openings
4. **Adjusted UMAP/HDBSCAN** - Better parameters
5. **Filter very short or repetitive content**


In [34]:
# IMPROVED VERSION - Fetch only substantive content
sql_improved = """
    SELECT
    s.doc_id,
    s.speaker_normalized,
    s.text,
    s.embedding,
    s.token_count,
    m.published_at
    FROM speech_turns s
    JOIN raw_transcripts_meta m
    ON s.doc_id = m.doc_id
    WHERE
    s.embedding IS NOT NULL
    AND s.text IS NOT NULL
    AND s.token_count > 50  -- Substantive speeches only
    AND m.published_at >= '2025-10-01'
    ORDER BY m.published_at DESC;
"""

df_improved = pd.read_sql(sql_improved, conn)
print(f"Loaded {df_improved.shape[0]} substantive speeches")
print(f"Token count stats: min={df_improved['token_count'].min()}, mean={df_improved['token_count'].mean():.0f}, max={df_improved['token_count'].max()}")
df_improved.head()


  df_improved = pd.read_sql(sql_improved, conn)


Loaded 7391 substantive speeches
Token count stats: min=51, mean=212, max=300


Unnamed: 0,doc_id,speaker_normalized,text,embedding,token_count,published_at
0,2025-12-17-conference,Claudia Sheinbaum Pardo,"aci√≥n‚Äù. Y lo tercero, que es hermoso, es regal...","[0.012905889,0.032169674,-0.0027023687,0.00900...",138,2025-12-17 23:44:00+00:00
1,2025-12-17-conference,Claudia Sheinbaum Pardo,"viendo, en donde hac√≠a tres reflexiones que le...","[0.028016198,0.020888483,-0.019381993,0.028196...",300,2025-12-17 23:44:00+00:00
2,2025-12-17-conference,Claudia Sheinbaum Pardo,con los estudiantes en la explanada de Ciudad...,"[0.0053103263,-0.014485596,4.312076e-06,0.0177...",300,2025-12-17 23:44:00+00:00
3,2025-12-17-conference,Claudia Sheinbaum Pardo,Buenas tardes. Me da gusto estar con ustedes. ...,"[0.043162066,-0.018256737,-0.008310742,0.03373...",300,2025-12-17 23:44:00+00:00
4,2025-12-17-conference,,Encabeza esta entrega gratuita de libros ‚Äú25 p...,"[0.045315072,0.04872283,0.016775096,0.03860097...",51,2025-12-17 23:44:00+00:00


In [35]:
# Convert embeddings to matrix
embeddings_improved = np.vstack(df_improved["embedding"].apply(parse_embedding).values)
print(f"Embeddings shape: {embeddings_improved.shape}")


Embeddings shape: (7391, 1536)


In [36]:
# IMPROVED UMAP - Fewer components for better clustering
reducer_improved = umap.UMAP(
    n_neighbors=15,
    n_components=15,  # Reduced from 25
    metric="cosine",
    min_dist=0.0,  # Allow tighter clusters
    random_state=42
)

embeddings_reduced_improved = reducer_improved.fit_transform(embeddings_improved)
print(f"Reduced shape: {embeddings_reduced_improved.shape}")


  warn(


Reduced shape: (7391, 15)


In [37]:
# IMPROVED HDBSCAN - Adjusted parameters
clusterer_improved = hdbscan.HDBSCAN(
    min_cluster_size=50,  # Increased from 30 for more coherent topics
    min_samples=15,  # Increased from 10
    metric="euclidean",
    cluster_selection_method="eom",
    cluster_selection_epsilon=0.5  # Allow merging similar clusters
)

labels_improved = clusterer_improved.fit_predict(embeddings_reduced_improved)
df_improved["topic_id"] = labels_improved

print(f"Number of topics found: {len(set(labels_improved)) - (1 if -1 in labels_improved else 0)}")
print(f"Noise ratio: {(labels_improved == -1).sum() / len(labels_improved):.1%}")
print("\nTopic distribution:")
df_improved["topic_id"].value_counts().head(15)


Number of topics found: 12
Noise ratio: 2.8%

Topic distribution:


topic_id
 11    3102
 8     1203
 2      645
 10     567
 4      480
 9      392
 7      296
-1      206
 5      146
 6      139
 0       91
 3       63
 1       61
Name: count, dtype: int64

In [38]:
# Filter out noise
topics_df_improved = df_improved[df_improved["topic_id"] != -1].copy()
print(f"Topics with assigned clusters: {topics_df_improved.shape[0]} speeches")
print(f"Number of unique topics: {topics_df_improved['topic_id'].nunique()}")


Topics with assigned clusters: 7185 speeches
Number of unique topics: 12


In [43]:
# IMPROVED: Centroid-based keyword extraction (better than pure TF-IDF)
# This aligns descriptors with semantic center, not just frequency

spanish_stopwords_expanded = [
    # Basic stop words
    'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'ser', 'se', 'no', 'haber',
    'por', 'con', 'su', 'para', 'como', 'estar', 'tener', 'le', 'lo', 'todo',
    'pero', 'm√°s', 'hacer', 'o', 'poder', 'decir', 'este', 'ir', 'otro', 'ese',
    'si', 'me', 'ya', 'ver', 'porque', 'dar', 'cuando', '√©l', 'muy', 'sin',
    'vez', 'mucho', 'saber', 'qu√©', 'sobre', 'mi', 'alguno', 'mismo', 'yo',
    'tambi√©n', 'hasta', 'a√±o', 'dos', 'querer', 'entre', 'as√≠', 'primero',
    'desde', 'grande', 'eso', 'ni', 'nos', 'llegar', 'pasar', 'tiempo', 'ella',
    's√≠', 'd√≠a', 'uno', 'bien', 'poco', 'deber', 'entonces', 'poner', 'cosa',
    'tanto', 'hombre', 'parecer', 'nuestro', 'tan', 'donde', 'ahora', 'parte',
    'despu√©s', 'vida', 'quedar', 'siempre', 'creer', 'hablar', 'llevar', 'dejar',
    'nada', 'cada', 'seguir', 'menos', 'nuevo', 'encontrar', 'algo', 'solo',
    'sentir', 'tomar', 'mano', 'antes', 'mundo', 'aqu√≠', 'sus', 'les', 'te',
    'esta', 'del', 'al', 'los', 'las', 'unos', 'unas', 'es', 'una', 'hay', 'est√°', 'ha', 'vamos', 'tenemos',
    # Government/formulaic terms
    'd√≠as', 'buenos', 'buen', 'est√°n', 'c√≥mo', 'tardes', 'buenas', 'permiso',
    'gracias', 'alegr√≠as', 'da√±os', 'todas', 'todos', 'estamos', 'chavas', 'chavos',
    'presidenta', 'presidente', 'constitucional', 'estados', 'unidos', 'mexicanos',
    'doctora', 'doctor', 'mensaje', 'escuchemos', 'dirige', 'estimado', 'p√∫blico',
    'se√±or', 'se√±ora', 'licenciado', 'licenciada', 'secretario', 'secretaria',
    'excelencia', 'honorable', 'distinguido', 'distinguida', 'compa√±eros', 'compa√±eras',
    'bienvenidos', 'bienvenidas', 'saludo', 'saludos', 'presente', 'presentes',
    'asistentes', 'ciudadanos', 'ciudadanas', 'estimados', 'estimadas', 'queridos',
    'queridas', 'apreciados', 'apreciadas', 'nombre', 'representaci√≥n', 'gobierno',
    'federal', 'nacional', 'pardo', 'sheinbaum', 'claudia', 'rep√∫blica', 'm√©xico'
]

topic_keywords_improved = {}
MACRO_TOPIC_THRESHOLD = 1500  # Clusters larger than this are "macro-topics"

for topic_id, group in topics_df_improved.groupby("topic_id"):
    if len(group) < 2:
        continue
    
    is_macro_topic = len(group) > MACRO_TOPIC_THRESHOLD
    
    # Compute cluster centroid embedding
    topic_embeddings = np.vstack(group["embedding"].apply(parse_embedding))
    centroid = topic_embeddings.mean(axis=0, keepdims=True)
    
    # Find speeches closest to centroid (semantic core of the topic)
    sims = cosine_similarity(topic_embeddings, centroid).flatten()
    
    # For macro-topics, use more samples; for regular topics, use fewer
    n_core_samples = 100 if is_macro_topic else 50
    n_core_samples = min(n_core_samples, len(group))
    
    top_idx = sims.argsort()[-n_core_samples:]
    core_texts = group.iloc[top_idx]["text"].tolist()
    
    # Extract keywords from semantic core only (not all documents)
    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words=spanish_stopwords_expanded,
        ngram_range=(1, 3) if not is_macro_topic else (1, 2),  # Simpler n-grams for macro-topics
        min_df=2,
        max_df=0.7
    )
    
    try:
        tfidf = vectorizer.fit_transform(core_texts)
        scores = tfidf.mean(axis=0).A1
        terms = np.array(vectorizer.get_feature_names_out())
        
        # For macro-topics, get more keywords (they're narrative backbones)
        n_keywords = 20 if is_macro_topic else 15
        top_terms = terms[scores.argsort()[-n_keywords:][::-1]]
        
        topic_keywords_improved[topic_id] = {
            "keywords": top_terms.tolist(),
            "is_macro_topic": is_macro_topic,
            "size": len(group)
        }
    except:
        # Handle edge cases
        topic_keywords_improved[topic_id] = {
            "keywords": [],
            "is_macro_topic": is_macro_topic,
            "size": len(group)
        }

print(f"Extracted keywords for {len(topic_keywords_improved)} topics")
macro_count = sum(1 for v in topic_keywords_improved.values() if v["is_macro_topic"])
print(f"  ‚Ä¢ {macro_count} macro-topics (>{MACRO_TOPIC_THRESHOLD} speeches - narrative backbones)")
print(f"  ‚Ä¢ {len(topic_keywords_improved) - macro_count} regular topics")


Extracted keywords for 12 topics
  ‚Ä¢ 1 macro-topics (>1500 speeches - narrative backbones)
  ‚Ä¢ 11 regular topics


In [44]:
# Extract representative sentences for improved topics
topic_descriptors_improved = []

for topic_id, group in topics_df_improved.groupby("topic_id"):
    if len(group) < 2:
        continue
        
    topic_embeddings = np.vstack(group["embedding"].apply(parse_embedding))
    centroid = topic_embeddings.mean(axis=0, keepdims=True)
    
    sims = cosine_similarity(topic_embeddings, centroid).flatten()
    top_idx = sims.argsort()[-5:][::-1]
    
    reps = group.iloc[top_idx][["text", "speaker_normalized", "published_at", "token_count"]]
    
    topic_descriptors_improved.append({
        "topic_id": topic_id,
        "size": len(group),
        "representative_sentences": reps.to_dict(orient="records")
    })

print(f"Descriptors created for {len(topic_descriptors_improved)} topics")


Descriptors created for 12 topics


In [45]:
# Build improved final topics summary with macro-topic labels
final_topics_improved = []

for t in topic_descriptors_improved:
    tid = t["topic_id"]
    topic_info = topic_keywords_improved.get(tid, {"keywords": [], "is_macro_topic": False, "size": 0})
    
    final_topics_improved.append({
        "topic_id": tid,
        "size": t["size"],
        "is_macro_topic": topic_info["is_macro_topic"],
        "keywords": topic_info["keywords"][:10],  # Top 10 keywords
        "examples": t["representative_sentences"][:3]  # Top 3 examples
    })

# Sort by size (largest topics first)
final_topics_improved.sort(key=lambda x: x["size"], reverse=True)

print(f"Total topics discovered: {len(final_topics_improved)}")

# Count macro-topics
macro_topics = [t for t in final_topics_improved if t["is_macro_topic"]]
regular_topics = [t for t in final_topics_improved if not t["is_macro_topic"]]

print(f"  ‚Ä¢ {len(macro_topics)} MACRO-TOPICS (narrative backbones, >1500 speeches)")
print(f"  ‚Ä¢ {len(regular_topics)} regular topics")

print("\n" + "="*80)
print("TOP 5 TOPICS BY SIZE")
print("="*80)

for i, topic in enumerate(final_topics_improved[:5], 1):
    topic_type = "üî¥ MACRO-TOPIC" if topic["is_macro_topic"] else "üü¢ Regular Topic"
    print(f"\n{topic_type}")
    print(f"üìå TOPIC {topic['topic_id']} ({topic['size']} speeches)")
    print(f"Keywords: {', '.join(topic['keywords'][:8])}")
    print(f"\nExample excerpt: {topic['examples'][0]['text'][:200]}...")
    print("-"*80)


Total topics discovered: 12
  ‚Ä¢ 1 MACRO-TOPICS (narrative backbones, >1500 speeches)
  ‚Ä¢ 11 regular topics

TOP 5 TOPICS BY SIZE

üî¥ MACRO-TOPIC
üìå TOPIC 11 (3102 speeches)
Keywords: seguridad, pueblo, michoac√°n, hemos, muchas, tiene, tema, nosotros

Example excerpt:  luchamos contra eso. Entonces, se cae por s√≠ mismo. Que ‚Äúla Presidenta protege a delincuentes‚Äù. Bueno, el Consejo de Seguridad de M√©xico es el √∫nico que ha enviado delincuentes de un lado y del otro,...
--------------------------------------------------------------------------------

üü¢ Regular Topic
üìå TOPIC 8 (1203 speeches)
Keywords: caminos, mil, trabajando, son, va, estado, hidalgo, puentes

Example excerpt:  liberando todo el camino. Est√° all√°, hoy llega el subsecretario de Infraestructura all√°, para apoyar las labores; la subsecretaria de Transportes est√° en Hidalgo; estamos con los directores generales...
--------------------------------------------------------------------------------

üü¢

In [46]:
# Detailed view of all topics with macro-topic indicators
for topic in final_topics_improved:
    topic_type = "üî¥ MACRO-TOPIC (Narrative Backbone)" if topic["is_macro_topic"] else "üü¢ Regular Topic"
    
    print(f"\n{'='*80}")
    print(f"{topic_type}")
    print(f"TOPIC {topic['topic_id']} - Size: {topic['size']} speeches")
    print(f"{'='*80}")
    
    if topic["is_macro_topic"]:
        print("\n‚ö†Ô∏è  This is a macro-topic (>1500 speeches)")
        print("   ‚Ä¢ Don't expect sharp keywords - it's a broad narrative backbone")
        print("   ‚Ä¢ Keywords below are from the semantic core (centroid-based)")
        print("   ‚Ä¢ Consider sub-clustering this topic for finer granularity\n")
    
    print(f"üîë Keywords (extracted from semantic centroid):")
    for kw in topic['keywords'][:10]:
        print(f"   ‚Ä¢ {kw}")
    
    print(f"\nüìù Representative Examples (closest to centroid):")
    for i, ex in enumerate(topic['examples'][:2], 1):
        print(f"\n   Example {i} ({ex.get('token_count', 'N/A')} tokens):")
        print(f"   Speaker: {ex['speaker_normalized'] or 'Unknown'}")
        print(f"   Date: {ex['published_at']}")
        print(f"   Text: {ex['text'][:300]}...")



üî¥ MACRO-TOPIC (Narrative Backbone)
TOPIC 11 - Size: 3102 speeches

‚ö†Ô∏è  This is a macro-topic (>1500 speeches)
   ‚Ä¢ Don't expect sharp keywords - it's a broad narrative backbone
   ‚Ä¢ Keywords below are from the semantic core (centroid-based)
   ‚Ä¢ Consider sub-clustering this topic for finer granularity

üîë Keywords (extracted from semantic centroid):
   ‚Ä¢ seguridad
   ‚Ä¢ pueblo
   ‚Ä¢ michoac√°n
   ‚Ä¢ hemos
   ‚Ä¢ muchas
   ‚Ä¢ tiene
   ‚Ä¢ tema
   ‚Ä¢ nosotros
   ‚Ä¢ son
   ‚Ä¢ justicia

üìù Representative Examples (closest to centroid):

   Example 1 (300 tokens):
   Speaker: Claudia Sheinbaum Pardo
   Date: 2025-12-02 15:26:00+00:00
   Text:  luchamos contra eso. Entonces, se cae por s√≠ mismo. Que ‚Äúla Presidenta protege a delincuentes‚Äù. Bueno, el Consejo de Seguridad de M√©xico es el √∫nico que ha enviado delincuentes de un lado y del otro, y del otro, y del otro, a Estados Unidos, extraditados o enviados. O sea, cuando pas√≥ eso dec√≠an un...

   Example 2 

## Summary of Improvements Applied

‚úÖ **Centroid-based keyword extraction** - Keywords are extracted from speeches closest to the cluster centroid (semantic core), not all documents. This gives semantically-aligned descriptors rather than just frequency-based ones.

‚úÖ **Macro-topic labeling** - Clusters with >1500 speeches are labeled as "macro-topics" (narrative backbones). These don't have sharp keywords by nature and represent broad thematic areas. They can be sub-clustered later for finer granularity.

‚úÖ **Higher token threshold (50+)** - Filters out formulaic/ceremonial language

‚úÖ **Optimized UMAP/HDBSCAN** - Better clustering parameters for political corpora

‚úÖ **Enhanced stop words** - Domain-specific terms filtered out


In [47]:
# OPTIONAL: Sub-cluster a macro-topic for finer granularity
# Pick a macro-topic ID and run this cell to break it down further

MACRO_TOPIC_TO_SUBCLUSTER = 0  # Change this to the macro-topic ID you want to split

# Filter to just this macro-topic
macro_df = df_improved[df_improved["topic_id"] == MACRO_TOPIC_TO_SUBCLUSTER].copy()

if len(macro_df) > 0:
    print(f"Sub-clustering macro-topic {MACRO_TOPIC_TO_SUBCLUSTER} ({len(macro_df)} speeches)")
    
    # Extract embeddings
    macro_embeddings = np.vstack(macro_df["embedding"].apply(parse_embedding).values)
    
    # Reduce with UMAP
    sub_reducer = umap.UMAP(
        n_neighbors=15,
        n_components=10,
        metric="cosine",
        min_dist=0.0,
        random_state=42
    )
    macro_reduced = sub_reducer.fit_transform(macro_embeddings)
    
    # Cluster with HDBSCAN (more granular)
    sub_clusterer = hdbscan.HDBSCAN(
        min_cluster_size=100,  # Smaller clusters
        min_samples=20,
        metric="euclidean",
        cluster_selection_method="eom"
    )
    sub_labels = sub_clusterer.fit_predict(macro_reduced)
    
    macro_df["sub_topic_id"] = sub_labels
    
    print(f"\nSub-topics found: {len(set(sub_labels)) - (1 if -1 in sub_labels else 0)}")
    print(f"Noise: {(sub_labels == -1).sum()} speeches")
    print("\nSub-topic distribution:")
    print(macro_df["sub_topic_id"].value_counts().head(10))
else:
    print(f"Macro-topic {MACRO_TOPIC_TO_SUBCLUSTER} not found or is not a macro-topic")


Sub-clustering macro-topic 0 (91 speeches)


  warn(



Sub-topics found: 0
Noise: 91 speeches

Sub-topic distribution:
sub_topic_id
-1    91
Name: count, dtype: int64


## LLM Labelling

In [48]:
def build_topic_prompt(topic):
    examples = "\n".join(
        f"- {ex['text'][:300]}"
        for ex in topic["examples"][:5]
    )

    topic_type = "macro-topic" if topic["size"] > 1500 else "regular topic"

    prompt = f"""
You are analyzing political speech topics discovered using semantic clustering.

This topic is a {topic_type}.

Keywords:
{", ".join(topic["keywords"])}

Representative excerpts:
{examples}

Tasks:
1. Provide a concise topic label (3‚Äì6 words).
2. Write a brief description (2‚Äì3 sentences) explaining what this topic represents.
3. If this is a macro-topic, explain why it functions as a narrative backbone.

Do NOT invent information.
Base your answer strictly on the provided material.
"""

    return prompt.strip()


In [54]:
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
print("‚úÖ Environment variables loaded from .env")

# Initialize Azure OpenAI client
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")
)

azure_openai_chat_deployment = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4.1")

def label_topic_with_llm(topic, client, deployment=azure_openai_chat_deployment):
    prompt = build_topic_prompt(topic)

    response = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": "You are a careful political discourse analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2
    )

    return response.choices[0].message.content

print("‚úÖ Azure OpenAI client initialized")
print(f"Using deployment: {azure_openai_chat_deployment}")


‚úÖ Environment variables loaded from .env
‚úÖ Azure OpenAI client initialized
Using deployment: gpt-4.1


In [55]:
# Test LLM labeling on the top 3 topics first
print("Generating LLM labels for top 3 topics...")
print("="*80)

for i, topic in enumerate(final_topics_improved[:3], 1):
    print(f"\nüè∑Ô∏è  Labeling Topic {topic['topic_id']} (Size: {topic['size']})...")
    
    try:
        label = label_topic_with_llm(topic, client)
        topic["llm_label"] = label
        print(f"\n{label}")
        print("-"*80)
    except Exception as e:
        print(f"‚ùå Error labeling topic {topic['topic_id']}: {e}")
        topic["llm_label"] = None

print("\n‚úÖ Test labeling complete!")


Generating LLM labels for top 3 topics...

üè∑Ô∏è  Labeling Topic 11 (Size: 3102)...

1. Topic Label  
Security, Justice, and Governance in Michoac√°n

2. Brief Description  
This topic centers on issues of public security and justice in Michoac√°n, highlighting government strategies to combat crime, strengthen institutions, and address extortion affecting local industries. It emphasizes collective governmental action, extradition of criminals, and the need for robust law enforcement and judicial systems.

3. Macro-topic Explanation  
As a macro-topic, this functions as a narrative backbone because it integrates core themes of governance, security, and justice that underpin broader political discourse. It connects specific regional concerns (Michoac√°n, extortion, local industries) with national strategies and institutional reforms, serving as a foundation for related discussions on policy, leadership, and public trust.
-----------------------------------------------------------------

In [56]:
# Label ALL topics (this may take a few minutes)
import time

print(f"Generating LLM labels for ALL {len(final_topics_improved)} topics...")
print("This may take a few minutes...\n")

labeled_count = 0
error_count = 0

for i, topic in enumerate(final_topics_improved, 1):
    # Skip if already labeled
    if "llm_label" in topic and topic["llm_label"]:
        labeled_count += 1
        continue
    
    try:
        print(f"[{i}/{len(final_topics_improved)}] Labeling Topic {topic['topic_id']} (Size: {topic['size']})...", end=" ")
        label = label_topic_with_llm(topic, client)
        topic["llm_label"] = label
        labeled_count += 1
        print("‚úÖ")
        
        # Small delay to avoid rate limits
        time.sleep(0.5)
        
    except Exception as e:
        print(f"‚ùå Error: {e}")
        topic["llm_label"] = None
        error_count += 1

print(f"\n{'='*80}")
print(f"‚úÖ Labeling complete!")
print(f"   Successfully labeled: {labeled_count}/{len(final_topics_improved)}")
if error_count > 0:
    print(f"   Errors: {error_count}")


Generating LLM labels for ALL 12 topics...
This may take a few minutes...

[4/12] Labeling Topic 10 (Size: 567)... ‚úÖ
[5/12] Labeling Topic 4 (Size: 480)... ‚úÖ
[6/12] Labeling Topic 9 (Size: 392)... ‚úÖ
[7/12] Labeling Topic 7 (Size: 296)... ‚úÖ
[8/12] Labeling Topic 5 (Size: 146)... ‚úÖ
[9/12] Labeling Topic 6 (Size: 139)... ‚úÖ
[10/12] Labeling Topic 0 (Size: 91)... ‚úÖ
[11/12] Labeling Topic 3 (Size: 63)... ‚úÖ
[12/12] Labeling Topic 1 (Size: 61)... ‚úÖ

‚úÖ Labeling complete!
   Successfully labeled: 12/12


In [57]:
# Display all labeled topics with LLM-generated descriptions
print("="*80)
print("FINAL LABELED TOPICS")
print("="*80)

for topic in final_topics_improved:
    topic_type = "üî¥ MACRO-TOPIC" if topic["is_macro_topic"] else "üü¢ Regular Topic"
    
    print(f"\n{topic_type}")
    print(f"üìä TOPIC {topic['topic_id']} | Size: {topic['size']} speeches")
    print("-"*80)
    
    if topic.get("llm_label"):
        print(f"\n{topic['llm_label']}")
    else:
        print("\n‚ùå No LLM label generated")
    
    print(f"\nüîë Top Keywords: {', '.join(topic['keywords'][:5])}")
    print("="*80)


FINAL LABELED TOPICS

üî¥ MACRO-TOPIC
üìä TOPIC 11 | Size: 3102 speeches
--------------------------------------------------------------------------------

1. Topic Label  
Security, Justice, and Governance in Michoac√°n

2. Brief Description  
This topic centers on issues of public security and justice in Michoac√°n, highlighting government strategies to combat crime, strengthen institutions, and address extortion affecting local industries. It emphasizes collective governmental action, extradition of criminals, and the need for robust law enforcement and judicial systems.

3. Macro-topic Explanation  
As a macro-topic, this functions as a narrative backbone because it integrates core themes of governance, security, and justice that underpin broader political discourse. It connects specific regional concerns (Michoac√°n, extortion, local industries) with national strategies and institutional reforms, serving as a foundation for related discussions on policy, leadership, and public tr

In [58]:
# Save labeled topics to JSON file for later use
output_data = []

for topic in final_topics_improved:
    output_data.append({
        "topic_id": int(topic["topic_id"]),
        "size": int(topic["size"]),
        "is_macro_topic": bool(topic["is_macro_topic"]),
        "keywords": topic["keywords"],
        "llm_label": topic.get("llm_label", ""),
        "examples": [
            {
                "text": ex["text"],
                "speaker": ex["speaker_normalized"],
                "date": ex["published_at"].isoformat() if hasattr(ex["published_at"], "isoformat") else str(ex["published_at"]),
                "token_count": int(ex.get("token_count", 0))
            }
            for ex in topic["examples"]
        ]
    })

output_file = "topics_labeled_q4_2025.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved {len(output_data)} labeled topics to {output_file}")
print(f"   ‚Ä¢ {sum(1 for t in output_data if t['is_macro_topic'])} macro-topics")
print(f"   ‚Ä¢ {sum(1 for t in output_data if not t['is_macro_topic'])} regular topics")


‚úÖ Saved 12 labeled topics to topics_labeled_q4_2025.json
   ‚Ä¢ 1 macro-topics
   ‚Ä¢ 11 regular topics
