In [0]:
# ============================================================================
# INSTALLATION
# ============================================================================
!pip install transformers accelerate torch bertopic sentence-transformers scikit-learn pandas pyarrow hf-transfer seaborn

In [0]:
# ============================================================================
# IMPORTS
# ============================================================================
import pandas as pd
import torch
import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from IPython.display import display

# ============================================================================
# CUSTOM EMBEDDER
# ============================================================================
class CustomEmbedder:
    """Adapter that turns a Hugging Face encoder + tokenizer into an object
    with a sentence-transformers-style ``encode`` method.

    Each text is embedded as the mean of the model's last hidden states over
    its *real* tokens (padding positions are excluded via the attention mask).
    """

    def __init__(self, model, tokenizer):
        """Store the encoder and its tokenizer.

        Args:
            model: Callable returning an object with ``last_hidden_state`` and
                exposing a ``device`` attribute.
            tokenizer: Callable accepting a list of strings and returning a
                mapping of tensors (``input_ids``, ``attention_mask``, ...).
        """
        self.model = model
        self.tokenizer = tokenizer

    def encode(self, texts, batch_size=8, **kwargs):
        """Embed ``texts`` and return a 2-D float32 NumPy array.

        Args:
            texts: A single string or an iterable of strings.
            batch_size: Number of texts per forward pass. Processing in
                batches keeps memory bounded on large corpora; falsy values
                fall back to 8.
            **kwargs: Accepted and ignored so callers may pass
                sentence-transformers style options (e.g. a progress-bar
                flag) without error.

        Returns:
            ``numpy.ndarray`` of shape ``(len(texts), hidden_dim)``; an array
            with zero rows when ``texts`` is empty.
        """
        # A bare string must become a one-element batch; slicing it directly
        # would split it into characters.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        if not texts:
            return torch.empty((0, 0), dtype=torch.float32).numpy()

        batch_size = max(1, int(batch_size or 8))
        pooled_batches = []

        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)

            # Pool in float32: summing half-precision activations over up to
            # 512 positions loses precision.
            hidden = outputs.last_hidden_state.float()

            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask available: fall back to a plain mean over positions.
                pooled = hidden.mean(dim=1)
            else:
                # Weight each position by its mask value so padding tokens do
                # not contaminate the embedding of shorter texts in the batch.
                mask = mask.to(hidden.device).unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1.0)
                pooled = (hidden * mask).sum(dim=1) / token_counts

            # Move each batch to CPU immediately to free accelerator memory.
            pooled_batches.append(pooled.cpu())

        return torch.cat(pooled_batches, dim=0).numpy()

# ============================================================================
# MAIN ANALYSIS
# ============================================================================
# --- Embedding model --------------------------------------------------------
# The checkpoint is loaded in half precision and placed automatically across
# the available devices. trust_remote_code=True executes code shipped with the
# checkpoint, so only use it with a repository you trust.
print("Loading model...")
MODEL_NAME = "nvidia/llama-embed-nemotron-8b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
print(f"✓ Loaded {MODEL_NAME}")

# --- Corpus -----------------------------------------------------------------
# Rows without an English translation are excluded from the document list;
# the full frame is kept in `df`.
print("\nLoading data...")
articles_table = pq.read_table("export_articles_translated.parquet")
df = articles_table.to_pandas()
docs = df["content_english"].dropna().tolist()
print(f"✓ Loaded {len(docs)} documents")

# ============================================================================
# CONFIGURE UMAP AND HDBSCAN PARAMETERS
# ============================================================================
# Dimensionality reduction (UMAP).
# NOTE(review): BERTopic's stock UMAP settings are believed to be
# n_neighbors=15, n_components=5, min_dist=0.0 — confirm against the
# installed version before quoting them as defaults.
umap_params = dict(
    n_neighbors=20,      # higher = more global structure (range roughly 2-100)
    n_components=5,      # number of dimensions to reduce to
    min_dist=0.0,        # minimum distance between embedded points (0.0-0.99)
    metric='cosine',     # distance metric in the original embedding space
    random_state=42,     # fixed seed for reproducibility
)
umap_model = UMAP(**umap_params)

# Density-based clustering (HDBSCAN) on the reduced vectors.
hdbscan_params = dict(
    min_cluster_size=220,            # raise for fewer, larger topics
    min_samples=15,                  # higher = more conservative clustering
    metric='euclidean',              # distance metric in the reduced space
    cluster_selection_method='eom',  # 'eom' or 'leaf'
    prediction_data=True,            # needed for transform on new documents
)
hdbscan_model = HDBSCAN(**hdbscan_params)

# Setup BERTopic
print("\nSetting up BERTopic...")
embedding_model = CustomEmbedder(model, tokenizer)

# Stop words: scikit-learn's English list plus corpus-specific noise tokens
# ('said'; 'efe' is presumably a news-agency byline — confirm with the data).
# ENGLISH_STOP_WORDS is a frozenset, so it is sorted here to give the
# vectorizer a deterministic list across runs.
custom_stopwords = sorted(set(ENGLISH_STOP_WORDS) | {'said', 'efe'})
vectorizer_model = CountVectorizer(stop_words=custom_stopwords)

# Wire the custom embedder, reducer, clusterer and vectorizer together.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True
)
print("✓ BERTopic configured")

# Run analysis
print("\nRunning topic modeling...")
topics, probs = topic_model.fit_transform(docs)

# Add topics to dataframe. `docs` was built from the non-null
# "content_english" rows, so the same filter yields rows in the same order.
df_clean = df.dropna(subset=["content_english"]).copy()
assert len(df_clean) == len(topics), (
    f"Topic assignments ({len(topics)}) do not match documents ({len(df_clean)})"
)
df_clean["topic"] = topics

# Topic id -1 is the outlier bucket, not a real topic, so it is excluded from
# the topic count and reported separately.
n_real_topics = len(set(topics) - {-1})
n_outlier_docs = list(topics).count(-1)

# Display results
print(f"\n{'='*60}")
print(f"RESULTS: Found {n_real_topics} topics ({n_outlier_docs} outlier documents unassigned)")
print(f"{'='*60}\n")

# Show topic info
topic_info = topic_model.get_topic_info()
display(topic_info)

# Show sample of articles with topics
print(f"\n{'='*60}")
print("SAMPLE ARTICLES WITH TOPICS")
print(f"{'='*60}\n")
display(df_clean[["content_english", "topic"]].head(10))

# Optional: Save results (uncomment if needed)
# df_clean.to_parquet("articles_with_topics.parquet", index=False)
# topic_model.save("bertopic_model")
# print("\n✓ Saved to: articles_with_topics.parquet and bertopic_model/")

In [0]:
# ============================================================================
# SAVE RESULTS TO CSV
# ============================================================================
# Add topic labels to the dataframe
print("\nPreparing data for export...")

# Define your custom topic labels.
# NOTE: these labels were written for one specific clustering run; topic ids
# can change if the data or the model parameters change.
custom_topic_labels = {
    -1: "Outliers / Unassigned",
    0: "Latin American Politics",
    1: "Swiss Domestic Affairs",
    2: "Africa & Middle East",
    3: "International Sports News",
    4: "Law, Crime, Public Safety",
    5: "Arts & Culture",
    6: "Business & Economics",
    7: "International Security & Military Affairs",
    8: "International Trade & Geopolitics",
    9: "Natural Disaster & Humanitarian Response",
    10: "US Domestic Affairs",
    11: "Climate Action & Policy",
    12: "Business & Economics in Latin America"
}

# Map topics to custom labels. Ids without a custom label fall back to a
# generic "Topic <id>" name instead of NaN, because NaN labels would be
# silently dropped by the value_counts/groupby calls in later cells.
df_clean['topic_label'] = df_clean['topic'].map(
    lambda topic_id: custom_topic_labels.get(topic_id, f"Topic {topic_id}")
)

unlabeled_ids = sorted(set(df_clean['topic']) - set(custom_topic_labels))
if unlabeled_ids:
    print(f"⚠ No custom label for topic ids {unlabeled_ids}; using generic names")

# Save all articles with topics
output_file = "articles_with_topics.csv"
df_clean.to_csv(output_file, index=False, encoding='utf-8')
print(f"✓ Saved all articles to: {output_file}")

In [0]:
import matplotlib.pyplot as plt
import numpy as np

# Share of the corpus per topic label, smallest first so the largest bar ends
# up at the top of the horizontal chart.
topic_counts = df_clean['topic_label'].value_counts()
topic_share = topic_counts / len(df_clean) * 100
topic_percentages = topic_share.sort_values(ascending=True)

# One colour per bar, sampled from the middle-to-upper range of viridis
# (other options: 'plasma', 'cividis', 'coolwarm').
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(topic_percentages)))

fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(topic_percentages.index, topic_percentages.values, color=colors)

# Write the percentage just to the right of each bar; bar k sits at y = k.
for row, pct in enumerate(topic_percentages.values):
    ax.text(
        pct + 0.3,
        row,
        f'{pct:.1f}%',
        va='center',
        fontsize=10,
        fontweight='bold',
    )

# Minimal styling: label only the x-axis, drop the top/right frame lines and
# add a faint vertical grid.
ax.set_xlabel('Percentage of Corpus (%)', fontsize=11)
ax.set_ylabel('')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.grid(axis='x', alpha=0.3, linestyle='--')

plt.tight_layout()
plt.savefig('topic_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved chart to: topic_distribution.png")

In [0]:
# ============================================================================
# TOPIC COUNTS OVER TIME - SMALL MULTIPLES (COMPACT)
# ============================================================================
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime

# ============================================================================
# LOAD DATA
# ============================================================================
# NOTE: this rebinds `df` to the labelled CSV export, replacing the frame
# loaded from parquet earlier in the notebook.
print("Loading data...")
df = pd.read_csv("articles_with_topics.csv")
print(f"✓ Loaded {len(df)} articles")

# Parse the release timestamps once, then keep both the full timestamp and the
# calendar day (used for the daily aggregation).
release_ts = pd.to_datetime(df['releaseDate'])
df = df.assign(releaseDate=release_ts, date=release_ts.dt.date)
print(f"Date range: {df['releaseDate'].min()} to {df['releaseDate'].max()}")

# Fixed window shown on every x-axis (hard-coded; adjust for other exports).
date_min = datetime(2025, 10, 27)
date_max = datetime(2025, 11, 5)

# ============================================================================
# PREPARE DATA
# ============================================================================
# Article totals per topic label, largest first.
topic_totals = (
    df.groupby('topic_label')
    .size()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
)

# Panel order: real topics by descending size, with the outlier bucket moved
# to the very end regardless of how large it is.
ranked_labels = topic_totals['topic_label']
outliers_mask = ranked_labels.eq('Outliers / Unassigned')
outliers_topic = ranked_labels[outliers_mask].tolist()
other_topics = ranked_labels[~outliers_mask].tolist()
topics = [*other_topics, *outliers_topic]

# Number of articles per (topic, calendar day).
daily_counts = df.groupby(['topic_label', 'date']).size().reset_index(name='count')

# ============================================================================
# CREATE SMALL MULTIPLES (COMPACT)
# ============================================================================
print("\nCreating compact small multiples chart...")
n_topics = len(topics)
n_cols = 4  # More columns = more compact
# At least one row so plt.subplots() stays valid even with no topics.
n_rows = max(1, (n_topics + n_cols - 1) // n_cols)

# Fixed y-axis ceilings so panels are directly comparable; the outlier bucket
# gets its own, larger scale.
# NOTE(review): hard-coded for the current export — counts above these values
# would be clipped from view.
OUTLIER_LABEL = 'Outliers / Unassigned'
Y_MAX_OUTLIERS = 385
Y_MAX_TOPICS = 190

# Build a continuous daily calendar spanning the data. Days on which a topic
# has no articles are missing from `daily_counts`; without re-indexing, the
# line would interpolate across them instead of dropping to zero.
counts_by_day = daily_counts.assign(date=pd.to_datetime(daily_counts['date']))
if counts_by_day.empty:
    all_days = pd.DatetimeIndex([])
else:
    all_days = pd.date_range(counts_by_day['date'].min(),
                             counts_by_day['date'].max(), freq='D')

# squeeze=False always returns a 2-D array, so a single ravel() handles any
# grid shape uniformly.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 2.5*n_rows), squeeze=False)
axes = axes.ravel()

for idx, topic in enumerate(topics):
    ax = axes[idx]

    # Daily series for this topic, with zero-count days filled in.
    topic_series = (
        counts_by_day.loc[counts_by_day['topic_label'] == topic]
        .set_index('date')['count']
        .reindex(all_days, fill_value=0)
    )

    # Calculate total (unchanged by the zero fill)
    total = int(topic_series.sum())

    # Plot with label for legend
    ax.plot(topic_series.index, topic_series.values,
            color='steelblue', linewidth=1.5, marker='o', markersize=3,
            markerfacecolor='steelblue', markeredgecolor='white', markeredgewidth=0.5,
            label=f'Total: {total:,}')

    # Smaller title with less padding
    ax.set_title(topic, fontsize=9, fontweight='bold', pad=5)

    ax.set_ylim(0, Y_MAX_OUTLIERS if topic == OUTLIER_LABEL else Y_MAX_TOPICS)

    ax.grid(axis='y', alpha=0.3, linestyle='--')

    # Format x-axis
    ax.set_xlim(date_min, date_max)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))  # Shorter date format
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=2))
    plt.setp(ax.xaxis.get_majorticklabels(), fontsize=7)

    # Add legend (text only, no marker or line)
    ax.legend(loc='upper right', fontsize=7, framealpha=0.9, handlelength=0, handletextpad=0, markerscale=0)

# Hide the unused panels in the last row.
for idx in range(n_topics, len(axes)):
    axes[idx].axis('off')

plt.tight_layout(h_pad=1.5, w_pad=1.5)
plt.savefig('topics_over_time_small_multiples.png', dpi=300, bbox_inches='tight')
print("✓ Saved: topics_over_time_small_multiples.png")
plt.show()
plt.close()