# Nowcasting Coup Events

- Using news headlines from **January 2017 to December 2024**
- Focus on **four countries**: Belarus, Bolivia, DR Congo, and Mali
- Objective: **train a coup event detector** using a **support set labeled with GPT-generated synthetic annotations**

In [None]:
import pandas as pd

# Load the headline corpus (the query set) from disk and display it.
df_corpus = pd.read_csv(filepath_or_buffer="input/df_corpus.csv")
df_corpus

In [None]:
# Support set: the labeled examples that serve as prototypes for scoring.
df_support = pd.read_csv(filepath_or_buffer="input/support.csv")
df_support.head(n=5)

In [None]:
from FewShotX import Embeddings

# Embed the `title` column of the support set with the all-MiniLM-L6-v2 model.
# NOTE(review): `embed_df` presumably returns the input columns plus one
# column per embedding dimension whose name contains 'emb' -- confirm against
# FewShotX, since downstream scoring selects embedding columns by that substring.
embedder = Embeddings(model_name='all-MiniLM-L6-v2')
df_support_embed = embedder.embed_df(df_support, text_col='title')
df_support_embed.head()

In [None]:
from FewShotX import Embeddings

# Toggle: True re-embeds the whole corpus, False reuses embeddings stored on disk.
run_embeddings_query = False

if not run_embeddings_query:
    # Precomputed query-set embeddings
    df_query_embed = pd.read_csv('temp/df_embed.csv')
else:
    # Embed the query set (the `title` column of the corpus)
    embedder = Embeddings(model_name='all-MiniLM-L6-v2')
    df_query_embed = embedder.embed_df(df_corpus, text_col='title')

df_query_embed.head()

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

class ProtoScorer:
    """Score query embeddings against a fixed set of prototype embeddings.

    A DataFrame column is treated as an embedding dimension when its name
    contains the substring ``'emb'``; every other column is metadata.

    Scores are rescaled to ``[0, 1]``:

    * ``'cosine'`` -- cosine similarity mapped through ``(sim + 1) / 2``.
    * ``'dot'`` -- raw dot products, min-max scaled over the whole score
      matrix of a single call. A score therefore depends on which other
      queries are scored in the same call.
    """

    def __init__(self, prototype_df, dtype=np.float32, similarity='cosine'):
        """Store the prototype matrix and pick the similarity function.

        Args:
            prototype_df: DataFrame with one prototype per row.
            dtype: NumPy dtype used for the embeddings and the returned scores.
            similarity: ``'cosine'`` or ``'dot'``.

        Raises:
            ValueError: If ``similarity`` is unsupported, or if
                ``prototype_df`` yields an empty prototype matrix (no rows or
                no embedding columns).
        """
        if similarity not in ('cosine', 'dot'):
            raise ValueError("Unsupported similarity. Use 'cosine' or 'dot'.")

        self.dtype = dtype
        self.similarity_type = similarity

        # Extract prototype embeddings
        emb_cols = self._embedding_columns(prototype_df)
        self.prototype_embeddings = prototype_df[emb_cols].values.astype(dtype)

        # Without this check the 'dot' path would silently return all-zero
        # scores for a frame that has no embedding columns.
        if self.prototype_embeddings.size == 0:
            raise ValueError(
                "prototype_df must contain at least one row and at least one "
                "embedding column (a column whose name contains 'emb')."
            )

        if similarity == 'cosine':
            # Prototypes are normalized once here; queries at scoring time.
            self.prototype_embeddings = normalize(self.prototype_embeddings, axis=1)
            self.similarity_fn = self._cosine_similarity
        else:
            self.similarity_fn = self._dot_product

    @staticmethod
    def _is_embedding_column(col):
        """Return True when the column label contains the substring 'emb'."""
        # str() keeps the test working for non-string labels (e.g. integers).
        return 'emb' in str(col)

    @classmethod
    def _embedding_columns(cls, df):
        """Return the labels of the embedding columns of ``df``, in order."""
        return [col for col in df.columns if cls._is_embedding_column(col)]

    def _cosine_similarity(self, queries, prototypes):
        """Cosine similarity of each query to each (pre-normalized) prototype."""
        queries = normalize(queries, axis=1)
        sim = np.dot(queries, prototypes.T)
        return (sim + 1) / 2  # map cosine similarity from [-1, 1] to [0, 1]

    def _dot_product(self, queries, prototypes):
        """Dot product of each query with each prototype, min-max scaled."""
        sim = np.dot(queries, prototypes.T)
        # Min-max scaling over the whole matrix; the epsilon avoids a division
        # by zero when all dot products are equal.
        sim_min, sim_max = sim.min(), sim.max()
        return (sim - sim_min) / (sim_max - sim_min + 1e-8)  # scale to [0, 1]

    def score(self, query_embeddings):
        """Return the similarity matrix for ``query_embeddings``.

        Args:
            query_embeddings: Array-like with one query embedding per row.

        Returns:
            Array of ``self.dtype`` with one row per query and one column per
            prototype. An empty 2-D input yields an empty
            ``(0, n_prototypes)`` array.
        """
        queries = np.asarray(query_embeddings)
        if queries.ndim == 2 and queries.shape[0] == 0:
            # Nothing to score: min/max (dot) and normalize (cosine) both
            # reject empty input, so answer directly.
            return np.empty((0, self.prototype_embeddings.shape[0]), dtype=self.dtype)
        return np.array(self.similarity_fn(queries, self.prototype_embeddings), dtype=self.dtype)

    def score_dataframe(self, df):
        """Score every row of ``df`` and return metadata plus scores.

        Args:
            df: DataFrame with one query per row; embedding columns are those
                whose name contains ``'emb'``.

        Returns:
            DataFrame with the non-embedding columns of ``df`` (index reset),
            one ``score_<i>`` column per prototype, and ``mean_score`` (the
            row-wise mean of the ``score_<i>`` columns).

        Raises:
            ValueError: If the number of embedding columns in ``df`` differs
                from the prototype embedding dimension.
        """
        # Extract query embeddings
        query_embeddings = df[self._embedding_columns(df)].values.astype(self.dtype)

        expected_dim = self.prototype_embeddings.shape[1]
        if query_embeddings.shape[1] != expected_dim:
            raise ValueError(
                f"Embedding dimension mismatch: queries have "
                f"{query_embeddings.shape[1]} embedding columns, "
                f"prototypes have {expected_dim}."
            )

        # Compute similarity scores
        scores = self.score(query_embeddings)
        mean_score = np.mean(scores, axis=1)

        # Create a DataFrame of similarity scores
        score_columns = [f'score_{i}' for i in range(self.prototype_embeddings.shape[0])]
        scores_df = pd.DataFrame(scores, columns=score_columns, dtype=self.dtype)

        # Merge with metadata and add mean score; the index is reset so the
        # metadata rows line up positionally with the score rows.
        non_emb_cols = [col for col in df.columns if not self._is_embedding_column(col)]
        export_df = pd.concat([df[non_emb_cols].reset_index(drop=True), scores_df], axis=1)
        export_df['mean_score'] = mean_score

        return export_df

In [None]:
# Score every query article against the support-set prototypes.
# With similarity='dot' the scores are min-max scaled over this whole call.
scorer = ProtoScorer(prototype_df=df_support_embed, similarity='dot')
df_scored = scorer.score_dataframe(df=df_query_embed)
df_scored

## Unsupervised method: DBSCAN

We cluster articles on `days_since_start` and `mean_score` (both standardized within each country). This means:
+ Articles must be close in both time and score to be grouped together
+ To form a cluster, there must be at least `k` articles that are close to each other (here `k = 2`, DBSCAN's `min_samples`)

In [None]:
import pandas as pd
import numpy as np

# Load the scored articles and keep those with mean_score >= 0.5.
# NOTE(review): 0.5 is a fixed cutoff, not a percentile. If the top 10% of
# scores is what is wanted, compare against the 0.9 quantile of mean_score.
df_scored = pd.read_csv('temp/df_scored.csv')
df_clusters = (
    df_scored.loc[
        df_scored['mean_score'] >= 0.5,
        ['isocode', 'date', 'title', 'mean_score'],
    ]
    .sort_values(by=['isocode', 'date'])
    .reset_index(drop=True)
)
df_clusters

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Step 1: Prepare data -- parse the dates and express them as day offsets
# from the earliest article in the filtered set.
df = df_clusters.assign(date_dt=lambda frame: pd.to_datetime(frame['date'].astype(str)))
df['days_since_start'] = (df['date_dt'] - df['date_dt'].min()).dt.days

# Step 2: Run DBSCAN separately for each isocode
results = []

for isocode, group in df.groupby('isocode'):
    # Countries with a single article are left out of the result entirely
    if len(group) >= 2:
        # Feature matrix [time, score], standardized within the country
        features_scaled = StandardScaler().fit_transform(
            group[['days_since_start', 'mean_score']].to_numpy()
        )

        # Cluster labels are local to the country; -1 marks noise points
        db = DBSCAN(eps=0.5, min_samples=2)
        group = group.assign(cluster=db.fit_predict(features_scaled))

        results.append(group)

# Step 3: Combine the per-country results
df_clustered_final = pd.concat(results, ignore_index=True)
df_clustered_final

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# One figure per country
for isocode, group in df_clustered_final.groupby('isocode'):
    plt.figure(figsize=(12, 6))

    # Points labelled -1 (unclustered) are drawn in grey
    is_noise = group['cluster'] == -1
    noise = group[is_noise]
    plt.scatter(noise['date_dt'], noise['mean_score'], color='grey', label='Noise', alpha=0.6)

    # Every real cluster gets its own colour, in order of first appearance
    clusters = group[~is_noise]
    for cluster_id, subcluster in clusters.groupby('cluster', sort=False):
        plt.scatter(
            subcluster['date_dt'],
            subcluster['mean_score'],
            label=f'Cluster {cluster_id}',
            alpha=0.8,
        )

    plt.title(f"{isocode} - mean score over time")
    plt.xlabel("Date")
    plt.ylabel("Mean score")
    plt.legend()
    plt.tight_layout()
    plt.show()

## Supervised method

In [None]:
import pandas as pd
import numpy as np

# Load the scored articles, parse the dates, and derive a 'YYYY-MM' month key
df = pd.read_csv('temp/df_scored.csv')
df = df.assign(date=pd.to_datetime(df['date'].astype(str)))
df['month'] = df['date'].dt.to_period('M').astype(str)

# Factory for "mean of the k largest values" aggregators
def topk_mean(k):
    """Return a callable that averages the ``k`` largest values of its input.

    The returned callable yields ``np.nan`` when the input holds fewer than
    ``k`` values.
    """
    return lambda x: np.nan if len(x) < k else np.mean(np.sort(x)[-k:])

# Named aggregations: output column = (input column, reducer)
agg_funcs = dict(
    avg_score=('mean_score', 'mean'),
    top1_score=('mean_score', 'max'),
    top3_avg_score=('mean_score', topk_mean(3)),
    top5_avg_score=('mean_score', topk_mean(5)),
    top20_avg_score=('mean_score', topk_mean(20)),
    min=('mean_score', 'min'),
    perc_75=('mean_score', lambda x: np.percentile(x, 75)),
    perc_50=('mean_score', lambda x: np.percentile(x, 50)),
    perc_25=('mean_score', lambda x: np.percentile(x, 25)),
    std=('mean_score', 'std'),
)

# One row per (country, month); the 'YYYY-MM' key is then replaced by an
# integer YYYYMM `period` column.
df_monthly = df.groupby(['isocode', 'month']).agg(**agg_funcs).reset_index()
df_monthly['period'] = df_monthly.pop('month').str.replace('-', '').astype(int)
df_monthly

In [None]:
# Load the coup labels and keep only the countries present in the monthly
# features, within the window 201701-202412 (both ends included).
df_target = pd.read_csv('input/df_target.csv')
isocodes_to_keep = df_monthly['isocode'].unique().tolist()
df_target = df_target[
    df_target['isocode'].isin(isocodes_to_keep)
    & df_target['period'].between(201701, 202412)
].reset_index(drop=True)
df_target

In [None]:
# Left-join the monthly features onto the targets: every target row is kept,
# and country-months without aggregated features get NaN feature values.
df_merged = df_target.merge(df_monthly, how='left', on=['isocode', 'period'])
df_merged

## Training a classifier using CatBoost

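+ Handles NaNs natively (for numerical features, CatBoost by default treats a missing value as smaller than every observed value, so a tree can split missing from non-missing)
+ Supports a validation set and early stopping to prevent overfitting (neither is used in the fit below)
+ Requires minimal data cleaning — it handles numerical, categorical, and missing data efficiently.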

In [None]:
from catboost import CatBoostClassifier

# Temporal hold-out: periods before 202401 train the model, later periods test it
train_df = df_merged.loc[df_merged['period'] < 202401].copy()
test_df = df_merged.loc[df_merged['period'] >= 202401].copy()

# Every column except the identifiers and the label is used as a feature
feature_cols = [
    col for col in df_merged.columns
    if col not in {'isocode', 'period', 'coup'}
]
X, y = train_df[feature_cols], train_df['coup']
X_test, y_test = test_df[feature_cols], test_df['coup']

# Shallow, slowly-learning CatBoost model with a fixed seed
model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.01,
    depth=3,
    random_seed=42,
    verbose=1,
)

model.fit(X, y)

In [None]:
from sklearn.metrics import precision_recall_curve

# Predicted probabilities (second column of predict_proba)
test_proba = model.predict_proba(X_test)[:, 1]

# F1 at every candidate threshold; the epsilon guards against 0/0.
# NOTE(review): the threshold is selected on the test set itself, so metrics
# computed with it on the same data are optimistic.
precision, recall, thresholds = precision_recall_curve(y_test, test_proba)
f1_numerator = 2 * precision * recall
f1_scores = f1_numerator / (precision + recall + 1e-8)

# Threshold with the highest F1
best_idx = np.argmax(f1_scores)
best_threshold, best_f1 = thresholds[best_idx], f1_scores[best_idx]

print(f"Best F1 threshold: {best_threshold:.3f}")
print(f"Best F1 score: {best_f1:.3f}")

In [None]:
from sklearn.metrics import classification_report

# Binarize the predicted probabilities at the F1-optimal threshold
test_preds = np.greater_equal(test_proba, best_threshold).astype(int)

# Per-class precision / recall / F1 on the test set
print("\nClassification Report with Optimal Threshold:")
print(classification_report(y_true=y_test, y_pred=test_preds))

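+ `Recall` of 0.67 on class 1.0 means the model catches two out of three coups in the test set.
+ `Precision` of 1.0 means there are no false alarms at this threshold.
+ Caveat: the threshold was chosen by maximizing F1 on this same test set, so both numbers are optimistic.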

In [None]:
# Results table: one row per test country-month with the label, the
# thresholded prediction, and the predicted probability.
results_df = test_df[['isocode', 'period']].assign(
    actual_coup=y_test.values,
    predicted_coup=test_preds,
    predicted_proba=test_proba,
)

# Show the predictions for the months in which a coup actually occurred
print("\nPredicted Coup Events:")
results_df.loc[results_df['actual_coup'] == 1]

## Feature importance

In [None]:
import matplotlib.pyplot as plt

importances = model.get_feature_importance()
feature_names = X.columns

# Pair each training feature with its importance, most important first
feat_imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp_df = feat_imp_df.sort_values(by='importance', ascending=False)

# Horizontal bars; the y-axis is inverted so the first (largest) row is on top
plt.figure(figsize=(8, 4))
plt.barh(y=feat_imp_df['feature'], width=feat_imp_df['importance'])
plt.gca().invert_yaxis()
plt.xlabel("Importance")
plt.title("Feature Importances (CatBoost)")
plt.tight_layout()
plt.show()