In [1]:
pip install tomotopy

Note: you may need to restart the kernel to use updated packages.


In [6]:
import re
import os
import pandas as pd
import tomotopy as tp

# --- Configuration ---
CHINA_FILE = "energy_narrative_C_cleaned.csv"
UK_FILE    = "energy_narrative_W_cleaned.csv"
TEXT_COL   = "full_content"
SOURCE_COL = "source"

# Experimental Parameters
K_LIST     = [10, 20, 30]  # Number of topics to test per dataset
ITER       = 800           # Training iterations
SEED       = 42            # For reproducibility
OUT_DIR    = "narrative_analysis_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Basic English Stopwords
STOPWORDS = set("a an the and or but if while with without of to in on for from as by is are was were be been being this that these those it its they their them we our you your i he she his her at into over under not no do does did doing done can could would should will may might must said says say according also more most one two new just about".split())

# --- Utility Functions ---

def tokenize(text: str):
    """Clean text, remove URLs/non-alpha, and filter stopwords."""
    text = str(text).lower()
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    return [t for t in text.split() if len(t) >= 3 and t not in STOPWORDS]

def run_topic_model(df_input, dataset_label, k_val):
    """
    Trains a DMR model for a specific dataset and K value.
    Returns a list of dictionaries containing topic metadata.
    """
    print(f"PROCESS: Dataset={dataset_label} | K={k_val} | Docs={len(df_input)}")
    
    # Preprocessing: Tokenize and filter short documents
    df_temp = df_input.copy()
    df_temp["tokens"] = df_temp[TEXT_COL].apply(tokenize)
    df_temp = df_temp[df_temp["tokens"].apply(len) >= 30] 
    
    # Initialize DMR Model
    model = tp.DMRModel(k=k_val, seed=SEED)
    for row in df_temp.itertuples():
        # Metadata allows the model to learn source-specific distributions
        meta = f"source={getattr(row, SOURCE_COL)}"
        model.add_doc(row.tokens, metadata=meta)
    
    # Training
    model.train(ITER)
    
    # Extract Topic Results
    topic_results = []
    for k in range(k_val):
        # Get top 15 words for each topic
        words = ", ".join([w for w, _ in model.get_topic_words(k, top_n=15)])
        topic_results.append({
            "Dataset_Type": dataset_label,
            "Topic_Count_K": k_val,
            "Topic_ID": k,
            "Top_Keywords": words,
            "Log_Likelihood": model.ll_per_word
        })
    return topic_results

# --- Main Execution Logic ---

# 1. Load and Tag Data
print("Loading data files...")
df_c = pd.read_csv(CHINA_FILE)
df_w = pd.read_csv(UK_FILE)

df_c["region"] = "China"
df_w["region"] = "UK"

# Create the merged dataset
df_merged = pd.concat([df_c, df_w], ignore_index=True)

# Define the three experimental arms
tasks = [
    (df_c, "China_Only"),
    (df_w, "UK_Only"),
    (df_merged, "Merged_Global")
]

master_data = []

# 2. Run Nested Loops (Dataset x K-Value)
for dataframe, label in tasks:
    print(f"\n>>> Starting analysis for: {label}")
    for k in K_LIST:
        results = run_topic_model(dataframe, label, k)
        master_data.extend(results)

# 3. Consolidation and Output
summary_df = pd.DataFrame(master_data)

# Export the Master Comparison Table
master_output_path = os.path.join(OUT_DIR, "master_topic_comparison.csv")
summary_df.to_csv(master_output_path, index=False, encoding="utf-8-sig")

# Export separate files for each K for easier side-by-side reading
for k in K_LIST:
    k_subset = summary_df[summary_df["Topic_Count_K"] == k]
    k_subset.to_csv(os.path.join(OUT_DIR, f"comparison_table_K{k}.csv"), index=False, encoding="utf-8-sig")

print(f"\n" + "="*50)
print("ANALYSIS COMPLETE")
print(f"Master Table: {master_output_path}")
print(f"K-Specific Tables saved in: {OUT_DIR}")
print("="*50)

Loading data files...

>>> Starting analysis for: China_Only
PROCESS: Dataset=China_Only | K=10 | Docs=5315


  model.train(ITER)


PROCESS: Dataset=China_Only | K=20 | Docs=5315


  model.train(ITER)


PROCESS: Dataset=China_Only | K=30 | Docs=5315


  model.train(ITER)



>>> Starting analysis for: UK_Only
PROCESS: Dataset=UK_Only | K=10 | Docs=2117


  model.train(ITER)


PROCESS: Dataset=UK_Only | K=20 | Docs=2117


  model.train(ITER)


PROCESS: Dataset=UK_Only | K=30 | Docs=2117


  model.train(ITER)



>>> Starting analysis for: Merged_Global
PROCESS: Dataset=Merged_Global | K=10 | Docs=7432


  model.train(ITER)


PROCESS: Dataset=Merged_Global | K=20 | Docs=7432


  model.train(ITER)


PROCESS: Dataset=Merged_Global | K=30 | Docs=7432

ANALYSIS COMPLETE
Master Table: narrative_analysis_outputs/master_topic_comparison.csv
K-Specific Tables saved in: narrative_analysis_outputs


  model.train(ITER)
