In [1]:
pip install tomotopy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
import re
import os
import pandas as pd
import tomotopy as tp

china_file = "energy_narrative_C_cleaned.csv"
uk_file    = "energy_narrative_W_cleaned.csv"

text_col = "full_content"      
source_col = "source"          

optional_meta_cols = ["keyword"]  

# topic number
K_LIST = [10, 20, 30]

# training parameters
ITER = 800
LOG_EVERY = 100
SEED = 42

# output
OUT_DIR = "stm_like_outputs_4sources"
os.makedirs(OUT_DIR, exist_ok=True)

#  Read in data+merge
df_c = pd.read_csv(china_file)
df_w = pd.read_csv(uk_file)

# Add a media tag to each file
df_c["media"] = "China"
df_w["media"] = "UK"

df = pd.concat([df_c, df_w], ignore_index=True)

print("Loaded rows total:", len(df))
print("Columns:", list(df.columns))

# Necessary column inspection
for c in [text_col, source_col]:
    if c not in df.columns:
        raise ValueError(f"Column '{c}' not found. Please check your CSV files.")

# Metadata column: Mandatory inclusion of media+source, optional is added
meta_cols = ["media", source_col]
for c in optional_meta_cols:
    if c in df.columns:
        meta_cols.append(c)

print("Using metadata columns:", meta_cols)

df = df.dropna(subset=[text_col, source_col]).copy()
df[text_col] = df[text_col].astype(str)

#  Text preprocessing (one-time, non repetitive)
STOPWORDS = set("""
a an the and or but if while with without of to in on for from as by is are was were be been being
this that these those it its they their them we our you your i he she his her at into over under
not no do does did doing done can could would should will may might must
said says say according also more most one two new just about
""".split())

def tokenize(text: str):
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)      # romove URL
    text = re.sub(r"[^a-z\s]", " ", text)    # keep letters only
    toks = [t for t in text.split() if len(t) >= 3 and t not in STOPWORDS]
    return toks

def build_meta(row):
    parts = []
    for c in meta_cols:
        val = str(row.get(c, "NA")).strip()
        if val == "" or val.lower() == "nan":
            val = "NA"
        val = re.sub(r"\s+", "_", val)       # avoid space
        parts.append(f"{c}={val}")
    return " ".join(parts)

print("\nPrecomputing tokens + metadata strings (only once)...")
df["meta_str"] = df.apply(build_meta, axis=1)
df["tokens"] = df[text_col].apply(tokenize)

# Filter short text (reduce noise)
MIN_TOKENS = 30
df_use = df[df["tokens"].apply(len) >= MIN_TOKENS].copy()
print(f"Docs kept (len>= {MIN_TOKENS} tokens): {len(df_use)} / {len(df)}")

docs_tokens = df_use["tokens"].tolist()
docs_meta   = df_use["meta_str"].tolist()

# Group by metadata to calculate topic popularity
def parse_group(meta_str, key):
    m = re.search(rf"{key}=([^\s]+)", meta_str)
    return m.group(1) if m else "NA"

def compute_group_topic_prevalence(model, meta_list, group_key):
    """
    output DataFrame：
    group, n_docs, topic_0 ... topic_{K-1}
    """
    K = model.k
    group_thetas = {}

    for doc, meta_str in zip(model.docs, meta_list):
        g = parse_group(meta_str, group_key)
        theta = doc.get_topic_dist()
        group_thetas.setdefault(g, []).append(theta)

    rows = []
    for g, thetas in group_thetas.items():
        n = len(thetas)
        avg = [sum(t[k] for t in thetas) / n for k in range(K)]
        row = {"group": g, "n_docs": n}
        for k in range(K):
            row[f"topic_{k}"] = avg[k]
        rows.append(row)

    return pd.DataFrame(rows).sort_values("n_docs", ascending=False)

# Output requirement: Compare four newspapers together → Group by source
GROUP_KEY = source_col

#  run K = 10 / 20 / 30
for K in K_LIST:
    print("\n" + "="*70)
    print(f"Training DMR (STM-like) model with K={K}")
    print("="*70)

    model = tp.DMRModel(k=K, alpha=0.1, sigma=1.0, seed=SEED)

    # load file
    for toks, meta_str in zip(docs_tokens, docs_meta):
        model.add_doc(toks, metadata=meta_str)

    # train
    for i in range(0, ITER, LOG_EVERY):
        model.train(LOG_EVERY)
        print(f"K={K} | Iter {i+LOG_EVERY:4d}/{ITER} | ll_per_word={model.ll_per_word:.4f}")

    # ---------- output 1：top words ----------
    topn = 12
    topwords_path = os.path.join(OUT_DIR, f"K{K}_top_words.txt")
    with open(topwords_path, "w", encoding="utf-8") as f:
        f.write(f"DMR (STM-like) model | K={K}\n")
        f.write(f"Final ll_per_word: {model.ll_per_word:.6f}\n\n")
        for k in range(K):
            words = model.get_topic_words(k, top_n=topn)
            f.write(f"Topic {k}:\n")
            f.write(", ".join([w for w, _ in words]) + "\n\n")
    print(f"Saved top words -> {topwords_path}")

    # ---------- output 2：按 source（四报纸）topic prevalence ----------
    prev_df = compute_group_topic_prevalence(model, docs_meta, group_key=GROUP_KEY)
    prev_path = os.path.join(OUT_DIR, f"K{K}_topic_prevalence_by_{GROUP_KEY}.csv")
    prev_df.to_csv(prev_path, index=False, encoding="utf-8-sig")
    print(f"Saved topic prevalence -> {prev_path}")

    # ---------- output 3: top topics summary ----------
    summary_path = os.path.join(OUT_DIR, f"K{K}_group_top_topics_summary_by_{GROUP_KEY}.txt")
    topic_cols = [f"topic_{k}" for k in range(K)]
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(f"Group top topics summary | K={K} | grouped by {GROUP_KEY}\n\n")
        for _, row in prev_df.iterrows():
            g = row["group"]
            n_docs = int(row["n_docs"])
            top3 = sorted(topic_cols, key=lambda c: row[c], reverse=True)[:3]
            f.write(f"Group: {g} (n_docs={n_docs})\n")
            for tc in top3:
                k_id = int(tc.split('_')[1])
                top_words = ", ".join([w for w, _ in model.get_topic_words(k_id, top_n=6)])
                f.write(f"  {tc}: {row[tc]:.3f} | {top_words}\n")
            f.write("\n")
    print(f"Saved group summary -> {summary_path}")

print("\nAll done. Outputs are in:", OUT_DIR)


Loaded rows total: 7432
Columns: ['url', 'title', 'publish_date', 'source', 'keyword', 'full_content', 'cleaned_content', 'year', 'month', 'day', 'media']
Using metadata columns: ['media', 'source', 'keyword']

Precomputing tokens + metadata strings (only once)...
Docs kept (len>= 30 tokens): 7349 / 7432

Training DMR (STM-like) model with K=10


  model.train(LOG_EVERY)


K=10 | Iter  100/800 | ll_per_word=-8.5352
K=10 | Iter  200/800 | ll_per_word=-8.5074
K=10 | Iter  300/800 | ll_per_word=-8.4988
K=10 | Iter  400/800 | ll_per_word=-8.4925
K=10 | Iter  500/800 | ll_per_word=-8.4860
K=10 | Iter  600/800 | ll_per_word=-8.4811
K=10 | Iter  700/800 | ll_per_word=-8.4773
K=10 | Iter  800/800 | ll_per_word=-8.4751
Saved top words -> stm_like_outputs_4sources\K10_top_words.txt
Saved topic prevalence -> stm_like_outputs_4sources\K10_topic_prevalence_by_source.csv
Saved group summary -> stm_like_outputs_4sources\K10_group_top_topics_summary_by_source.txt

Training DMR (STM-like) model with K=20
K=20 | Iter  100/800 | ll_per_word=-8.5643
K=20 | Iter  200/800 | ll_per_word=-8.5196
K=20 | Iter  300/800 | ll_per_word=-8.5034
K=20 | Iter  400/800 | ll_per_word=-8.4950
K=20 | Iter  500/800 | ll_per_word=-8.4882
K=20 | Iter  600/800 | ll_per_word=-8.4823
K=20 | Iter  700/800 | ll_per_word=-8.4799
K=20 | Iter  800/800 | ll_per_word=-8.4766
Saved top words -> stm_like_o