In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# --- Settings ---------------------------------------------------------------
INPUT_FILE = "K20_theta.csv"          # CSV read below; must provide "year" and topic_1..topic_20 columns
OUT_DIR = "topic_time_series_named"   # every CSV/PNG produced by this notebook is written here
os.makedirs(OUT_DIR, exist_ok=True)   # safe to re-run: an existing directory is not an error

# Human-readable topic labels, listed in topic-ID order.
# The 1-based position of a label is its topic ID (label k names column "topic_k").
_TOPIC_LABELS = [
    "Green Finance",                 # 1
    "Expos & Tech Events",           # 2
    "Coal & Emissions Policy",       # 3
    "Green Deal Politics",           # 4
    "Global Macroeconomy",           # 5
    "Renewables & Hydrogen",         # 6
    "Biodiversity & Water",          # 7
    "China–Africa Cooperation",      # 8
    "German Politics",               # 9
    "CPC Governance",                # 10
    "Activism & Media Boilerplate",  # 11
    "Low-Carbon Policy",             # 12
    "Multilateralism",               # 13
    "BRI Middle East Ties",          # 14
    "EVs & Batteries",               # 15
    "US Politics & China–US",        # 16
    "Australian Politics",           # 17
    "Nuclear & Steel",               # 18
    "COP & Paris Talks",             # 19
    "Oil & Gas Transition",          # 20
]

# Topic ID -> topic name
topic_name_map = dict(enumerate(_TOPIC_LABELS, start=1))

# Read the per-row topic table.
df = pd.read_csv(INPUT_FILE)

# Clean the year column in one pass:
#   1. coerce to numeric (unparseable values become NaN),
#   2. drop rows whose year could not be parsed,
#   3. cast the survivors to int,
#   4. keep only 2020-2025 (both ends inclusive).
df = (
    df.assign(year=pd.to_numeric(df["year"], errors="coerce"))
      .dropna(subset=["year"])
      .astype({"year": int})
      .loc[lambda frame: frame["year"].between(2020, 2025)]
)

# Raw topic column names (topic_1 .. topic_20) and their readable replacements.
topic_cols = [f"topic_{topic_id}" for topic_id in range(1, 21)]
rename_dict = {
    col: topic_name_map[topic_id]
    for topic_id, col in enumerate(topic_cols, start=1)
}

# Yearly mean of every topic column, with columns relabelled by topic name.
# `year` stays a regular column (as_index=False) so it can be plotted directly.
ts = (
    df.groupby("year", as_index=False)[topic_cols]
      .mean()
      .rename(columns=rename_dict)
)

# Persist the aggregated table; utf-8-sig writes a BOM-prefixed UTF-8 file.
ts_csv_path = os.path.join(OUT_DIR, "topic_time_series_by_year_named.csv")
ts.to_csv(ts_csv_path, index=False, encoding="utf-8-sig")

# Plot: one PNG per topic, showing its yearly mean prevalence (titled by topic name).

# Years present in the aggregate. Pinning the x ticks to these avoids matplotlib's
# automatic ticks landing on fractional "years" when only a few years are plotted.
year_ticks = sorted(ts["year"].unique())

for topic_name in topic_name_map.values():
    fig, ax = plt.subplots(figsize=(7, 4))
    try:
        ax.plot(ts["year"], ts[topic_name], marker="o")

        ax.set_title(f"{topic_name} over time")
        ax.set_xlabel("Year")
        ax.set_ylabel("Mean topic prevalence")
        ax.set_xticks(year_ticks)
        ax.grid(alpha=0.3)
        fig.tight_layout()

        # File-system friendly name: lower-case, spaces -> "_", "&" -> "and", en dash -> "-".
        fname = topic_name.lower().replace(" ", "_").replace("&", "and").replace("–", "-")
        fig.savefig(
            os.path.join(OUT_DIR, f"{fname}_time_series.png"),
            dpi=200
        )
    finally:
        # Always release the figure, even if plotting or saving failed, so a
        # failure part-way through the loop does not leave figures open.
        plt.close(fig)

print("Done. Outputs saved to:", OUT_DIR)


Done. Outputs saved to: topic_time_series_named


In [4]:
# Plot two grouped figures, each overlaying five topics on shared axes

import os
import matplotlib.pyplot as plt

# First set of topics drawn together in a single figure.
# Each entry must match a column name of `ts` exactly (including the en dash in
# "China–Africa Cooperation"); plot_topics_one_fig raises ValueError otherwise.
# NOTE(review): the rationale for which topics go in which group is not stated here — confirm.
GROUP_A = [
    "Biodiversity & Water",
    "China–Africa Cooperation",
    "Global Macroeconomy",
    "Green Finance",
    "Oil & Gas Transition"
]

# Second set of topics drawn together in a single figure (same naming requirement as above).
GROUP_B = [
    "Coal & Emissions Policy",
    "COP & Paris Talks",
    "EVs & Batteries",
    "Expos & Tech Events",
    "Low-Carbon Policy"
]

def plot_topics_one_fig(ts, topics, title, save_path):
    """Plot several topic series against year on one set of axes and save the figure.

    Parameters
    ----------
    ts : pandas.DataFrame
        Table with a "year" column and one column per name in ``topics``.
    topics : iterable of str
        Column names of ``ts`` to draw, one line each; also used as legend labels.
    title : str
        Figure title.
    save_path : str or path-like
        Destination handed to ``Figure.savefig``.

    Raises
    ------
    ValueError
        If ``topics`` is empty, if any requested column is absent from ``ts``,
        or if the requested columns hold no non-missing values at all.
    KeyError
        If ``ts`` has no "year" column.
    """
    topics = list(topics)
    if not topics:
        raise ValueError("No topics given to plot")

    # check columns exist
    missing = [t for t in topics if t not in ts.columns]
    if missing:
        raise ValueError(f"Missing topic columns in ts: {missing}")

    # Fail early with a clear message: with nothing but missing values the
    # y-limits below would be NaN and matplotlib would reject them.
    values = ts[topics]
    if not values.notna().any().any():
        raise ValueError(f"No non-missing values to plot for topics: {topics}")

    years = ts["year"]

    # y-range: raw mean prevalence padded by 12% of the data span on each side
    # (fixed 0.005 pad when every value is identical).
    ymin = values.min().min()
    ymax = values.max().max()
    pad = (ymax - ymin) * 0.12 if ymax > ymin else 0.005

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # plot lines
        for t in topics:
            ax.plot(years, ts[t], marker="o", linewidth=2.2, markersize=5, label=t)

        ax.set_ylim(ymin - pad, ymax + pad)

        ax.set_title(title, fontsize=14)
        ax.set_xlabel("Year", fontsize=12)
        ax.set_ylabel("Mean topic prevalence", fontsize=12)
        # one tick per year present in the data
        ax.set_xticks(sorted(years.unique()))
        ax.grid(alpha=0.25)

        # legend outside to avoid covering lines
        ax.legend(loc="center left", bbox_to_anchor=(1.02, 0.5), frameon=True, fontsize=10)

        fig.tight_layout()
        # bbox_inches="tight" keeps the outside legend inside the saved image
        fig.savefig(save_path, dpi=220, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

# Output locations for the two grouped figures.
groupA_path = os.path.join(OUT_DIR, "groupA_5topics_mean_prevalence.png")
groupB_path = os.path.join(OUT_DIR, "groupB_5topics_mean_prevalence.png")

# (topics, figure title, destination) for each grouped figure, drawn in order.
group_specs = [
    (GROUP_A, "Topic Trends (Mean Prevalence) — Group A", groupA_path),
    (GROUP_B, "Topic Trends (Mean Prevalence) — Group B", groupB_path),
]

for group_topics, group_title, group_path in group_specs:
    plot_topics_one_fig(ts, group_topics, group_title, group_path)

# Report where the figures were written, one path per line.
print("Saved:", groupA_path, groupB_path, sep="\n")


Saved:
topic_time_series_named\groupA_5topics_mean_prevalence.png
topic_time_series_named\groupB_5topics_mean_prevalence.png
