In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import os

# --- Configuration ---
# CSV with the STM topic comparison table; edit this path to use another export.
INPUT_FILE = "STM_topic_comparison.csv"
# Folder that receives the rendered figures (created on demand).
OUTPUT_DIR = "stm_final_visuals"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Visual style: plain white seaborn background with a sans-serif font family.
sns.set_style("white")
plt.rcParams.update({'font.family': 'sans-serif'})

# 1. Load the comparison table and keep only the rows of the K=20 models
#    (K=20 is used as the optimal scale throughout the plots below).
df = pd.read_csv(INPUT_FILE)
df_k20 = df.loc[df['Topic_Count_K'].eq(20)].copy()

# Helper function to clean and split keywords
def get_word_list(text):
    """Split a keyword string into a list of individual words.

    Commas are treated like whitespace, so ``"a, b,c d"`` yields
    ``["a", "b", "c", "d"]``. Order and duplicates are preserved.

    Args:
        text: Keywords separated by commas and/or whitespace. Any non-string
            value (for example ``None`` or the float NaN that pandas produces
            for an empty CSV cell) is treated as "no keywords".

    Returns:
        list[str]: The keywords in their original order; an empty list when
        there are none.
    """
    if not isinstance(text, str):
        return []
    # str.split() without a separator already discards surrounding whitespace
    # and empty tokens, so no per-word strip() is required.
    return text.replace(",", " ").split()

# =================================================================
# PLOT 1: Topic Correlation Network (Semantic Connectivity)
# =================================================================
# Nodes are the topics of the merged K=20 model; two topics are linked when
# the Jaccard similarity of their keyword sets exceeds SIMILARITY_THRESHOLD.
print("Generating Topic Network Map...")

N_LABEL_KEYWORDS = 2         # keywords printed under each topic ID in a node label
SIMILARITY_THRESHOLD = 0.05  # minimum Jaccard similarity for an edge to be drawn
EDGE_WIDTH_SCALE = 15        # multiplier turning a similarity into a line width

merged_k20 = df_k20[df_k20['Dataset_Type'] == 'Merged_Global'].copy()

topics = merged_k20['Topic_ID'].tolist()
# Tokenize each topic's keywords once; labels and sets are both derived from it.
keyword_lists = [get_word_list(k) for k in merged_k20['Top_Keywords']]
keywords_sets = [set(words) for words in keyword_lists]

G = nx.Graph()

# Add nodes with labels: "T<id>" followed by the first N_LABEL_KEYWORDS keywords,
# one per line.
for topic, words in zip(topics, keyword_lists):
    label = f"T{topic}\n" + "\n".join(words[:N_LABEL_KEYWORDS])
    G.add_node(topic, label=label)

# Add edges based on Jaccard Similarity (keyword overlap)
for i in range(len(topics)):
    for j in range(i + 1, len(topics)):
        set_i, set_j = keywords_sets[i], keywords_sets[j]
        union = set_i | set_j
        # Two topics without any keywords have an empty union; treat them as
        # unrelated instead of dividing by zero.
        similarity = len(set_i & set_j) / len(union) if union else 0.0

        # Only draw edges for meaningful connections
        if similarity > SIMILARITY_THRESHOLD:
            G.add_edge(topics[i], topics[j], weight=similarity)

plt.figure(figsize=(15, 12))
# layout for the network
pos = nx.kamada_kawai_layout(G)

# Draw components
edges = G.edges()
# Scale edge width by similarity so stronger overlaps appear as thicker lines
weights = [G[u][v]['weight'] * EDGE_WIDTH_SCALE for u, v in edges]

nx.draw_networkx_nodes(G, pos, node_size=2500, node_color='#80cbc4', alpha=0.9)
nx.draw_networkx_edges(G, pos, width=weights, edge_color='#b0bec5', alpha=0.6)
nx.draw_networkx_labels(G, pos, labels=nx.get_node_attributes(G, 'label'), font_size=10, font_weight='bold')

plt.title("Topic Network Map: Semantic Connectivity of Global Energy Narratives (K=20)", fontsize=18, pad=20)
plt.axis('off')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "topic_network_map.png"), dpi=300)
# Close the figure so it does not stay open behind the next plot
plt.close()

# =================================================================
# PLOT 2: Region-Specific Mirror Plot (Tornado Plot)
# =================================================================
print("Generating Region-Specific Mirror Plot...")
# Split the K=20 rows into the two single-region models.
china_topics = df_k20.loc[df_k20['Dataset_Type'].eq('China_Only')].copy()
uk_topics = df_k20.loc[df_k20['Dataset_Type'].eq('UK_Only')].copy()

# Vocabulary of each region: every keyword appearing in any of its topics.
# These sets drive the exclusivity cross-check between the regions.
all_c_words = {word for kw in china_topics['Top_Keywords'] for word in get_word_list(kw)}
all_u_words = {word for kw in uk_topics['Top_Keywords'] for word in get_word_list(kw)}

def get_exclusivity_score(kw_str, other_region_set):
    """Count the distinct keywords of a topic that the other region never uses.

    Args:
        kw_str: Keyword string of one topic, tokenized with ``get_word_list``.
        other_region_set: Set of all keywords found in the other region's model.

    Returns:
        int: Number of distinct keywords in ``kw_str`` that are absent from
        ``other_region_set``.
    """
    own_words = set(get_word_list(kw_str))
    exclusive_words = own_words - other_region_set
    return len(exclusive_words)

# Score each topic by how many of its keywords are missing from the other
# region's vocabulary.
china_topics['Exclusivity'] = china_topics['Top_Keywords'].apply(lambda x: get_exclusivity_score(x, all_u_words))
uk_topics['Exclusivity'] = uk_topics['Top_Keywords'].apply(lambda x: get_exclusivity_score(x, all_c_words))

# Select top 8 most unique topics for each side
top_c = china_topics.nlargest(8, 'Exclusivity').copy()
top_u = uk_topics.nlargest(8, 'Exclusivity').copy()

top_c['Region'] = 'China'
top_u['Region'] = 'UK'
# Mirror effect: UK scores are negated so their bars extend to the left of zero
top_u['Exclusivity'] = -top_u['Exclusivity']

# China rows come first, so they occupy the lowest bar positions
combined = pd.concat([top_c, top_u])

plt.figure(figsize=(16, 10))
colors = ['#ef5350' if r == 'China' else '#42a5f5' for r in combined['Region']]
plt.barh(range(len(combined)), combined['Exclusivity'], color=colors, alpha=0.8)

y_labels = []
for i, (_, row) in enumerate(combined.iterrows()):
    label = f"{row['Region']} T{row['Topic_ID']}"
    y_labels.append(label)

    # Display the first six keywords on the bar, anchored just off the zero
    # line on the same side as the bar itself
    kws = ", ".join(get_word_list(row['Top_Keywords'])[:6])
    if row['Region'] == 'China':
        plt.text(0.5, i, f" {kws}", va='center', ha='left', fontsize=11, fontweight='bold', color='#b71c1c')
    else:
        plt.text(-0.5, i, f"{kws} ", va='center', ha='right', fontsize=11, fontweight='bold', color='#0d47a1')

plt.yticks(range(len(combined)), y_labels, fontsize=12)
plt.axvline(0, color='black', linewidth=1.5)
plt.title("Region-Specific Narrative Power: Mirror Comparison of Unique Topics (K=20)", fontsize=18, pad=30)
plt.xlabel("<- Specific to UK Media | Specific to China Media ->", fontsize=14)
# Keep the symmetric +/-18 window by default, but widen it when a bar would
# otherwise reach or exceed the axis limit and be clipped.
longest_bar = combined['Exclusivity'].abs().max() if not combined.empty else 0
x_limit = max(18, longest_bar + 1)
plt.xlim(-x_limit, x_limit)
plt.grid(axis='x', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "region_specific_mirror_plot.png"), dpi=300)
plt.close()

print(f"Success! Both plots saved in: {OUTPUT_DIR}")

Generating Topic Network Map...
Generating Region-Specific Mirror Plot...
Success! Both plots saved in: stm_final_visuals
