In [4]:
import pandas as pd

# Calculating Topic Prevalence
# Loads the per-document topic table and averages every topic column within
# each media group, giving one row per media value.
# (presumably the K20 theta file holds document-topic proportions — confirm)
df_topics = pd.read_csv("D:/SOSC314/Project/K20_theta.csv")

# Only columns whose name begins with "topic_" are treated as topic columns.
topic_cols = list(filter(lambda name: name.startswith("topic_"), df_topics.columns))

# as_index=False keeps "media" as an ordinary column instead of the index.
prevalence_df = df_topics.groupby("media", as_index=False)[topic_cols].mean()

print(prevalence_df)


   media   topic_1   topic_2   topic_3   topic_4   topic_5   topic_6  \
0  China  0.026723  0.087898  0.023349  0.022145  0.046317  0.148772   
1     UK  0.008998  0.004890  0.166338  0.017377  0.031013  0.025022   

    topic_7   topic_8   topic_9  ...  topic_11  topic_12  topic_13  topic_14  \
0  0.028003  0.023337  0.010774  ...  0.013338  0.137916  0.049250  0.042998   
1  0.023993  0.003795  0.018610  ...  0.202805  0.006794  0.027404  0.008197   

   topic_15  topic_16  topic_17  topic_18  topic_19  topic_20  
0  0.117602  0.021405  0.003147  0.015383  0.095176  0.020204  
1  0.012513  0.030254  0.190541  0.010891  0.076827  0.130768  

[2 rows x 21 columns]


In [12]:
# Comparing Narrative Difference
#
# Puts the China and UK mean prevalences from `prevalence_df` side by side per
# topic, ranks topics by the size of the gap, attaches the topic names and top
# words from the naming workbook, and writes the table to an Excel file.

# Long form: one row per (media, topic) pair with its mean prevalence.
long_df = prevalence_df.melt(
    id_vars="media",
    value_vars=topic_cols,
    var_name="topic",
    value_name="prevalence",
)

china_df = long_df[long_df["media"] == "China"]
uk_df = long_df[long_df["media"] == "UK"]

# Join on topic; the columns shared by both sides ("media", "prevalence")
# receive the "_china" / "_uk" suffixes.
compare_df = china_df.merge(uk_df, on="topic", suffixes=("_china", "_uk"))

# comparing the difference
# Positive values: higher mean prevalence in China; negative: higher in UK.
compare_df["difference"] = (
    compare_df["prevalence_china"] - compare_df["prevalence_uk"]
)

# Largest absolute gap first, regardless of direction.
compare_df = compare_df.sort_values(by="difference", key=abs, ascending=False)

# merge and output
# Integer topic id used as the join key against the naming workbook.
# regex=False: "topic_" is a literal prefix, and the default for `regex`
# differs between pandas versions, so it is stated explicitly.
compare_df["topic_id"] = (
    compare_df["topic"].str.replace("topic_", "", regex=False).astype(int)
)

df_words = pd.read_excel("D:/SOSC314/Project/K20_topics_with_names.xlsx")

# The left join below keeps every topic even when the workbook has no row for
# it, which would leave its name and top words empty; report that up front
# instead of letting it pass silently.
unmatched_topics = compare_df.loc[
    ~compare_df["topic_id"].isin(df_words["Topic_ID"]), "topic"
].tolist()
if unmatched_topics:
    print(f"Warning: no entry in the naming workbook for {unmatched_topics}")

final_df = compare_df.merge(
    df_words,
    left_on="topic_id",
    right_on="Topic_ID",
    how="left",
)

# arranging orders
# Select the report columns and rename in one chained step (no in-place
# mutation of the selected frame).
final_df = final_df[
    ["topic", "Topic_Name", "Highest_Prob",
     "prevalence_china", "prevalence_uk", "difference"]
].rename(columns={"Highest_Prob": "top_words"})

# output document
# NOTE(review): the workbook is written to D:\SOSC314, while the inputs are
# read from D:/SOSC314/Project — confirm this is the intended location.
final_df.to_excel(r"D:\SOSC314\STM_Topic_Prevalence_Comparison.xlsx", index=False)

print("Saved: STM_Topic_Prevalence_Comparison.xlsx")



Saved: STM_Topic_Prevalence_Comparison.xlsx
