# In [None]:
import pandas as pd

# CSV to summarise; the bare file name is resolved against the current
# working directory.
DATA_FILE = "2024_fb_posts_president_scored_anon.csv"

# Read the whole file into memory using pandas' default parsing options.
df = pd.read_csv(DATA_FILE)

# Split the column names by dtype: numeric columns vs. everything else.
numeric_cols = list(df.select_dtypes(include="number").columns)
non_numeric_cols = list(df.select_dtypes(exclude="number").columns)

# Overall numeric summary. describe() returns one column per input column,
# so transpose it to get one row per column before writing the CSV.
overall_numeric = df[numeric_cols].describe().T
overall_numeric.to_csv("summary_fb_posts_overall_numeric.csv")

# Overall non-numeric summary: one row per non-numeric column holding its
# number of distinct values and its most frequent value. Missing values are
# ignored for both.
NON_NUMERIC_SUMMARY_COLUMNS = [
    "column",
    "unique_count",
    "most_frequent",
    "most_freq_count",
]
non_numeric_summary = []

for col in non_numeric_cols:
    vc = df[col].value_counts(dropna=True)
    unique_count = df[col].nunique(dropna=True)

    # A column containing only missing values has no counts at all; record
    # None for the most frequent value and its frequency in that case.
    if vc.empty:
        most_freq, most_freq_count = None, None
    else:
        most_freq, most_freq_count = vc.idxmax(), vc.max()

    non_numeric_summary.append({
        "column": col,
        "unique_count": unique_count,
        "most_frequent": most_freq,
        "most_freq_count": most_freq_count,
    })

# Pass the column list explicitly so the header row is still written when
# there are no non-numeric columns; an empty list on its own would produce a
# header-less file.
pd.DataFrame(non_numeric_summary, columns=NON_NUMERIC_SUMMARY_COLUMNS).to_csv(
    "summary_fb_posts_overall_non_numeric.csv", index=False
)

# Per-Facebook_Id statistics: describe() over the numeric columns of each
# group. groupby() uses its default handling of missing keys, so rows
# without a Facebook_Id are not part of any group.
by_facebook_id = df.groupby("Facebook_Id")
grouped_fbid = by_facebook_id[numeric_cols].describe()
grouped_fbid.to_csv("summary_fb_posts_grouped_facebook_id.csv")

# Per-(Facebook_Id, post_id) statistics, limited to the 10 pairs with the
# most rows. groupby() drops rows where either key is missing, so such rows
# never form a pair.
top_fbid_postid = (
    df.groupby(["Facebook_Id", "post_id"])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .index
)

results = []

for fb_id, post_id in top_fbid_postid:
    sub_df = df[(df["Facebook_Id"] == fb_id) & (df["post_id"] == post_id)]
    # One row per numeric column; tag every row with the pair it describes
    # so the stacked output remains attributable.
    desc = sub_df[numeric_cols].describe().transpose()
    desc["Facebook_Id"] = fb_id
    desc["post_id"] = post_id
    results.append(desc)

# pd.concat() raises ValueError on an empty list, which happens when no row
# has both keys populated. Write a header-only file in that case, using the
# default describe() statistics for numeric data followed by the two key
# columns, so the layout matches the non-empty output.
if results:
    top_summary = pd.concat(results)
else:
    top_summary = pd.DataFrame(
        columns=[
            "count", "mean", "std", "min", "25%", "50%", "75%", "max",
            "Facebook_Id", "post_id",
        ]
    )

top_summary.to_csv("summary_fb_posts_grouped_facebook_id_post_id.csv")
