In [None]:
import pandas as pd

# Summarise the scored posts file into four CSV reports:
#   * overall statistics for numeric columns,
#   * overall frequency summary for non-numeric columns,
#   * numeric statistics grouped by "id",
#   * numeric statistics for the most frequent ("id", "quoteId") pairs.
DATA_FILE = "2024_tw_posts_president_scored_anon.csv"


def summarize_non_numeric(frame, columns):
    """Return one frequency-summary dict per column in *columns*.

    Each dict has the keys ``column``, ``unique_count``, ``most_frequent``
    and ``most_freq_count``. Missing values are ignored; a column with no
    non-missing values reports ``None`` for the last two keys.
    """
    rows = []
    for col in columns:
        counts = frame[col].value_counts(dropna=True)
        has_values = not counts.empty
        rows.append({
            "column": col,
            "unique_count": frame[col].nunique(dropna=True),
            "most_frequent": counts.idxmax() if has_values else None,
            "most_freq_count": counts.max() if has_values else None,
        })
    return rows


def top_id_quote_pairs(frame, top_n=10):
    """Return the index of the *top_n* most frequent (id, quoteId) pairs.

    Rows whose ``id`` or ``quoteId`` is missing are not counted, because
    ``groupby`` drops missing keys by default. The result may therefore be
    empty.
    """
    pair_sizes = frame.groupby(["id", "quoteId"]).size()
    return pair_sizes.sort_values(ascending=False).head(top_n).index


def describe_id_quote_pair(frame, value_cols, tid, qid):
    """Describe *value_cols* for the rows matching one (id, quoteId) pair.

    Returns a frame with one row per described column, the statistics as
    columns, plus ``id`` and ``quoteId`` columns identifying the pair.
    """
    subset = frame[(frame["id"] == tid) & (frame["quoteId"] == qid)]
    stats = subset[value_cols].describe().transpose()
    stats["id"] = tid
    stats["quoteId"] = qid
    return stats


def combine_pair_summaries(pieces, frame, value_cols):
    """Stack per-pair summaries into a single frame.

    ``pd.concat`` raises ``ValueError`` on an empty list, which happens when
    no (id, quoteId) pair exists (for example when ``quoteId`` is entirely
    missing). In that case an empty frame with the usual columns is returned
    so the report can still be written with a header.
    """
    if pieces:
        return pd.concat(pieces)
    template = frame[value_cols].describe().transpose()
    return template.iloc[:0].reindex(
        columns=[*template.columns, "id", "quoteId"]
    )


if __name__ == "__main__":
    # Load data
    df = pd.read_csv(DATA_FILE)

    # Identify numeric and non-numeric columns
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    non_numeric_cols = df.select_dtypes(exclude='number').columns.tolist()

    # Overall numeric summary
    df[numeric_cols].describe().transpose().to_csv(
        "summary_tw_posts_overall_numeric.csv"
    )

    # Overall non-numeric summary
    non_numeric_summary = summarize_non_numeric(df, non_numeric_cols)
    pd.DataFrame(non_numeric_summary).to_csv(
        "summary_tw_posts_overall_non_numeric.csv", index=False
    )

    # Grouped by id
    grouped_id = df.groupby("id")[numeric_cols].describe()
    grouped_id.to_csv("summary_tw_posts_grouped_id.csv")

    # Grouped by id and quoteId - top 10 only
    top_id_quote = top_id_quote_pairs(df, top_n=10)
    results = [
        describe_id_quote_pair(df, numeric_cols, tid, qid)
        for tid, qid in top_id_quote
    ]
    combine_pair_summaries(results, df, numeric_cols).to_csv(
        "summary_tw_posts_grouped_id_quoteId.csv"
    )
