In [None]:
import pandas as pd

# CSV file that every summary in this script is computed from.
DATA_FILE = "2024_fb_ads_president_scored_anon.csv"

# Read the whole file into a single DataFrame.
df = pd.read_csv(DATA_FILE)

# Split the column names by dtype: numeric columns vs. everything else.
numeric_cols = df.select_dtypes(include="number").columns.to_list()
non_numeric_cols = df.select_dtypes(exclude="number").columns.to_list()

# Overall numeric summary: describe() statistics for the numeric columns,
# transposed so each row is a column of the dataset and each column is a
# statistic.
numeric_summary = df[numeric_cols].describe().T
numeric_summary.to_csv("summary_pandas_overall_numeric.csv")

# Overall non-numeric summary: for each non-numeric column, record the number
# of distinct non-null values and the value that occurs most often.
def _describe_non_numeric(name, values):
    """Build the summary row for one non-numeric column.

    Missing values are ignored. When the column holds no non-null values,
    both the most frequent value and its count are reported as None.
    """
    counts = values.value_counts(dropna=True)
    if counts.empty:
        top_value, top_count = None, None
    else:
        top_value, top_count = counts.idxmax(), counts.max()
    return {
        "column": name,
        "unique_count": values.nunique(dropna=True),
        "most_frequent": top_value,
        "most_freq_count": top_count,
    }


non_numeric_summary = [
    _describe_non_numeric(name, df[name]) for name in non_numeric_cols
]

pd.DataFrame(non_numeric_summary).to_csv(
    "summary_pandas_overall_non_numeric.csv",
    index=False,
)

# Per-page summary: describe() statistics of the numeric columns within each
# page_id group (numeric columns only, for performance).
page_groups = df.groupby("page_id")
grouped_page_id = page_groups[numeric_cols].describe()
grouped_page_id.to_csv("summary_pandas_grouped_page_id.csv")

# Grouped by page_id and ad_id — numeric stats only, restricted to the 10
# (page_id, ad_id) combinations with the most rows.
top_combos = (
    df.groupby(["page_id", "ad_id"])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .index
)

results = []

for pid, aid in top_combos:
    # Rows belonging to exactly this (page_id, ad_id) pair.
    sub_df = df[(df["page_id"] == pid) & (df["ad_id"] == aid)]
    # One row per numeric column, one column per describe() statistic.
    desc = sub_df[numeric_cols].describe().transpose()
    # Tag each row with its group keys so rows remain identifiable after
    # the per-group tables are stacked into one file.
    desc["page_id"] = pid
    desc["ad_id"] = aid
    results.append(desc)

# Concatenate and save. pd.concat raises ValueError on an empty list, which
# happens when no (page_id, ad_id) group exists (e.g. every row is missing
# one of the keys); in that case write a header-only file instead of crashing.
if results:
    combined = pd.concat(results)
else:
    combined = pd.DataFrame(columns=["page_id", "ad_id"])
combined.to_csv("summary_pandas_grouped_page_id_ad_id.csv")