For dataset: fb_ads


In [8]:
import csv

def load_csv(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        data = list(reader)
    return data

# Example usage
file_path = "2024_fb_ads_president_scored_anon.csv"
data = load_csv(file_path)

# Preview the first 2 rows
for row in data[:2]:
    print(row)

{'page_id': '4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230', 'ad_id': '0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc', 'ad_creation_time': '2024-10-21', 'bylines': 'Texas Organizing Project PAC', 'currency': 'USD', 'delivery_by_region': "{'Texas': {'spend': 249, 'impressions': 47499}}", 'demographic_distribution': "{'female_18-24': {'spend': 28, 'impressions': 5507}, 'male_45-54': {'spend': 14, 'impressions': 2757}, 'male_65+': {'spend': 3, 'impressions': 714}, 'female_65+': {'spend': 3, 'impressions': 725}, 'unknown_55-64': {'spend': 0, 'impressions': 21}, 'male_55-64': {'spend': 7, 'impressions': 1502}, 'female_55-64': {'spend': 7, 'impressions': 1520}, 'unknown_45-54': {'spend': 0, 'impressions': 40}, 'female_45-54': {'spend': 14, 'impressions': 2735}, 'male_18-24': {'spend': 21, 'impressions': 4055}, 'unknown_35-44': {'spend': 0, 'impressions': 69}, 'male_35-44': {'spend': 26, 'impressions': 5149}, 'female_35-44': {'spend': 28, 'impressions': 5

In [11]:
import csv
from collections import defaultdict, Counter
import statistics

def load_data(path):
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        return list(reader)

def summarize_column(col_values):
    numeric = [float(v) for v in col_values if v and isinstance(v, str) and v.replace('.', '', 1).isdigit()]

    if numeric:
        return {
            "count": len(numeric),
            "mean": round(statistics.mean(numeric), 2),
            "min": min(numeric),
            "max": max(numeric),
            "std_dev": round(statistics.stdev(numeric), 2) if len(numeric) > 1 else 0
        }
    else:
        return {
            "unique": len(set(col_values)),
            "most_common": Counter(col_values).most_common(1)[0]
        }

def summarize_dataset(data, label="Dataset"):
    print(f"\n--- Summary: {label} ---")
    for col in data[0]:
        col_vals = [row[col] for row in data]
        summary = summarize_column(col_vals)
        print(f"\nColumn: {col}")
        for k, v in summary.items():
            print(f"  {k}: {v}")

def group_by(data, keys):
    groups = defaultdict(list)
    for row in data:
        try:
            key = tuple(row[k] for k in keys)
            groups[key].append(row)
        except KeyError as e:
            print(f"Missing key: {e}")
    return groups

# === MAIN ===
data = load_data("2024_fb_ads_president_scored_anon.csv")
summarize_dataset(data)

# Group by page_id
print("\n--- Grouped by page_id ---")
group1 = group_by(data, ["page_id"])
for i, (key, group_rows) in enumerate(group1.items()):
    if i >= 2: break
    print(f"\nGroup: {key}")
    summarize_dataset(group_rows)

# Group by (page_id, ad_id)
print("\n--- Grouped by (page_id, ad_id) ---")
group2 = group_by(data, ["page_id", "ad_id"])
for i, (key, group_rows) in enumerate(group2.items()):
    if i >= 2: break
    print(f"\nGroup: {key}")
    summarize_dataset(group_rows)



--- Summary: Dataset ---

Column: page_id
  unique: 4334
  most_common: ('4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d', 55446)

Column: ad_id
  unique: 240582
  most_common: ('0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc', 1)

Column: ad_creation_time
  unique: 509
  most_common: ('2024-10-27', 8619)

Column: bylines
  unique: 3676
  most_common: ('HARRIS FOR PRESIDENT', 49788)

Column: currency
  unique: 18
  most_common: ('USD', 240443)

Column: delivery_by_region
  unique: 136401
  most_common: ('{}', 30368)

Column: demographic_distribution
  unique: 210080
  most_common: ('{}', 30368)

Column: estimated_audience_size
  count: 240581
  mean: 553515.98
  min: 0.0
  max: 1000001.0
  std_dev: 409515.57

Column: estimated_impressions
  count: 240581
  mean: 45856.77
  min: 499.0
  max: 1000000.0
  std_dev: 137211.86

Column: estimated_spend
  count: 240581
  mean: 1073.19
  min: 49.0
  max: 474999.0
  std_dev: 5036.38

Column: publisher_platfor

In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv("2024_fb_ads_president_scored_anon.csv")

# ===== Overall Descriptive Statistics =====
print("\n--- Descriptive Statistics ---")
print(df.describe(include='all'))

# ===== Unique Counts and Most Frequent Values =====
print("\n--- Unique Count and Most Frequent Values ---")
for col in df.columns:
    print(f"\nColumn: {col}")
    print(f"  Unique values: {df[col].nunique()}")
    if df[col].dtype == 'object':
        print(f"  Most frequent: {df[col].value_counts().idxmax()} ({df[col].value_counts().max()} times)")

# ===== Grouped by 'page_id' =====
numeric_cols = df.select_dtypes(include='number').columns

print("\n--- Grouped by page_id ---")
grouped_page = df.groupby("page_id")[numeric_cols].agg(['count', 'mean', 'min', 'max', 'std'])
print(grouped_page.head(3))

print("\n--- Grouped by (page_id, ad_id) ---")
grouped_page_ad = df.groupby(["page_id", "ad_id"])[numeric_cols].agg(['count', 'mean', 'min', 'max', 'std'])
print(grouped_page_ad.head(3))



--- Descriptive Statistics ---
                                                  page_id  \
count                                              246745   
unique                                               4475   
top     4d66f5853f0365dba032a87704a634f023d15babde973b...   
freq                                                55503   
mean                                                  NaN   
std                                                   NaN   
min                                                   NaN   
25%                                                   NaN   
50%                                                   NaN   
75%                                                   NaN   
max                                                   NaN   

                                                    ad_id ad_creation_time  \
count                                              246745           246745   
unique                                             246745              547   
t

In [13]:
import polars as pl

# Load dataset using Polars
df = pl.read_csv("2024_fb_ads_president_scored_anon.csv")

# Print general summary
print("\n--- Descriptive Statistics ---")
print(df.describe())

# Unique + Most frequent
print("\n--- Unique Count and Most Frequent Values ---")
for col in df.columns:
    print(f"\nColumn: {col}")
    print(f"  Unique: {df[col].n_unique()}")
    try:
        most_common = df.group_by(col).count().sort("count", descending=True)[0, col]
        print(f"  Most Frequent: {most_common}")
    except:
        print("  Most Frequent: N/A")

# Select only numeric columns
numeric_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype in [pl.Float64, pl.Int64]]

# Grouped by page_id
print("\n--- Grouped by page_id ---")
grouped_page = df.group_by("page_id").agg([
    pl.count(),
    *[pl.mean(col).alias(f"{col}_mean") for col in numeric_cols],
    *[pl.min(col).alias(f"{col}_min") for col in numeric_cols],
    *[pl.max(col).alias(f"{col}_max") for col in numeric_cols],
    *[pl.std(col).alias(f"{col}_std") for col in numeric_cols],
])
print(grouped_page.head(3))

# Grouped by page_id + ad_id
print("\n--- Grouped by (page_id, ad_id) ---")
grouped_page_ad = df.group_by(["page_id", "ad_id"]).agg([
    pl.count(),
    *[pl.mean(col).alias(f"{col}_mean") for col in numeric_cols],
    *[pl.min(col).alias(f"{col}_min") for col in numeric_cols],
    *[pl.max(col).alias(f"{col}_max") for col in numeric_cols],
    *[pl.std(col).alias(f"{col}_std") for col in numeric_cols],
])
print(grouped_page_ad.head(3))



--- Descriptive Statistics ---
shape: (9, 42)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ page_id   ┆ ad_id     ┆ ad_creati ┆ … ┆ womens_is ┆ incivilit ┆ freefair_ ┆ fraud_il │
│ ---       ┆ ---       ┆ ---       ┆ on_time   ┆   ┆ sue_topic ┆ y_illumin ┆ illuminat ┆ luminati │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ _illumina ┆ ating     ┆ ing       ┆ ng       │
│           ┆           ┆           ┆ str       ┆   ┆ tin…      ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ f64       ┆ f64       ┆ f64      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 246745    ┆ 246745    ┆ 246745    ┆ … ┆ 246745.0  ┆ 246745.0  ┆ 246745.0  ┆ 246745.0 │
│ null_coun ┆ 0         ┆ 0         ┆ 0     

  most_common = df.group_by(col).count().sort("count", descending=True)[0, col]


  Most Frequent: 4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d

Column: ad_id
  Unique: 246745
  Most Frequent: 35413ad3c317947b9878e90ca40b3dfbb7fd755c1eb2de69d0a01ce7ef8248d9

Column: ad_creation_time
  Unique: 547
  Most Frequent: 2024-10-27

Column: bylines
  Unique: 3791
  Most Frequent: HARRIS FOR PRESIDENT

Column: currency
  Unique: 18
  Most Frequent: USD

Column: delivery_by_region
  Unique: 141122
  Most Frequent: {}

Column: demographic_distribution
  Unique: 215622
  Most Frequent: {}

Column: estimated_audience_size
  Unique: 9
  Most Frequent: 1000001

Column: estimated_impressions
  Unique: 39
  Most Frequent: 499

Column: estimated_spend
  Unique: 44
  Most Frequent: 49

Column: publisher_platforms
  Unique: 9
  Most Frequent: ['facebook', 'instagram']

Column: illuminating_scored_message
  Unique: 26338
  Most Frequent: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855

Column: illuminating_mentions
  Unique: 278
  Most Frequent: []



  pl.count(),


shape: (3, 126)
┌────────────┬───────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ page_id    ┆ count ┆ estimated_ ┆ estimated_ ┆ … ┆ womens_is ┆ incivilit ┆ freefair_ ┆ fraud_ill │
│ ---        ┆ ---   ┆ audience_s ┆ impression ┆   ┆ sue_topic ┆ y_illumin ┆ illuminat ┆ uminating │
│ str        ┆ u32   ┆ ize_mean   ┆ s_mean     ┆   ┆ _illumina ┆ ating_std ┆ ing_std   ┆ _std      │
│            ┆       ┆ ---        ┆ ---        ┆   ┆ tin…      ┆ ---       ┆ ---       ┆ ---       │
│            ┆       ┆ f64        ┆ f64        ┆   ┆ ---       ┆ f64       ┆ f64       ┆ f64       │
│            ┆       ┆            ┆            ┆   ┆ f64       ┆           ┆           ┆           │
╞════════════╪═══════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ aee327e3d4 ┆ 2     ┆ 75000.0    ┆ 13499.0    ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0       │
│ 42a8a2aa2c ┆       ┆            ┆            ┆   ┆           ┆           

  pl.count(),


shape: (3, 127)
┌────────────┬────────────┬───────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ page_id    ┆ ad_id      ┆ count ┆ estimated_ ┆ … ┆ womens_is ┆ incivilit ┆ freefair_ ┆ fraud_ill │
│ ---        ┆ ---        ┆ ---   ┆ audience_s ┆   ┆ sue_topic ┆ y_illumin ┆ illuminat ┆ uminating │
│ str        ┆ str        ┆ u32   ┆ ize_mean   ┆   ┆ _illumina ┆ ating_std ┆ ing_std   ┆ _std      │
│            ┆            ┆       ┆ ---        ┆   ┆ tin…      ┆ ---       ┆ ---       ┆ ---       │
│            ┆            ┆       ┆ f64        ┆   ┆ ---       ┆ f64       ┆ f64       ┆ f64       │
│            ┆            ┆       ┆            ┆   ┆ f64       ┆           ┆           ┆           │
╞════════════╪════════════╪═══════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ e3ee066f4a ┆ 4ef033514f ┆ 1     ┆ 1.000001e6 ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
│ 12968ba948 ┆ 9a4803aa75 ┆       ┆            ┆   ┆           ┆           