In [1]:
import csv
import math
import os
from collections import defaultdict, Counter
from typing import List, Dict

# --------- Helper Functions ---------
def is_number(value: str) -> bool:
    """Return True when ``value`` can be converted by ``float()``.

    Anything ``float()`` accepts counts as a number, which includes the
    special spellings such as "nan" and "inf".  Values that are not text at
    all -- e.g. the ``None`` that ``csv.DictReader`` uses for a cell missing
    from a short row -- are reported as non-numeric instead of raising.

    Args:
        value: The raw cell content to test.

    Returns:
        True if ``float(value)`` succeeds, False otherwise.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a numeric literal.
        # TypeError: not a string/number at all (None, list, ...).
        return False
    return True

def safe_float(value: str) -> float:
    """Convert ``value`` to a float, returning NaN when it cannot be converted.

    Only conversion failures are absorbed.  Other exceptions, notably
    ``KeyboardInterrupt`` and ``SystemExit``, propagate so a long-running
    cell can still be interrupted.

    Args:
        value: The raw cell content to convert.

    Returns:
        ``float(value)``, or ``float('nan')`` if the conversion fails.
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return float('nan')

# --------- Core Function: Descriptive Stats ---------
def describe_dataset(file_path: str):
    """Print per-column descriptive statistics for a CSV file.

    Each cell is classified on its own: text that parses as a finite float is
    added to the column's numeric sample, any other text is tallied as a
    non-numeric value.  A column that contains both kinds of cells therefore
    appears in both sections of the report.

    Numeric columns report count, mean, min, max and the population standard
    deviation (divides by N).  Non-numeric columns report count, number of
    distinct values and the most frequent value.

    Args:
        file_path: Path to a UTF-8 encoded CSV file whose first row is the header.

    Returns:
        None.  The report is written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    numeric_stats = defaultdict(list)         # column -> [float, ...]
    non_numeric_stats = defaultdict(Counter)  # column -> Counter of cell text

    # newline='' is what the csv module expects: it lets the reader handle
    # line endings itself, including newlines inside quoted fields.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        # Stream the rows instead of materialising the whole file in a list.
        for row in csv.DictReader(file):
            for col, val in row.items():
                # Ragged rows: DictReader stores surplus cells under the key
                # None and fills cells missing from a short row with None.
                # Neither belongs to a real column, so skip them.
                if col is None or val is None:
                    continue
                if is_number(val):
                    number = float(val)
                    # "nan"/"inf" parse as floats but would turn the mean,
                    # min/max and std dev into nan/inf; keep them as text.
                    if math.isfinite(number):
                        numeric_stats[col].append(number)
                        continue
                non_numeric_stats[col][val] += 1

    print(f"\n\n===== Descriptive Statistics for: {os.path.basename(file_path)} =====")

    # ---------- Numeric Columns ----------
    print("\n--- NUMERIC COLUMNS ---")
    for col, values in numeric_stats.items():
        # A column only exists here once a value was appended, so count >= 1.
        count = len(values)
        mean = sum(values) / count
        variance = sum((x - mean) ** 2 for x in values) / count
        stddev = math.sqrt(variance)

        print(f"\nColumn: {col}")
        print(f"  Count     : {count}")
        print(f"  Mean      : {mean:.2f}")
        print(f"  Min       : {min(values)}")
        print(f"  Max       : {max(values)}")
        print(f"  Std Dev   : {stddev:.2f}")

    # ---------- Non-Numeric Columns ----------
    print("\n--- NON-NUMERIC COLUMNS ---")
    for col, freq_counter in non_numeric_stats.items():
        count = sum(freq_counter.values())
        top_value, top_count = freq_counter.most_common(1)[0]

        print(f"\nColumn: {col}")
        print(f"  Count          : {count}")
        print(f"  Unique Values  : {len(freq_counter)}")
        print(f"  Most Frequent  : {top_value} (Count: {top_count})")


In [3]:
# Folder that holds the CSV exports.  Set the TASK04_DATA_DIR environment
# variable to point somewhere else instead of editing every path below.
DATA_DIR = os.environ.get(
    "TASK04_DATA_DIR",
    "C:/Users/unnat/OneDrive/Desktop/RA/Task_04_Descriptive_Stats/data/period_03",
)

files = [
    f"{DATA_DIR}/2024_fb_ads_president_scored_anon.csv",
    f"{DATA_DIR}/2024_fb_posts_president_scored_anon.csv",
    f"{DATA_DIR}/2024_tw_posts_president_scored_anon.csv",
]

for file_path in files:
    describe_dataset(file_path)




===== Descriptive Statistics for: 2024_fb_ads_president_scored_anon.csv =====

--- NUMERIC COLUMNS ---

Column: estimated_audience_size
  Count     : 246745
  Mean      : 556462.86
  Min       : 0.0
  Max       : 1000001.0
  Std Dev   : 409863.93

Column: estimated_impressions
  Count     : 246745
  Mean      : 45601.53
  Min       : 499.0
  Max       : 1000000.0
  Std Dev   : 136790.49

Column: estimated_spend
  Count     : 246745
  Mean      : 1061.29
  Min       : 49.0
  Max       : 474999.0
  Std Dev   : 4992.55

Column: scam_illuminating
  Count     : 246745
  Mean      : 0.07
  Min       : 0.0
  Max       : 1.0
  Std Dev   : 0.26

Column: election_integrity_Truth_illuminating
  Count     : 246745
  Mean      : 0.05
  Min       : 0.0
  Max       : 1.0
  Std Dev   : 0.22

Column: advocacy_msg_type_illuminating
  Count     : 246745
  Mean      : 0.55
  Min       : 0.0
  Max       : 1.0
  Std Dev   : 0.50

Column: issue_msg_type_illuminating
  Count     : 246745
  Mean      : 0.38


In [4]:
def describe_by_group(file_path: str, group_columns: List[str], max_groups: int = 5):
    """Print descriptive statistics for the first few groups of a CSV file.

    Rows are grouped by the tuple of their values in ``group_columns``.
    Groups are kept in order of first appearance in the file and only the
    first ``max_groups`` of them are reported.  Within a group every cell is
    classified on its own: text that parses as a finite float is added to the
    column's numeric sample ([NUM] lines, population standard deviation), any
    other text is tallied as a non-numeric value ([CAT] lines).

    If any of ``group_columns`` is not present in the file's header, a notice
    is printed and no statistics are produced.

    Args:
        file_path: Path to a UTF-8 encoded CSV file whose first row is the header.
        group_columns: Names of the columns whose values form the group key.
        max_groups: Maximum number of groups to report.

    Returns:
        None.  The report is written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    row_counts = {}         # group key -> number of rows in the group
    numeric_stats = {}      # group key -> {column -> [float, ...]}
    non_numeric_stats = {}  # group key -> {column -> Counter of cell text}

    # newline='' is what the csv module expects: it lets the reader handle
    # line endings itself, including newlines inside quoted fields.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        header = reader.fieldnames or []
        missing = [col for col in group_columns if col not in header]

        print(f"\n\n===== Grouped Statistics for: {os.path.basename(file_path)} | Grouped by: {group_columns} =====")

        if missing:
            # Dropping the absent column from the key would merge unrelated
            # rows under a misleading key, so report it and stop.
            print(f"\nSkipped: grouping column(s) not found in file: {missing}")
            return

        for row in reader:
            group_key = tuple(row[col] for col in group_columns)

            if group_key not in row_counts:
                # Only the first max_groups groups are displayed, so rows of
                # later groups do not need to be kept at all.
                if len(row_counts) >= max_groups:
                    continue
                row_counts[group_key] = 0
                numeric_stats[group_key] = defaultdict(list)
                non_numeric_stats[group_key] = defaultdict(Counter)

            row_counts[group_key] += 1
            for col, val in row.items():
                # Ragged rows: DictReader stores surplus cells under the key
                # None and fills cells missing from a short row with None.
                if col is None or val is None:
                    continue
                if is_number(val):
                    number = float(val)
                    # "nan"/"inf" would poison the statistics; keep as text.
                    if math.isfinite(number):
                        numeric_stats[group_key][col].append(number)
                        continue
                non_numeric_stats[group_key][col][val] += 1

    for group_key, n_rows in row_counts.items():
        print(f"\n--- Group: {group_key} (n={n_rows}) ---")

        for col, values in numeric_stats[group_key].items():
            # A column only exists here once a value was appended, so count >= 1.
            count = len(values)
            mean = sum(values) / count
            stddev = math.sqrt(sum((x - mean) ** 2 for x in values) / count)

            print(f"  [NUM] {col} → Count: {count}, Mean: {mean:.2f}, Min: {min(values)}, Max: {max(values)}, Std Dev: {stddev:.2f}")

        for col, freq_counter in non_numeric_stats[group_key].items():
            count = sum(freq_counter.values())
            top_value, top_count = freq_counter.most_common(1)[0]
            print(f"  [CAT] {col} → Count: {count}, Unique: {len(freq_counter)}, Most Frequent: {top_value} ({top_count})")


In [5]:
# Run groupby on each file
for file_path in files:
    describe_by_group(file_path, group_columns=["page_id"])

print("\n---\n")

# Combined page_id + ad_id grouping, only for files whose header has ad_id
for file_path in files:
    # Reading .fieldnames consumes just the header row of the file.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csv_file:
        header = csv.DictReader(csv_file).fieldnames or []

    if "ad_id" in header:
        describe_by_group(file_path, group_columns=["page_id", "ad_id"])
    else:
        print(f"\nSkipping {os.path.basename(file_path)}: no ad_id column")




===== Grouped Statistics for: 2024_fb_ads_president_scored_anon.csv | Grouped by: ['page_id'] =====

--- Group: ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230',) (n=33) ---
  [NUM] estimated_audience_size → Count: 33, Mean: 43636.36, Min: 30000.0, Max: 75000.0, Std Dev: 20680.57
  [NUM] estimated_impressions → Count: 33, Mean: 66908.09, Min: 499.0, Max: 374999.0, Std Dev: 95130.57
  [NUM] estimated_spend → Count: 33, Mean: 467.18, Min: 49.0, Max: 2249.0, Std Dev: 652.94
  [NUM] scam_illuminating → Count: 33, Mean: 0.00, Min: 0.0, Max: 0.0, Std Dev: 0.00
  [NUM] election_integrity_Truth_illuminating → Count: 33, Mean: 0.00, Min: 0.0, Max: 0.0, Std Dev: 0.00
  [NUM] advocacy_msg_type_illuminating → Count: 33, Mean: 1.00, Min: 1.0, Max: 1.0, Std Dev: 0.00
  [NUM] issue_msg_type_illuminating → Count: 33, Mean: 1.00, Min: 1.0, Max: 1.0, Std Dev: 0.00
  [NUM] attack_msg_type_illuminating → Count: 33, Mean: 0.00, Min: 0.0, Max: 0.0, Std Dev: 0.00
  [NUM] image_msg_type_i