In [1]:
import pandas as pd
import numpy as np

# STEP 1: Load the dataset
# low_memory=False makes pandas read the file in a single pass, so each column
# gets one inferred dtype instead of chunk-by-chunk mixed types.
df = pd.read_csv("2024_fb_posts_president_scored_anon.csv", low_memory=False)
print("Original shape:", df.shape)

# STEP 2: Drop malformed and 100% missing columns
# errors='ignore' keeps this step safe when a listed column is already absent.
# Reassigning (instead of inplace=True) keeps the step a plain expression.
columns_to_drop = [
    "Sponsor Id", "Sponsor Name", "Sponsor Category",
    "illuminating_scored_messageelection_integrity_Truth_illuminating"
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# STEP 3: Convert Total Interactions to numeric (remove commas)
# Values that still are not numeric after removing commas become NaN.
if "Total Interactions" in df.columns:
    df["Total Interactions"] = pd.to_numeric(
        df["Total Interactions"].astype(str).str.replace(",", "", regex=False),
        errors='coerce',
    )

# STEP 4: Convert date/time columns to datetime
# errors='coerce' turns unparseable values into NaT; report how many values
# were lost that way so the data loss is visible instead of silent.
datetime_columns = ["Post Created"]
for col in datetime_columns:
    if col in df.columns:
        parsed = pd.to_datetime(df[col], errors='coerce')
        n_lost = int((parsed.isna() & df[col].notna()).sum())
        if n_lost:
            print(f"Warning: {n_lost} non-missing values in '{col}' could not be parsed as datetimes")
        df[col] = parsed

# Optional: combine date + time into full datetime
if {"Post Created Date", "Post Created Time"}.issubset(df.columns):
    df["Post Created Timestamp"] = pd.to_datetime(
        df["Post Created Date"] + " " + df["Post Created Time"],
        errors="coerce"
    )

# STEP 5: Handle 56% missing columns
for col in ["Video Share Status", "Video Length"]:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown")

# STEP 6: Normalize string columns
# Missing values are kept missing: astype(str) on its own turns NaN into the
# literal text "nan" ("NAN" after upper()), which would then be written to the
# CSV as if it were a real category. .where(notna) restores NaN for those rows.
for col in ["Page Category", "Page Admin Top Country"]:
    if col in df.columns:
        normalized = df[col].astype(str).str.strip().str.upper()
        df[col] = normalized.where(df[col].notna())

# STEP 7: Save cleaned version
df.to_csv("fb_posts_cleaned.csv", index=False)
print("Cleaned file saved as fb_posts_cleaned.csv")


Original shape: (19009, 56)


  df[col] = pd.to_datetime(df[col], errors='coerce')


Cleaned file saved as fb_posts_cleaned.csv


In [1]:
import csv
import math
from collections import defaultdict, Counter

def is_number(value):
    """Return True if ``float(value)`` succeeds, otherwise False.

    Anything the built-in ``float`` accepts counts as a number, which
    includes strings such as "1e3", "inf" and "nan".

    Only conversion failures are treated as "not a number": TypeError
    (e.g. None), ValueError (e.g. "abc") and OverflowError (an int too
    large for a float). Any other exception, such as KeyboardInterrupt,
    propagates instead of being swallowed.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def compute_numeric_stats(values):
    """Summarise a list of numbers.

    Returns a dict with the keys "count", "mean", "min", "max" and
    "stddev". The standard deviation is the population form: the summed
    squared deviations are divided by the number of values. For an empty
    input, "count" is 0 and every other entry is None.
    """
    n = len(values)
    if not n:
        return {"count": 0, "mean": None, "min": None, "max": None, "stddev": None}

    average = sum(values) / n
    squared_deviations = ((v - average) ** 2 for v in values)
    spread = math.sqrt(sum(squared_deviations) / n)
    return {
        "count": n,
        "mean": average,
        "min": min(values),
        "max": max(values),
        "stddev": spread,
    }

def compute_categorical_stats(values):
    """Summarise a list of categorical values.

    Returns a dict with the total number of values ("count"), the number
    of distinct values ("unique"), and the most frequent value together
    with its frequency ("most_common", "most_common_count"). For an empty
    input the most frequent value is None with a frequency of 0.
    """
    tally = Counter(values)
    if tally:
        top_value, top_count = tally.most_common(1)[0]
    else:
        top_value, top_count = None, 0
    return {
        "count": len(values),
        "unique": len(tally),
        "most_common": top_value,
        "most_common_count": top_count,
    }

def aggregate_by(rows, group_keys):
    """Bucket rows by the tuple of their values for ``group_keys``.

    Returns a ``defaultdict(list)`` that maps each key tuple to the rows
    that produced it, in the order the rows were given. A row that lacks
    one of the keys raises KeyError.
    """
    buckets = defaultdict(list)
    for record in rows:
        bucket_key = tuple([record[field] for field in group_keys])
        buckets[bucket_key].append(record)
    return buckets

def _summarize_columns(rows, columns, indent=""):
    """Print one stats line per column of ``rows``, prefixed by ``indent``.

    Empty-string and None cells are ignored. A column with no remaining
    values is skipped. A column is summarised numerically only when every
    remaining value passes ``is_number``; otherwise it is treated as
    categorical.
    """
    for col in columns:
        col_values = [row[col] for row in rows if row[col] not in ("", None)]
        if not col_values:
            continue
        if all(is_number(v) for v in col_values):
            stats = compute_numeric_stats([float(v) for v in col_values])
        else:
            stats = compute_categorical_stats(col_values)
        print(f"{indent}{col}: {stats}")


def analyze(rows, group_keys=None, group_limit=3):
    """Print per-column summary statistics for ``rows``.

    Parameters:
        rows: list of dict-like records that share the same keys; the
            column names are taken from the first record.
        group_keys: optional list of column names. When given, rows are
            bucketed with ``aggregate_by`` and each bucket is summarised
            separately; otherwise a single overall summary is printed.
        group_limit: maximum number of groups printed in grouped mode
            (groups are taken in first-seen order). ``None`` prints all.

    Returns None; all results are written to stdout. An empty ``rows``
    prints a short notice instead of raising IndexError.
    """
    if not rows:
        print("\nNo rows to analyze.")
        return

    columns = list(rows[0].keys())
    if group_keys:
        grouped = aggregate_by(rows, group_keys)
        print(f"\n=== GROUPED BY {group_keys} ===")
        for key, group in list(grouped.items())[:group_limit]:
            print(f"\nGroup {key}")
            _summarize_columns(group, columns, indent="  ")
    else:
        print("\n=== OVERALL SUMMARY ===")
        _summarize_columns(rows, columns)

# Load the cleaned CSV: every record becomes a dict of strings keyed by the
# header row. newline='' is the setting the csv module expects for file objects.
with open("fb_posts_cleaned.csv", encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    rows = [*reader]

# Run the analysis three ways: overall, grouped by Facebook id, and grouped by
# the (Facebook id, post id) pair.
analyze(rows)
analyze(rows, ["Facebook_Id"])
analyze(rows, ["Facebook_Id", "post_id"])



=== OVERALL SUMMARY ===
Facebook_Id: {'count': 19009, 'unique': 21, 'most_common': '32fc18da91029ff09bf74fe9887eace6b5d2145809d583f696e344530508b064', 'most_common_count': 9013}
post_id: {'count': 19009, 'unique': 19009, 'most_common': '8570b69695e00d8f06b12398ed525497e1712b5369c6fc2138fe98f69811c138', 'most_common_count': 1}
Page Category: {'count': 19009, 'unique': 7, 'most_common': 'PERSON', 'most_common_count': 9453}
Page Admin Top Country: {'count': 19009, 'unique': 2, 'most_common': 'US', 'most_common_count': 16280}
Post Created: {'count': 9378, 'unique': 8936, 'most_common': '9/22/2023 10:20', 'most_common_count': 6}
Post Created Date: {'count': 19009, 'unique': 425, 'most_common': '10/31/2024', 'most_common_count': 103}
Post Created Time: {'count': 19009, 'unique': 16102, 'most_common': '19:42:00', 'most_common_count': 7}
Type: {'count': 16544, 'unique': 9, 'most_common': 'Link', 'most_common_count': 7404}
Total Interactions: {'count': 19009, 'mean': 4190.439633857646, 'min': 