# Geographic Disparities in Sponsorship and Earnings

This notebook analyzes geographic disparities by grouping accounts by their `location` value (as provided, or as normalized, in the sample CSV) and computing the following for every location that has enough accounts to be reported:

- Mean and median `total_sponsors` and `estimated_earnings` by country
- Share of global sponsors and earnings captured by each country

In [None]:
import pandas as pd
from data_loader import load_sample_data

In [None]:
def summarize_by_location(df: pd.DataFrame, min_accounts: int = 5) -> pd.DataFrame:
    """Aggregate sponsorship and earnings statistics per location.

    Rows are grouped by the ``location`` column. Missing locations (or a
    missing ``location`` column altogether) are bucketed under ``"Unknown"``.
    The input frame is not modified; the function works on a copy.

    Args:
        df: Account-level data. Must contain ``username``,
            ``total_sponsors`` and ``estimated_earnings`` columns.
        min_accounts: Minimum number of accounts (non-null ``username``
            values) a location needs in order to appear in the result.

    Returns:
        One row per retained location with the columns ``location``,
        ``n_accounts``, mean/median/total of sponsors and earnings, and
        ``share_of_sponsors`` / ``share_of_earnings``. Shares are relative
        to the totals over *all* locations, including the ones dropped by
        ``min_accounts``, so they need not sum to 1. Rows are sorted by
        ``mean_sponsors`` in descending order.
    """
    df = df.copy()
    if "location" not in df.columns:
        df["location"] = "Unknown"

    df["location"] = df["location"].fillna("Unknown")

    group = (
        df.groupby("location")
        .agg(
            n_accounts=("username", "count"),
            mean_sponsors=("total_sponsors", "mean"),
            median_sponsors=("total_sponsors", "median"),
            mean_earnings=("estimated_earnings", "mean"),
            median_earnings=("estimated_earnings", "median"),
            total_sponsors=("total_sponsors", "sum"),
            total_earnings=("estimated_earnings", "sum"),
        )
        .reset_index()
    )

    # Take the global denominators BEFORE dropping small locations; otherwise
    # the "global" totals silently exclude them and every share is inflated.
    # A zero total falls back to 1.0 so the division below yields 0 shares
    # instead of NaN.
    total_sponsors_global = group["total_sponsors"].sum() or 1.0
    total_earnings_global = group["total_earnings"].sum() or 1.0

    # Filter out very small cells to focus on meaningful country-level
    # patterns. The explicit copy makes the column assignments below operate
    # on an independent frame (no SettingWithCopyWarning).
    group = group[group["n_accounts"] >= min_accounts].copy()

    group["share_of_sponsors"] = group["total_sponsors"] / total_sponsors_global
    group["share_of_earnings"] = group["total_earnings"] / total_earnings_global

    return group.sort_values("mean_sponsors", ascending=False)

## Load Data and Compute Geographic Summary

In [None]:
# Read the sample accounts, then roll them up per location. Locations with
# fewer than five accounts are left out of the summary.
df = load_sample_data()
summary = summarize_by_location(df=df, min_accounts=5)

## Display Results

In [None]:
# Report the per-location summary, or say so when no location met the
# minimum-accounts threshold.
if summary.empty:
    print("No locations with sufficient number of accounts for analysis.")
else:
    # Raise the column limit so the table is not truncated when printed.
    pd.set_option("display.max_columns", 20)
    print("\nGeographic disparities in sponsorship and earnings (by location):\n")
    # The table must be printed explicitly: a bare expression nested inside an
    # if/else is not auto-rendered by Jupyter (only the last top-level
    # expression of a cell is), and is never shown when run as a script.
    print(
        summary[
            [
                "location",
                "n_accounts",
                "mean_sponsors",
                "median_sponsors",
                "mean_earnings",
                "median_earnings",
                "share_of_sponsors",
                "share_of_earnings",
            ]
        ]
    )