# Gender-Based Disparities in Sponsorship and Earnings

This notebook compares key outcome metrics across reported gender categories:

- `total_sponsors`
- `estimated_earnings`
- `followers` and `public_repos` (as proxies for visibility and output, respectively)

In [None]:
import pandas as pd
from data_loader import load_sample_data

In [None]:
def summarize_by_gender(df: pd.DataFrame, min_accounts: int = 5) -> pd.DataFrame:
    """Aggregate sponsorship, earnings and visibility metrics per gender.

    The input frame is copied, so the caller's data is never modified.
    A missing ``gender`` column, or missing values inside it, are reported
    under the ``"Unknown"`` category.

    Args:
        df: Account-level data. Must provide the ``username``,
            ``total_sponsors``, ``estimated_earnings``, ``followers`` and
            ``public_repos`` columns; ``gender`` is optional.
        min_accounts: Smallest ``n_accounts`` a gender category needs in
            order to be kept in the result.

    Returns:
        One row per retained gender category with the columns ``gender``,
        ``n_accounts``, ``mean_sponsors``, ``median_sponsors``,
        ``mean_earnings``, ``median_earnings``, ``mean_followers`` and
        ``mean_public_repos``, ordered by ``mean_sponsors`` descending.
    """
    frame = df.copy()

    # Normalise the grouping key so no row is dropped by the groupby.
    if "gender" in frame.columns:
        frame["gender"] = frame["gender"].fillna("Unknown")
    else:
        frame["gender"] = "Unknown"

    # output column -> (source column, aggregation)
    aggregations = {
        # "count" tallies non-null usernames, not raw rows.
        "n_accounts": ("username", "count"),
        "mean_sponsors": ("total_sponsors", "mean"),
        "median_sponsors": ("total_sponsors", "median"),
        "mean_earnings": ("estimated_earnings", "mean"),
        "median_earnings": ("estimated_earnings", "median"),
        "mean_followers": ("followers", "mean"),
        "mean_public_repos": ("public_repos", "mean"),
    }
    per_gender = frame.groupby("gender").agg(**aggregations).reset_index()

    # Discard categories too small to support a meaningful comparison.
    has_enough_accounts = per_gender["n_accounts"] >= min_accounts
    retained = per_gender[has_enough_accounts]
    return retained.sort_values("mean_sponsors", ascending=False)

## Load Data and Compute Gender Summary

In [None]:
# Load the dataset through the data_loader helper imported at the top.
df = load_sample_data()
# Build the per-gender summary; min_accounts=5 is passed explicitly as the
# minimum number of accounts a gender category needs to be retained.
summary = summarize_by_gender(df, min_accounts=5)

## Display Results

In [None]:
# Report the per-gender summary, or explain why there is nothing to show.
if summary.empty:
    print("No gender categories with sufficient number of accounts for analysis.")
else:
    pd.set_option("display.max_columns", 20)
    print("\nGender-based disparities in sponsorship and earnings:\n")
    # Columns to present, in reading order.
    display_columns = [
        "gender",
        "n_accounts",
        "mean_sponsors",
        "median_sponsors",
        "mean_earnings",
        "median_earnings",
        "mean_followers",
        "mean_public_repos",
    ]
    # A bare expression nested inside if/else is never rendered: notebooks
    # only auto-display a cell's last *top-level* expression, and a plain
    # script discards it entirely. Print the table explicitly instead.
    # to_string() emits every column without wrapping; the positional index
    # left over from filtering/sorting carries no meaning, so it is hidden.
    print(summary[display_columns].to_string(index=False))