<a href="https://colab.research.google.com/github/ProfessorPatrickSlatraigh/cis9557__baseline/blob/main/cis9557_ForbesBillionairesNWperCapita.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>CIS9557 - Forbes Billionares Net Worth per Capita</h1>

Business Analytics: Billionaire Wealth Per Capita Analysis  
Course: Graduate Business Analytics  
Purpose: Analyze total billionaire net worth per capita by country (2010-2023)  
  
This program processes Forbes billionaire data and UN population data to identify  


In [None]:
import pandas as pd
import numpy as np
import re

In [19]:
"""
Revised code with:
1. Billionaire net worth converted to whole dollars (instead of billions)
2. Population data already converts from thousands to actual population
"""

def parse_net_worth(net_worth_str):
    """
    Parse net worth string (e.g., '2.0 B', '1.5 B') to numeric value in whole dollars.

    Parameters:
    -----------
    net_worth_str : str
        Net worth string in format like '2.0 B' or '1.5 B'

    Returns:
    --------
    float : Net worth value in whole dollars, or NaN if parsing fails
    """
    if pd.isna(net_worth_str):
        return np.nan

    # Remove whitespace and convert to uppercase
    net_worth_str = str(net_worth_str).strip().upper()

    # Match pattern: number followed by 'B' (billions)
    match = re.match(r'([0-9.]+)\s*B', net_worth_str)

    if match:
        # Convert billions to whole dollars
        return float(match.group(1)) * 1e9

    return np.nan


def load_billionaire_data(filepath):
    """
    Load and process billionaire data from Forbes CSV file.

    Parameters:
    -----------
    filepath : str
        Path to the billionaires CSV file

    Returns:
    --------
    pd.DataFrame : Processed billionaire data with parsed net worth values in dollars
    """
    print("Loading billionaire data...")
    df = pd.read_csv(filepath)

    print(f"Total records loaded: {len(df)}")
    print(f"Years available: {df['year'].min()} to {df['year'].max()}")

    # Parse net worth values (now in whole dollars)
    df['net_worth_dollars'] = df['net_worth'].apply(parse_net_worth)

    # Remove records with missing net worth
    df = df.dropna(subset=['net_worth_dollars'])

    print(f"Records with valid net worth: {len(df)}")

    return df


def load_population_data(filepath):
    """
    Load and process UN population data.

    Parameters:
    -----------
    filepath : str
        Path to the population CSV file

    Returns:
    --------
    pd.DataFrame : Processed population data with actual population values
    """
    print("\nLoading population data...")
    df = pd.read_csv(filepath, encoding='utf-8-sig')  # Handle BOM if present

    print(f"Total records loaded: {len(df)}")

    # Filter to include only records where ISO3_code is not blank
    df = df[df['ISO3_code'].notna() & (df['ISO3_code'] != '')]

    print(f"Records with valid ISO3_code: {len(df)}")
    print(f"Years available: {df['Time'].min()} to {df['Time'].max()}")

    # Population is in thousands, convert to actual population
    df['population'] = df['PopTotal'] * 1000

    return df


def load_country_mapping(filepath):
    """
    Load country name mapping table from CSV file.

    This function loads an external mapping file that contains the standardized
    country names between Forbes billionaire data and UN WPP population data.

    Parameters:
    -----------
    filepath : str
        Path to the country mapping CSV file

    Returns:
    --------
    pd.DataFrame : Country mapping table with columns:
                   - countryForbes: Country name in Forbes data
                   - countryWPP: Country name in UN WPP data
                   - ISO3: ISO3 country code
    """
    print("\nLoading country mapping table...")
    df_mapping = pd.read_csv(filepath)

    print(f"Mapping records loaded: {len(df_mapping)}")
    print(f"Countries with mappings: {df_mapping['countryForbes'].nunique()}")

    # Display the mappings for verification
    print("\nCountry Mappings:")
    print("-" * 80)
    for idx, row in df_mapping.iterrows():
        print(f"  {row['countryForbes']:30s} -> {row['countryWPP']:40s} [{row['ISO3']}]")

    return df_mapping


def aggregate_billionaire_wealth(df_billionaires, year_start=2010, year_end=2023):
    """
    Aggregate total billionaire net worth by country and year.

    Parameters:
    -----------
    df_billionaires : pd.DataFrame
        Billionaire data with net worth in whole dollars
    year_start : int
        Starting year for analysis (default: 2010)
    year_end : int
        Ending year for analysis (default: 2023)

    Returns:
    --------
    pd.DataFrame : Aggregated data with total net worth in dollars by country and year
    """
    print(f"\nAggregating billionaire wealth for years {year_start} to {year_end}...")

    # Filter for specified year range
    df_filtered = df_billionaires[
        (df_billionaires['year'] >= year_start) &
        (df_billionaires['year'] <= year_end)
    ].copy()

    print(f"Records in year range: {len(df_filtered)}")

    # Group by country and year, summing net worth
    df_aggregated = df_filtered.groupby(
        ['country_of_citizenship', 'year'],
        as_index=False
    )['net_worth_dollars'].sum()

    df_aggregated.rename(
        columns={
            'country_of_citizenship': 'country',
            'net_worth_dollars': 'total_net_worth_dollars'
        },
        inplace=True
    )

    print(f"Unique countries with billionaires: {df_aggregated['country'].nunique()}")
    print(f"Country-year combinations: {len(df_aggregated)}")

    return df_aggregated


def prepare_population_data(df_population, year_start=2010, year_end=2023):
    """
    Prepare population data for specified year range.

    Parameters:
    -----------
    df_population : pd.DataFrame
        Population data from UN with actual population values
    year_start : int
        Starting year for analysis (default: 2010)
    year_end : int
        Ending year for analysis (default: 2023)

    Returns:
    --------
    pd.DataFrame : Filtered population data
    """
    print(f"\nPreparing population data for years {year_start} to {year_end}...")

    # Filter for specified year range
    df_filtered = df_population[
        (df_population['Time'] >= year_start) &
        (df_population['Time'] <= year_end)
    ].copy()

    # Select relevant columns and rename for clarity
    df_filtered = df_filtered[['Location', 'ISO3_code', 'Time', 'population']].copy()
    df_filtered.rename(
        columns={
            'Location': 'country',
            'Time': 'year'
        },
        inplace=True
    )

    print(f"Country-year combinations: {len(df_filtered)}")
    print(f"Unique countries: {df_filtered['country'].nunique()}")

    return df_filtered


def enrich_billionaire_data_with_mapping(df_billionaires, df_mapping):
    """
    Enrich billionaire data with standardized country names using mapping table.

    This function applies the country mapping to the Forbes billionaire data,
    replacing Forbes country names with UN WPP standardized names and adding
    ISO3 codes for better data integration.

    Parameters:
    -----------
    df_billionaires : pd.DataFrame
        Aggregated billionaire data with Forbes country names
    df_mapping : pd.DataFrame
        Country mapping table with countryForbes, countryWPP, and ISO3 columns

    Returns:
    --------
    pd.DataFrame : Enriched billionaire data with standardized country names
    """
    print("\nEnriching billionaire data with country mapping...")

    # Create a dictionary for mapping Forbes names to WPP names
    forbes_to_wpp = dict(zip(df_mapping['countryForbes'], df_mapping['countryWPP']))
    forbes_to_iso3 = dict(zip(df_mapping['countryForbes'], df_mapping['ISO3']))

    # Create enriched dataframe
    df_enriched = df_billionaires.copy()

    # Store original country name
    df_enriched['country_forbes'] = df_enriched['country']

    # Apply mapping to get WPP country name
    df_enriched['country_wpp'] = df_enriched['country'].map(forbes_to_wpp)

    # Apply mapping to get ISO3 code
    df_enriched['iso3_code'] = df_enriched['country'].map(forbes_to_iso3)

    # For countries not in mapping table, keep original name in country_wpp
    df_enriched['country_wpp'] = df_enriched['country_wpp'].fillna(df_enriched['country'])

    # Count mapped vs unmapped countries
    mapped_countries = df_enriched[df_enriched['iso3_code'].notna()]['country_forbes'].nunique()
    unmapped_countries = df_enriched[df_enriched['iso3_code'].isna()]['country_forbes'].nunique()

    print(f"Countries with mapping applied: {mapped_countries}")
    print(f"Countries without mapping (using original name): {unmapped_countries}")

    if unmapped_countries > 0:
        print("\nUnmapped countries (showing first 20):")
        unmapped_list = df_enriched[df_enriched['iso3_code'].isna()]['country_forbes'].unique()
        for country in sorted(unmapped_list)[:20]:
            print(f"  - {country}")

    # Use the WPP country name as the primary country field for merging
    df_enriched['country'] = df_enriched['country_wpp']

    return df_enriched


def merge_and_calculate_per_capita(df_billionaires_enriched, df_population):
    """
    Merge enriched billionaire data with population data and calculate per capita ratio.

    Parameters:
    -----------
    df_billionaires_enriched : pd.DataFrame
        Enriched billionaire data with standardized country names and net worth in dollars
    df_population : pd.DataFrame
        Population data by country and year with actual population values

    Returns:
    --------
    pd.DataFrame : Merged data with per capita calculations
    """
    print("\nMerging datasets and calculating per capita ratios...")

    # Merge datasets on country and year
    df_merged = pd.merge(
        df_billionaires_enriched,
        df_population[['country', 'year', 'population']],
        on=['country', 'year'],
        how='inner'
    )

    print(f"Successfully merged records: {len(df_merged)}")
    print(f"Unique countries in merged data: {df_merged['country'].nunique()}")

    # Display sample of merged data for verification
    if len(df_merged) > 0:
        print("\nSample merged records:")
        print("-" * 80)
        sample_cols = ['year', 'country', 'country_forbes', 'iso3_code', 'total_net_worth_dollars', 'population']
        available_cols = [col for col in sample_cols if col in df_merged.columns]
        print(df_merged[available_cols].head(3).to_string(index=False))

    # Calculate per capita ratio
    # Net worth is already in dollars, so simply divide by population
    df_merged['net_worth_per_capita'] = (
        df_merged['total_net_worth_dollars'] / df_merged['population']
    )

    return df_merged


def identify_top_countries_by_year(df_merged, top_n=10):
    """
    Identify top N countries by billionaire wealth per capita for each year.

    Parameters:
    -----------
    df_merged : pd.DataFrame
        Merged data with per capita calculations
    top_n : int
        Number of top countries to identify (default: 10)

    Returns:
    --------
    pd.DataFrame : Top countries for each year
    """
    print(f"\nIdentifying top {top_n} countries for each year...")

    # Sort by year and per capita ratio
    df_sorted = df_merged.sort_values(
        ['year', 'net_worth_per_capita'],
        ascending=[True, False]
    )

    # Get top N countries for each year
    df_top = df_sorted.groupby('year').head(top_n).reset_index(drop=True)

    # Add rank within each year
    df_top['rank'] = df_top.groupby('year').cumcount() + 1

    return df_top


def format_output(df_top):
    """
    Format the output for presentation.

    Parameters:
    -----------
    df_top : pd.DataFrame
        Top countries by year

    Returns:
    --------
    pd.DataFrame : Formatted output
    """
    # Select columns for output
    output_cols = [
        'year',
        'rank',
        'country',
        'total_net_worth_dollars',
        'population',
        'net_worth_per_capita'
    ]

    # Add optional columns if they exist
    if 'country_forbes' in df_top.columns:
        output_cols.insert(3, 'country_forbes')
    if 'iso3_code' in df_top.columns:
        output_cols.insert(3, 'iso3_code')

    df_output = df_top[output_cols].copy()

    # Format columns for readability
    df_output['total_net_worth_dollars'] = df_output['total_net_worth_dollars'].round(0).astype(int)
    df_output['population'] = df_output['population'].astype(int)
    df_output['net_worth_per_capita'] = df_output['net_worth_per_capita'].round(2)

    # Rename columns for clarity
    df_output.rename(
        columns={
            'total_net_worth_dollars': 'total_net_worth_usd',
            'net_worth_per_capita': 'net_worth_per_capita_usd'
        },
        inplace=True
    )

    return df_output


def generate_summary_statistics(df_top):
    """
    Generate summary statistics for the analysis.

    Parameters:
    -----------
    df_top : pd.DataFrame
        Top countries by year

    Returns:
    --------
    dict : Summary statistics
    """
    summary = {
        'total_years_analyzed': df_top['year'].nunique(),
        'years_range': f"{df_top['year'].min()} - {df_top['year'].max()}",
        'unique_countries_in_top_10': df_top['country'].nunique(),
        'countries_appearing_all_years': len(
            df_top.groupby('country').filter(
                lambda x: len(x) == df_top['year'].nunique()
            )['country'].unique()
        ),
        'max_per_capita_value': df_top['net_worth_per_capita'].max(),
        'max_per_capita_country_year': df_top.loc[
            df_top['net_worth_per_capita'].idxmax(),
            ['country', 'year']
        ].to_dict()
    }

    return summary


def main():
    """
    Main execution function for the billionaire per capita analysis.
    """
    print("=" * 80)
    print("BILLIONAIRE WEALTH PER CAPITA ANALYSIS (2010-2023)")
    print("Using External Country Mapping Table")
    print("=" * 80)

    # Define file paths
    # for local execution
    # billionaire_file = '/mnt/user-data/uploads/all_billionaires_1997_2024.csv'
    # population_file = '/mnt/user-data/uploads/WPP2024_TotalPopulationBySex.csv'
    # mapping_file = '/mnt/user-data/uploads/country_mapping.csv'

    # for Colab execution
    billionaire_file = '/content/all_billionaires_1997_2024.csv'
    population_file = '/content/WPP2024_TotalPopulationBySex.csv'
    mapping_file = '/content/country_mapping.csv'

    # Analysis parameters
    YEAR_START = 2010
    YEAR_END = 2023
    TOP_N = 10

    # Step 1: Load data files
    df_billionaires = load_billionaire_data(billionaire_file)
    df_population = load_population_data(population_file)
    df_mapping = load_country_mapping(mapping_file)

    # Step 2: Aggregate billionaire wealth by country and year
    df_billionaire_aggregated = aggregate_billionaire_wealth(
        df_billionaires,
        YEAR_START,
        YEAR_END
    )

    # Step 3: Prepare population data
    df_population_prepared = prepare_population_data(
        df_population,
        YEAR_START,
        YEAR_END
    )

    # Step 4: Enrich billionaire data with country mapping
    df_billionaire_enriched = enrich_billionaire_data_with_mapping(
        df_billionaire_aggregated,
        df_mapping
    )

    # Step 5: Merge and calculate per capita
    df_merged = merge_and_calculate_per_capita(
        df_billionaire_enriched,
        df_population_prepared
    )

    # Step 6: Identify top countries
    df_top = identify_top_countries_by_year(df_merged, TOP_N)

    # Step 7: Format output
    df_output = format_output(df_top)

    # Step 8: Generate summary statistics
    summary = generate_summary_statistics(df_top)

    # Display results
    print("\n" + "=" * 80)
    print("ANALYSIS RESULTS")
    print("=" * 80)

    print("\nSummary Statistics:")
    print(f"  Years analyzed: {summary['years_range']}")
    print(f"  Unique countries in top 10: {summary['unique_countries_in_top_10']}")
    print(f"  Countries appearing all years: {summary['countries_appearing_all_years']}")
    print(f"  Maximum per capita value: ${summary['max_per_capita_value']:,.2f}")
    print(f"  Country and year with max: {summary['max_per_capita_country_year']['country']} "
          f"({int(summary['max_per_capita_country_year']['year'])})")

    print("\n" + "=" * 80)
    print("TOP 10 COUNTRIES BY BILLIONAIRE WEALTH PER CAPITA (BY YEAR)")
    print("=" * 80)

    # Display results year by year
    for year in sorted(df_output['year'].unique()):
        print(f"\n{year}:")
        print("-" * 80)
        year_data = df_output[df_output['year'] == year]
        print(year_data.to_string(index=False))

    # Save results to CSV
    # for local execution
    # output_file = '/mnt/user-data/outputs/billionaire_per_capita_top10_by_year.csv'

    # for Colab execution
    output_file = '/content/billionaire_per_capita_top10_by_year.csv'

    df_output.to_csv(output_file, index=False)
    print(f"\n\nResults saved to: {output_file}")

    # Save summary by country
    country_summary = df_output.groupby('country').agg({
        'year': lambda x: f"{x.min()}-{x.max()}",
        'rank': 'mean',
        'net_worth_per_capita_usd': 'mean'
    }).reset_index()

    country_summary['avg_rank'] = country_summary['rank'].round(2)
    country_summary['avg_net_worth_per_capita_usd'] = country_summary['net_worth_per_capita_usd'].round(2)
    country_summary = country_summary[['country', 'year', 'avg_rank', 'avg_net_worth_per_capita_usd']]
    country_summary = country_summary.sort_values('avg_rank')

    # for local execution
    # summary_file = '/mnt/user-data/outputs/billionaire_per_capita_country_summary.csv'

    # for Colab execution
    summary_file = '/content/billionaire_per_capita_country_summary.csv'

    country_summary.to_csv(summary_file, index=False)
    print(f"Country summary saved to: {summary_file}")

    # Save detailed merged data for further analysis
    # for local execution
    # detailed_file = '/mnt/user-data/outputs/billionaire_per_capita_detailed_data.csv'

    # for Colab execution
    detailed_file = '/content/billionaire_per_capita_detailed_data.csv'

    df_merged_output = df_merged[[
        'year', 'country', 'country_forbes', 'iso3_code',
        'total_net_worth_dollars', 'population', 'net_worth_per_capita'
    ]].copy()

    df_merged_output['total_net_worth_dollars'] = df_merged_output['total_net_worth_dollars'].round(0).astype(int)
    df_merged_output['population'] = df_merged_output['population'].astype(int)
    df_merged_output['net_worth_per_capita'] = df_merged_output['net_worth_per_capita'].round(2)

    df_merged_output.to_csv(detailed_file, index=False)
    print(f"Detailed merged data saved to: {detailed_file}")

    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)

    return df_output, country_summary, df_merged



In [20]:
if __name__ == "__main__":
    df_results, df_summary, df_detailed = main()

BILLIONAIRE WEALTH PER CAPITA ANALYSIS (2010-2023)
Using External Country Mapping Table
Loading billionaire data...
Total records loaded: 34511
Years available: 1997 to 2024
Records with valid net worth: 34511

Loading population data...


  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath, encoding='utf-8-sig')  # Handle BOM if present


Total records loaded: 720210
Records with valid ISO3_code: 346020
Years available: 1950 to 2100

Loading country mapping table...
Mapping records loaded: 15
Countries with mappings: 15

Country Mappings:
--------------------------------------------------------------------------------
  Czech Republic                 -> Czechia                                  [CZE]
  Eswatini (Swaziland)           -> Eswatini                                 [SWZ]
  Hong Kong                      -> China, Hong Kong SAR                     [HKG]
  Macao                          -> China, Macao SAR                         [MAC]
  Macau                          -> China, Macao SAR                         [MAC]
  Russia                         -> Russian Federation                       [RUS]
  South Korea                    -> Dem. People's Republic of Korea          [KOR]
  St. Kitts and Nevis            -> Saint Kitts and Nevis                    [KNA]
  Swaziland                      -> Eswatini       