In [2]:
import requests
import json
import pandas as pd
import geopandas as gpd
import os

# Get data from census.gov api:

## Racial Population Data Querying, Cleaning, and Formatting

In [None]:
API_KEY = ' '
API_KEY_END = f'&key={API_KEY}'

POPULATION_SCALES = {
    "State": "in=state:",
    "Counties": "for=county:*",
    "CongressionalDistricts": "for=congressional%20district:*"
}
POPULATION_SCALE_KEYS = {
    "State": "",
    "Counties": "counties_",
    "CongressionalDistricts": "cd_"
}
STATE_NAMES = {24: "Maryland", 45: "South Carolina"}
STATE_NAMES_LOWER = {24: "maryland", 45: "south_carolina"}
RACE_IDENTIFIERS = {
    "White": "S2101_C01_014E",
    "BlackAfricanAmerican": "S2101_C01_015E",
    "AmericanIndianAlaskaNative": "S2101_C01_016E",
    "Asian": "S2101_C01_017E",
    "NativeHawaiianOtherPacificIslander": "S2101_C01_018E",
    "HispanicOrLatino": "S2101_C01_021E",
    "Other": "S2101_C01_019E"
}
TOTAL_POP_IDENTIFIER = "S0101_C01_001E"
VOTING_AGE_POP_IDENTIFIERS = {
    "18-29": "S2902_C01_002E",
    "30-44": "S2902_C01_003E",
    "45-64": "S2902_C01_004E",
    "65+": "S2902_C01_005E"
}

In [None]:
OUTPUT_DIR = 'output2'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def fetch_population_data(state, population_scale, race_key, api_key_end):
    """
    Fetch population data for a specific state, scale, and race.
    """
    base_url = 'https://api.census.gov/data/2022/acs/acs5/subject'
    if population_scale == "State":
        url = f'{base_url}?get=NAME,{RACE_IDENTIFIERS[race_key]}&for=state:{state}{api_key_end}'
    else:
        scale = POPULATION_SCALES["State"] if population_scale != "State" else ""
        url = (
            f'{base_url}?get=NAME,{RACE_IDENTIFIERS[race_key]}&'
            f'{POPULATION_SCALES["State"]}{state}&{POPULATION_SCALES[population_scale]}{api_key_end}'
        )
    response = requests.get(url)
    return response

def process_data_frame(df, population_scale, state, race_key):
    """
    Process the DataFrame based on the population scale.
    """
    state_name = STATE_NAMES[state]
    if population_scale == "State":
        df.insert(1, "Year", [2022] * len(df), False)
        df = df.rename(columns={0: "State", 2: "State FIPS Code"})
    elif population_scale == "Counties":
        df.insert(1, "Year", [2022] * len(df), False)
        df = df.rename(columns={0: "County Name", 1: "Estimated Population", 2: "State FIPS Code", 3: "County Number"})
        df['County Name'] = df['County Name'].str.replace(f', {state_name}', '')
    elif population_scale == "CongressionalDistricts":
        df.insert(1, "Year", [2022] * len(df), False)
        df.insert(2, "Congress", [118] * len(df), False)
        df = df.rename(columns={0: "District Name", 2: "State FIPS Code", 3: "District Number"})
        df['District Name'] = df['District Name'].str.replace(f' \\(118th Congress\\), {state_name}', '', regex=True)

    df = df.rename(columns={1: "Estimated Population"})
    save_path = f'{OUTPUT_DIR}/{STATE_NAMES_LOWER[state]}_{POPULATION_SCALE_KEYS[population_scale]}estimated_populations_{race_key}.json'
    df.to_json(save_path, orient="records", indent=4)
    return df


In [None]:
# Main Execution
for state in STATE_NAMES:
    for population_scale in POPULATION_SCALES:
        for race_key in RACE_IDENTIFIERS:
            response = fetch_population_data(state, population_scale, race_key, API_KEY_END)
            if response.status_code == 200:
                try:
                    data = response.json()
                    df = pd.DataFrame(data[1:], columns=data[0])
                    processed_df = process_data_frame(df, population_scale, state, race_key)
                    print(processed_df.head())
                except Exception as e:
                    print(f"Error processing data for {race_key} in {state}: {e}")
            else:
                print(f"Failed request ({response.status_code}) for {race_key} in {state}")


## Block Level demographics data

### South Carolina

In [31]:
df_sc_race = pd.read_csv('raw/census_block/race/sc_race_2022_bg/sc_race_2022_bg.csv')
print(df_sc_race.columns)

Index(['GEOID', 'STATEFP', 'STATE', 'COUNTYFP', 'COUNTY', 'TOT_POP22',
       'NHSP_POP22', 'HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22',
       'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22', 'BLK_ALL22',
       'AIA_ALL22', 'ASN_ALL22', 'HPI_ALL22', 'OTH_ALL22'],
      dtype='object')


In [32]:
state_population = df_sc_race['TOT_POP22'].sum()

print(f"State Population: {state_population}")

State Population: 5142750


In [39]:
columns_to_sum = ['HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22', 'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22']
total_sum_from_columns = df_sc_race[columns_to_sum].sum().sum()
print(f"Total Sum from Columns: {total_sum_from_columns}")

Total Sum from Columns: 5142750


In [42]:
df_sc_race['OTH_NHSP22'] = df_sc_race['OTH_NHSP22'] + df_sc_race['2OM_NHSP22']

In [43]:
selected_columns = [
    "GEOID",
    "STATEFP",
    "STATE",
    "COUNTYFP",
    "COUNTY",
    "TOT_POP22",
    "NHSP_POP22",
    "HSP_POP22",
    "WHT_NHSP22",
    "BLK_NHSP22",
    "AIA_NHSP22",
    "ASN_NHSP22",
    "HPI_NHSP22",
    "OTH_NHSP22",
]


In [44]:
df_sc_race_filtered = df_sc_race[selected_columns]


In [45]:
path = "processed_individual/sc_race_block.csv"
df_sc_race_filtered.to_csv(path, index=False)

### Maryland

In [74]:
df_md_race = pd.read_csv('raw/census_block/race/md_race_2022_bg/md_race_2022_bg.csv')
print(df_md_race.columns)

Index(['GEOID', 'STATEFP', 'STATE', 'COUNTYFP', 'COUNTY', 'TOT_POP22',
       'NHSP_POP22', 'HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22',
       'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22', 'BLK_ALL22',
       'AIA_ALL22', 'ASN_ALL22', 'HPI_ALL22', 'OTH_ALL22'],
      dtype='object')


In [75]:
df_md_race['OTH_NHSP22'] = df_md_race['OTH_NHSP22'] + df_md_race['2OM_NHSP22']

In [76]:
df_md_race_filtered = df_md_race[selected_columns]

In [77]:
path = "processed_individual/md_race_block.csv"
df_md_race_filtered.to_csv(path, index=False)

# Preprocess State Summary

## South Carolina:

In [108]:
sc_race_df = pd.read_json("states/south_carolina/demographics/south_carolina_precincts_racial_population.json")
sc_econ_df = pd.read_json("states/south_carolina/economic/south_carolina_precincts_household_income.json")
sc_election_df = pd.read_json("states/south_carolina/election/sc_election.json")
sc_region_type_df = pd.read_json("states/south_carolina/geodata/south_carolina_precincts_region_type.json")

### State and racial population

In [109]:

state_population = sc_race_df['TOT_POP22'].sum()

total_state_pop = sc_race_df['TOT_POP22'].sum()

white_pop = sc_race_df['WHT_NHSP22'].sum()
black_pop = sc_race_df['BLK_NHSP22'].sum()
hispanic_pop = sc_race_df['HSP_POP22'].sum()
asian_pop = sc_race_df['ASN_NHSP22'].sum()
native_american_pop = sc_race_df['AIA_NHSP22'].sum()
islander_pop = sc_race_df['HPI_NHSP22'].sum()
other_pop = sc_race_df['OTH_NHSP22'].sum()
percent_white = (white_pop / total_state_pop) * 100
percent_black = (black_pop / total_state_pop) * 100
percent_hispanic = (hispanic_pop / total_state_pop) * 100
percent_asian = (asian_pop / total_state_pop) * 100
percent_native_american = (native_american_pop / total_state_pop) * 100
percent_islander = (islander_pop / total_state_pop) * 100
percent_other = (other_pop / total_state_pop) * 100

In [110]:
print(f"State Population: {state_population}")


State Population: 5142750


In [111]:

total_votes = sc_election_df['TOT_VOT'].sum()
votes_dem = sc_election_df['TOT_DEM'].sum()
votes_rep = sc_election_df['TOT_REP'].sum()
percent_dem = (votes_dem / total_votes) * 100
percent_rep = (votes_rep / total_votes) * 100

In [112]:
print(f"Total Votes: {total_votes}")
print(f"Total Votes for Democrats: {votes_dem}")
print(f"Total Votes for Republicans: {votes_rep}")
print(f"Percentage of Votes for Democrats: {percent_dem}")
print(f"Percentage of Votes for Republicans: {percent_rep}")

Total Votes: 1567833
Total Votes for Democrats: 515576
Total Votes for Republicans: 1052257
Percentage of Votes for Democrats: 32.88462482930261
Percentage of Votes for Republicans: 67.11537517069739


In [113]:
print(f"Total State Population: {total_state_pop}")
print(f"Total White Population: {white_pop}")
print(f"Total Black Population: {black_pop}")
print(f"Total Hispanic Population: {hispanic_pop}")
print(f"Total Asian Population: {asian_pop}")
print(f"Total Native American Population: {native_american_pop}")
print(f"Total Islander Population: {islander_pop}")
print(f"Total Other Population: {other_pop}")
print(f"Percentage of White Population: {percent_white}")
print(f"Percentage of Black Population: {percent_black}")
print(f"Percentage of Hispanic Population: {percent_hispanic}")
print(f"Percentage of Asian Population: {percent_asian}")
print(f"Percentage of Native American Population: {percent_native_american}")
print(f"Percentage of Islander Population: {percent_islander}")
print(f"Percentage of Other Population: {percent_other}")


Total State Population: 5142750
Total White Population: 3240171
Total Black Population: 1316074
Total Hispanic Population: 318875
Total Asian Population: 84972
Total Native American Population: 10304
Total Islander Population: 2443
Total Other Population: 169911
Percentage of White Population: 63.0046375966166
Percentage of Black Population: 25.590860920713627
Percentage of Hispanic Population: 6.200476398813865
Percentage of Asian Population: 1.6522677555782412
Percentage of Native American Population: 0.20035972971659133
Percentage of Islander Population: 0.047503767439599436
Percentage of Other Population: 3.3038938311214814


### Household Income Poppulation

In [114]:
income_columns = ['0_35K', '35K_60K', '60K-100K', '100K_125K', '125K_150K', '150K_MORE']
income_totals = sc_econ_df[income_columns].sum()
total_household_population = sc_econ_df['TOT_HOUS22'].sum()
income_distribution = (income_totals / total_household_population) * 100

In [115]:
print(f"Total Household Population: {total_household_population}")
print(f"Income Distribution: {income_distribution}")


Total Household Population: 2023085
Income Distribution: 0_35K        27.430978
35K_60K      20.147992
60K-100K     22.765232
100K_125K     9.222796
125K_150K     6.171268
150K_MORE    14.261734
dtype: float64


In [116]:
income_ranges = ['0_35K', '35K_60K', '60K-100K', '100K_125K', '125K_150K', '150K_MORE']
income_distribution_data = dict(zip(income_ranges, income_distribution.values))

### Region type population

In [117]:
print(sc_region_type_df.columns)

Index(['UNIQUE_ID', 'region_type', 'TOT_POP22'], dtype='object')


In [118]:
rural_population = 0
suburban_population = 0
urban_population = 0
for _, row in sc_region_type_df.iterrows():
    population = row["TOT_POP22"]
    region_type = row["region_type"]

    if region_type == "rural":
        rural_population += population
    elif region_type == "suburban":
        suburban_population += population
    elif region_type == "urban":
        urban_population += population
total_population = rural_population + suburban_population + urban_population
rural_percent = (rural_population / total_population) * 100 if total_population > 0 else 0
suburban_percent = (suburban_population / total_population) * 100 if total_population > 0 else 0
urban_percent = (urban_population / total_population) * 100 if total_population > 0 else 0

# Print the results
print(f"Rural Population: {rural_population}, Rural Percent: {rural_percent:.2f}%")
print(f"Suburban Population: {suburban_population}, Suburban Percent: {suburban_percent:.2f}%")
print(f"Urban Population: {urban_population}, Urban Percent: {urban_percent:.2f}%")

Rural Population: 1758073, Rural Percent: 34.19%
Suburban Population: 2394864, Suburban Percent: 46.57%
Urban Population: 989813, Urban Percent: 19.25%


### Create Data Summary for State

In [119]:
sc_summary_data = {
    "NAME": ["South Carolina"],
    "TOT_POP": [state_population],
    "TOT_WHITE": [white_pop],
    "TOT_BLACK": [black_pop],
    "TOT_HISP": [hispanic_pop],
    "TOT_ASIAN": [asian_pop],
    "TOT_NATIVE": [native_american_pop],
    "TOT_ISLANDER": [islander_pop],
    "TOT_OTHER": [other_pop],
    "DEM_VOT_DIS": [percent_dem],
    "REP_VOT_DIS": [percent_rep],
    # "WHITE_DIS": [percent_white],
    # "BLACK_DIS": [percent_black],
    # "HISP_DIS": [percent_hispanic],
    # "ASIAN_DIS": [percent_asian],
    # "NATIVE_DIS": [percent_native_american],
    # "ISLAND_DIS": [percent_islander],
    # "OTHER_DIS": [percent_other],
    "URBAN_DIS": [urban_percent],
    "SUBURBAN_DIS": [suburban_percent],
    "RURAL_DIS": [rural_percent],
    "TOT_HOUS": [total_household_population],
    "HOUS_INCOME_DIS": [income_distribution_data],
    "POV_LEVEL": [32470]
}

In [120]:
sc_summary_df = pd.DataFrame(sc_summary_data)


In [121]:
sc_summary_df.to_json('states/south_carolina/summary/south_carolina_summary.json', orient='records', lines=True, indent=4)

### Per Congressional district

In [145]:
rep_mapping = {
    "GCON01RMAC": {"Representative": "Nancy Mace", "Party": "REP", "Race": "White", "District": 1},
    "GCON02RWIL": {"Representative": "Joe Wilson", "Party": "REP", "Race": "White", "District": 2},
    "GCON03RDUN": {"Representative": "Jeff Duncan", "Party": "REP", "Race": "White", "District": 3},
    "GCON04RTIM": {"Representative": "William Timmons", "Party": "REP", "Race": "White", "District": 4},
    "GCON05RNOR": {"Representative": "Ralph Norman", "Party": "REP", "Race": "White", "District": 5},
    "GCON06DCLY": {"Representative": "James E. Clyburn", "Party": "DEM", "Race": "White", "District": 6},
    "GCON07RFRY": {"Representative": "Russell Fry", "Party": "REP", "Race": "White", "District": 7}
}


In [146]:
def add_representative_party_info(df, mapping, district_col="CONG_DIST"):
    """
    Adds 'Representative' and 'Party' columns to a DataFrame based on a district-to-representative mapping.

    :param df: pandas DataFrame containing district-level data
    :param mapping: Dictionary with representative and party information
    :param district_col: Name of the column in the DataFrame containing district identifiers
    :return: Modified DataFrame with 'Representative' and 'Party' columns added
    """
    df["Representative"] = None
    df["Party"] = None
    df["Race"] = None

    for index, row in df.iterrows():
        district = row[district_col]

        representative = None
        party = None
        race = None
        for key, rep in mapping.items():
            if rep["District"] == district:
                representative = rep["Representative"]
                party = rep["Party"]
                race = rep["Race"]
                break  
    
        df.at[index, "Representative"] = representative
        df.at[index, "Party"] = party
        df.at[index, "Race"] = race

    return df


In [147]:
def add_household_income_info(df, precinct_data, district_col="CONG_DIST"):
    """
    Adds 'Household Income' and 'Pct_Below_Poverty' 
    based on district-level aggregated precinct data.

    :param df: DataFrame containing district data without income information
    :param precinct_data: DataFrame containing precinct-level data
    :param district_col: District identifiers
    :return: Modified DataFrame with 'Household Income' and 'Pct_Below_Poverty' columns added
    """
    # Calculate metrics for each district
    district_metrics = precinct_data.groupby(district_col).apply(lambda group: {
        "Household Income": (group["MEDN_INC22"] * group["TOT_HOUS22"]).sum() / group["TOT_HOUS22"].sum() if group["TOT_HOUS22"].sum() > 0 else 0,
        "Pct_Below_Poverty": (group["0_35K"].sum() / group["TOT_HOUS22"].sum() * 100) if group["TOT_HOUS22"].sum() > 0 else 0
    }).to_dict()

    df["Household Income"] = None
    df["Pct_Below_Poverty"] = None

    for index, row in df.iterrows():
        district = row[district_col]
        metrics = district_metrics.get(district, {"Household Income": None, "Pct_Below_Poverty": None})
        df.at[index, "Household Income"] = metrics["Household Income"]
        df.at[index, "Pct_Below_Poverty"] = metrics["Pct_Below_Poverty"]

    return df


In [148]:
def add_region_population_percentages(df, precinct_data, district_col="CONG_DIST"):
    """
    Adds 'urban_pct', 'suburban_pct', and 'rural_pct' columns 
    based on district-level aggregated precinct data.

    :param df: DataFrame containing district data without region population percentages
    :param precinct_data: DataFrame containing precinct-level data
    :param district_col: District identifiers
    :return: Modified DataFrame with 'urban_pct', 'suburban_pct', and 'rural_pct' columns added
    """
    # Calculate metrics for each district
    district_metrics = precinct_data.groupby([district_col, "region_type"])["TOT_POP22"].sum().unstack(fill_value=0)

    # Add total population for each district
    district_metrics["Total_Pop"] = district_metrics.sum(axis=1)

    # Calculate percentages
    district_metrics["urban_pct"] = (district_metrics.get("urban", 0) / district_metrics["Total_Pop"]) * 100
    district_metrics["suburban_pct"] = (district_metrics.get("suburban", 0) / district_metrics["Total_Pop"]) * 100
    district_metrics["rural_pct"] = (district_metrics.get("rural", 0) / district_metrics["Total_Pop"]) * 100

    # Convert metrics to a dictionary
    district_metrics_dict = district_metrics[["urban_pct", "suburban_pct", "rural_pct"]].to_dict("index")

    # Initialize new columns in the DataFrame
    df["urban_pct"] = None
    df["suburban_pct"] = None
    df["rural_pct"] = None

    # Assign metrics to each district in the DataFrame
    for index, row in df.iterrows():
        district = row[district_col]
        metrics = district_metrics_dict.get(district, {"urban_pct": None, "suburban_pct": None, "rural_pct": None})
        df.at[index, "urban_pct"] = metrics["urban_pct"]
        df.at[index, "suburban_pct"] = metrics["suburban_pct"]
        df.at[index, "rural_pct"] = metrics["rural_pct"]

    return df


In [149]:
sc_election_cong_df = sc_election_df.groupby("CONG_DIST").agg({
    "TOT_REP": "sum",
    "TOT_DEM": "sum",
    "TOT_VOT": "sum",
}).reset_index()
sc_election_cong_df = add_representative_party_info(sc_election_cong_df, rep_mapping)

In [150]:
print(sc_region_type_df.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22']


In [151]:
sc_region_type_with_cong = pd.merge(
    sc_region_type_df,
    sc_econ_df[['UNIQUE_ID', 'CONG_DIST']],
    on='UNIQUE_ID',
    how='left' 
)

In [152]:
print(sc_region_type_with_cong.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22' 'CONG_DIST']


In [153]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    Representative Party   Race
0          1   153319   115463   268782        Nancy Mace   REP  White
1          2   147171    97771   244942        Joe Wilson   REP  White
2          3   189130        0   189130       Jeff Duncan   REP  White
3          4   164618        0   164618   William Timmons   REP  White
4          5   154047    82958   237005      Ralph Norman   REP  White
5          6    79688   130473   210161  James E. Clyburn   DEM  White
6          7   164284    88911   253195       Russell Fry   REP  White


In [154]:
sc_election_cong_df = add_household_income_info(sc_election_cong_df, sc_econ_df)

  district_metrics = precinct_data.groupby(district_col).apply(lambda group: {


In [155]:
sc_election_cong_df = add_region_population_percentages(sc_election_cong_df, sc_region_type_with_cong)

In [156]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    Representative Party   Race  \
0          1   153319   115463   268782        Nancy Mace   REP  White   
1          2   147171    97771   244942        Joe Wilson   REP  White   
2          3   189130        0   189130       Jeff Duncan   REP  White   
3          4   164618        0   164618   William Timmons   REP  White   
4          5   154047    82958   237005      Ralph Norman   REP  White   
5          6    79688   130473   210161  James E. Clyburn   DEM  White   
6          7   164284    88911   253195       Russell Fry   REP  White   

  Household Income Pct_Below_Poverty  urban_pct suburban_pct  rural_pct  
0     88744.763782         18.415168  19.590055    46.091865  34.318079  
1     72475.114049          23.99857  18.936308    49.212257  31.851435  
2     60055.075852         30.289377   14.95716    47.912248  37.130592  
3     70802.728617         24.940335  30.962907    53.487037  15.550056  
4     69542.360695         27.025359 

In [157]:
sc_election_cong_df['Vote_Pct_Rep'] = (sc_election_cong_df['TOT_REP'] / sc_election_cong_df['TOT_VOT']) * 100
sc_election_cong_df['Vote_Pct_Dem'] = (sc_election_cong_df['TOT_DEM'] / sc_election_cong_df['TOT_VOT']) * 100

In [158]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    Representative Party   Race  \
0          1   153319   115463   268782        Nancy Mace   REP  White   
1          2   147171    97771   244942        Joe Wilson   REP  White   
2          3   189130        0   189130       Jeff Duncan   REP  White   
3          4   164618        0   164618   William Timmons   REP  White   
4          5   154047    82958   237005      Ralph Norman   REP  White   
5          6    79688   130473   210161  James E. Clyburn   DEM  White   
6          7   164284    88911   253195       Russell Fry   REP  White   

  Household Income Pct_Below_Poverty  urban_pct suburban_pct  rural_pct  \
0     88744.763782         18.415168  19.590055    46.091865  34.318079   
1     72475.114049          23.99857  18.936308    49.212257  31.851435   
2     60055.075852         30.289377   14.95716    47.912248  37.130592   
3     70802.728617         24.940335  30.962907    53.487037  15.550056   
4     69542.360695         27.02

In [159]:
# drop TOT_REP  TOT_DEM  TOT_VOT 
sc_election_cong_df.drop(columns=['TOT_REP', 'TOT_DEM', 'TOT_VOT'], inplace=True)

In [162]:
# save to json
sc_election_cong_df.to_json('states/south_carolina/congressional_districts/summary/sc_congressional_districts_summary.json', orient='records', indent=4)

## Maryland

In [182]:
md_race_df = pd.read_json("states/maryland/demographics/maryland_precincts_racial_population.json")
md_econ_df = pd.read_json("states/maryland/economic/maryland_precincts_household_income.json")
md_election_df = pd.read_json("states/maryland/election/md_election.json")
md_region_type_df = pd.read_json("states/maryland/geodata/maryland_precincts_region_type.json")

In [123]:

state_population = md_race_df['TOT_POP22'].sum()

total_state_pop = md_race_df['TOT_POP22'].sum()

white_pop = md_race_df['WHT_NHSP22'].sum()
black_pop = md_race_df['BLK_NHSP22'].sum()
hispanic_pop = md_race_df['HSP_POP22'].sum()
asian_pop = md_race_df['ASN_NHSP22'].sum()
native_american_pop = md_race_df['AIA_NHSP22'].sum()
islander_pop = md_race_df['HPI_NHSP22'].sum()
other_pop = md_race_df['OTH_NHSP22'].sum()
percent_white = (white_pop / total_state_pop) * 100
percent_black = (black_pop / total_state_pop) * 100
percent_hispanic = (hispanic_pop / total_state_pop) * 100
percent_asian = (asian_pop / total_state_pop) * 100
percent_native_american = (native_american_pop / total_state_pop) * 100
percent_islander = (islander_pop / total_state_pop) * 100
percent_other = (other_pop / total_state_pop) * 100

In [124]:
print(f"State Population: {state_population}")


State Population: 6161707


In [125]:

total_votes = md_election_df['TOT_VOT'].sum()
votes_dem = md_election_df['TOT_DEM'].sum()
votes_rep = md_election_df['TOT_REP'].sum()
percent_dem = (votes_dem / total_votes) * 100
percent_rep = (votes_rep / total_votes) * 100

In [126]:
print(f"Total Votes: {total_votes}")
print(f"Total Votes for Democrats: {votes_dem}")
print(f"Total Votes for Republicans: {votes_rep}")
print(f"Percentage of Votes for Democrats: {percent_dem}")
print(f"Percentage of Votes for Republicans: {percent_rep}")

Total Votes: 2865967
Total Votes for Democrats: 1884607
Total Votes for Republicans: 981360
Percentage of Votes for Democrats: 65.75815422857276
Percentage of Votes for Republicans: 34.241845771427236


In [127]:
print(f"Total State Population: {total_state_pop}")
print(f"Total White Population: {white_pop}")
print(f"Total Black Population: {black_pop}")
print(f"Total Hispanic Population: {hispanic_pop}")
print(f"Total Asian Population: {asian_pop}")
print(f"Total Native American Population: {native_american_pop}")
print(f"Total Islander Population: {islander_pop}")
print(f"Total Other Population: {other_pop}")
print(f"Percentage of White Population: {percent_white}")
print(f"Percentage of Black Population: {percent_black}")
print(f"Percentage of Hispanic Population: {percent_hispanic}")
print(f"Percentage of Asian Population: {percent_asian}")
print(f"Percentage of Native American Population: {percent_native_american}")
print(f"Percentage of Islander Population: {percent_islander}")
print(f"Percentage of Other Population: {percent_other}")


Total State Population: 6161707
Total White Population: 2989005
Total Black Population: 1815877
Total Hispanic Population: 672905
Total Asian Population: 396983
Total Native American Population: 8480
Total Islander Population: 2105
Total Other Population: 276352
Percentage of White Population: 48.509365992248576
Percentage of Black Population: 29.470356185388237
Percentage of Hispanic Population: 10.920756212523575
Percentage of Asian Population: 6.44274386951538
Percentage of Native American Population: 0.13762420056649885
Percentage of Islander Population: 0.03416261110760378
Percentage of Other Population: 4.484990928650129


In [128]:
income_columns = ['0_35K', '35K_60K', '60K-100K', '100K_125K', '125K_150K', '150K_MORE']
income_totals = md_econ_df[income_columns].sum()
total_household_population = md_econ_df['TOT_HOUS22'].sum()
income_distribution = (income_totals / total_household_population) * 100

In [129]:
print(f"Total Household Population: {total_household_population}")
print(f"Income Distribution: {income_distribution}")


Total Household Population: 2318124
Income Distribution: 0_35K        16.729131
35K_60K      13.783603
60K-100K     20.215398
100K_125K    10.762927
125K_150K     8.524522
150K_MORE    29.984418
dtype: float64


In [130]:
income_ranges = ['0_35K', '35K_60K', '60K-100K', '100K_125K', '125K_150K', '150K_MORE']
income_distribution_data = dict(zip(income_ranges, income_distribution.values))

In [131]:
rural_population = 0
suburban_population = 0
urban_population = 0
for _, row in md_region_type_df.iterrows():
    population = row["TOT_POP22"]
    region_type = row["region_type"]

    if region_type == "rural":
        rural_population += population
    elif region_type == "suburban":
        suburban_population += population
    elif region_type == "urban":
        urban_population += population
total_population = rural_population + suburban_population + urban_population
rural_percent = (rural_population / total_population) * 100 if total_population > 0 else 0
suburban_percent = (suburban_population / total_population) * 100 if total_population > 0 else 0
urban_percent = (urban_population / total_population) * 100 if total_population > 0 else 0

# Print the results
print(f"Rural Population: {rural_population}, Rural Percent: {rural_percent:.2f}%")
print(f"Suburban Population: {suburban_population}, Suburban Percent: {suburban_percent:.2f}%")
print(f"Urban Population: {urban_population}, Urban Percent: {urban_percent:.2f}%")


Rural Population: 1053754, Rural Percent: 17.10%
Suburban Population: 2823435, Suburban Percent: 45.82%
Urban Population: 2284518, Urban Percent: 37.08%


In [183]:
md_summary_data = {
    "NAME": ["Maryland"],
    "TOT_POP": [state_population],
    "TOT_WHITE": [white_pop],
    "TOT_BLACK": [black_pop],
    "TOT_HISP": [hispanic_pop],
    "TOT_ASIAN": [asian_pop],
    "TOT_NATIVE": [native_american_pop],
    "TOT_ISLANDER": [islander_pop],
    "TOT_OTHER": [other_pop],
    "DEM_VOT_DIS": [percent_dem],
    "REP_VOT_DIS": [percent_rep],
    # "WHITE_DIS": [percent_white],
    # "BLACK_DIS": [percent_black],
    # "HISP_DIS": [percent_hispanic],
    # "ASIAN_DIS": [percent_asian],
    # "NATIVE_DIS": [percent_native_american],
    # "ISLAND_DIS": [percent_islander],
    # "OTHER_DIS": [percent_other],
    "URBAN_DIS": [urban_percent],
    "SUBURBAN_DIS": [suburban_percent],
    "RURAL_DIS": [rural_percent],
    "TOT_HOUS": [total_household_population],
    "HOUS_INCOME_DIS": [income_distribution_data],
    "POV_LEVEL": [32470]
}

In [184]:
md_summary_df = pd.DataFrame(md_summary_data)


In [185]:
md_summary_df.to_json('states/maryland/summary/maryland_summary.json', orient='records', lines=True, indent=4)

### Per Congressional district

In [234]:
md_rep_mapping = {
    "GCON01RAH": {"Representative": "Andy Harris", "Party": "REP", "Race": "White", "District": 1},
    "GCON02DCDR": {"Representative": "Dutch Ruppersberger", "Party": "DEM", "Race": "White", "District": 2},
    "GCON03DJS": {"Representative": "John Sarbanes", "Party": "DEM", "Race": "White", "District": 3},
    "GCON04DAGB": {"Representative": "Anthony G. Brown", "Party": "DEM", "Race": "African American", "District": 4},
    "GCON05DSHH": {"Representative": "Steny H. Hoyer", "Party": "DEM", "Race": "White", "District": 5},
    "GCON06DDJT": {"Representative": "David Trone", "Party": "DEM", "Race": "White", "District": 6},
    "GCON07DKM": {"Representative": "Kweisi Mfume", "Party": "DEM", "Race": "African American", "District": 7},
    "GCON08DJR": {"Representative": "Jamie Raskin", "Party": "DEM", "Race": "White", "District": 8}
}

In [235]:
print(md_election_df.columns.values)

['NAME' 'district' 'UNIQUE_ID' 'NUMBER' 'TOT_REP' 'TOT_DEM' 'TOT_VOT'
 'LEAN' 'GCON01RAH' 'GCON02RJRS' 'GCON03RCA' 'GCON04RGEM' 'GCON05RCP'
 'GCON06RNCP' 'GCON07RKK' 'GCON08RGTC' 'GCON01DMM' 'GCON02DCDR'
 'GCON03DJS' 'GCON04DAGB' 'GCON05DSHH' 'GCON06DDJT' 'GCON07DKM'
 'GCON08DJR']


In [236]:
print(md_election_df['district'].value_counts())

district
0    932
1    232
6    217
8    207
2    166
7    127
3     96
5     66
Name: count, dtype: int64


In [237]:
md_election_cong_df = md_election_df.groupby("district").agg({
    "TOT_REP": "sum",
    "TOT_DEM": "sum",
    "TOT_VOT": "sum",
}).reset_index()
# md_election_cong_df = add_representative_party_info(md_election_cong_df, md_rep_mapping, district_col="district")

In [238]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT
0         0        0        0        0
1         1   206470   113511   319981
2         2    92434   175270   267704
3         3    50877   129848   180725
4         5    52292    84419   136711
5         6   121533   207959   329492
6         7    78270   124167   202437
7         8   127157   274716   401873


In [229]:
print(md_region_type_df.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22']


In [230]:
md_region_type_with_cong = pd.merge(
    md_region_type_df,
    md_econ_df[['UNIQUE_ID']],
    on='UNIQUE_ID',
    how='left' 
)

In [231]:
print(md_region_type_with_cong.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22']


In [232]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       Representative Party  \
0         0        0        0        0                 None  None   
1         1   206470   113511   319981          Andy Harris   REP   
2         2    92434   175270   267704  Dutch Ruppersberger   DEM   
3         3    50877   129848   180725        John Sarbanes   DEM   
4         5    52292    84419   136711       Steny H. Hoyer   DEM   
5         6   121533   207959   329492          David Trone   DEM   
6         7    78270   124167   202437         Kweisi Mfume   DEM   
7         8   127157   274716   401873         Jamie Raskin   DEM   

               Race  
0              None  
1             White  
2             White  
3             White  
4             White  
5             White  
6  African American  
7             White  


In [233]:
#remove district == 0
md_election_cong_df = md_election_cong_df[md_election_cong_df['district'] != 0]

In [None]:
md_election_cong_df = add_household_income_info(md_election_cong_df, md_econ_df)

  district_metrics = precinct_data.groupby(district_col).apply(lambda group: {


In [None]:
sc_election_cong_df = add_region_population_percentages(sc_election_cong_df, sc_region_type_with_cong)

In [None]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    Representative Party   Race  \
0          1   153319   115463   268782        Nancy Mace   REP  White   
1          2   147171    97771   244942        Joe Wilson   REP  White   
2          3   189130        0   189130       Jeff Duncan   REP  White   
3          4   164618        0   164618   William Timmons   REP  White   
4          5   154047    82958   237005      Ralph Norman   REP  White   
5          6    79688   130473   210161  James E. Clyburn   DEM  White   
6          7   164284    88911   253195       Russell Fry   REP  White   

  Household Income Pct_Below_Poverty  urban_pct suburban_pct  rural_pct  
0     88744.763782         18.415168  19.590055    46.091865  34.318079  
1     72475.114049          23.99857  18.936308    49.212257  31.851435  
2     60055.075852         30.289377   14.95716    47.912248  37.130592  
3     70802.728617         24.940335  30.962907    53.487037  15.550056  
4     69542.360695         27.025359 

In [None]:
sc_election_cong_df['Vote_Pct_Rep'] = (sc_election_cong_df['TOT_REP'] / sc_election_cong_df['TOT_VOT']) * 100
sc_election_cong_df['Vote_Pct_Dem'] = (sc_election_cong_df['TOT_DEM'] / sc_election_cong_df['TOT_VOT']) * 100

In [None]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    Representative Party   Race  \
0          1   153319   115463   268782        Nancy Mace   REP  White   
1          2   147171    97771   244942        Joe Wilson   REP  White   
2          3   189130        0   189130       Jeff Duncan   REP  White   
3          4   164618        0   164618   William Timmons   REP  White   
4          5   154047    82958   237005      Ralph Norman   REP  White   
5          6    79688   130473   210161  James E. Clyburn   DEM  White   
6          7   164284    88911   253195       Russell Fry   REP  White   

  Household Income Pct_Below_Poverty  urban_pct suburban_pct  rural_pct  \
0     88744.763782         18.415168  19.590055    46.091865  34.318079   
1     72475.114049          23.99857  18.936308    49.212257  31.851435   
2     60055.075852         30.289377   14.95716    47.912248  37.130592   
3     70802.728617         24.940335  30.962907    53.487037  15.550056   
4     69542.360695         27.02

In [None]:
# drop TOT_REP  TOT_DEM  TOT_VOT 
sc_election_cong_df.drop(columns=['TOT_REP', 'TOT_DEM', 'TOT_VOT'], inplace=True)

In [None]:
# save to json
sc_election_cong_df.to_json('states/south_carolina/congressional_districts/summary/sc_congressional_districts_summary.json', orient='records', indent=4)