In [629]:
import json
import pandas as pd
import geopandas as gpd

## Block Level demographics data

In [None]:
def preprocess_state_summary(df, file_path):
    state_population = df['TOT_POP22'].sum()
    df['OTH_NHSP22'] = df['OTH_NHSP22'] + df['2OM_NHSP22']
    columns_to_sum = ['HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22', 'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22']
    total_sum_from_columns = df[columns_to_sum].sum().sum()
    print(f"Total Sum from Columns: {total_sum_from_columns}")
    selected_columns = [
    "GEOID",
    "STATEFP",
    "STATE",
    "COUNTYFP",
    "COUNTY",
    "TOT_POP22",
    "NHSP_POP22",
    "HSP_POP22",
    "WHT_NHSP22",
    "BLK_NHSP22",
    "AIA_NHSP22",
    "ASN_NHSP22",
    "HPI_NHSP22",
    "OTH_NHSP22",
]
    df_filtered = df[selected_columns]
    df_filtered.to_csv(file_path, index=False)

### South Carolina

In [None]:
df_sc_race = pd.read_csv('raw/census_block/race/sc_race_2022_bg/sc_race_2022_bg.csv')


Index(['GEOID', 'STATEFP', 'STATE', 'COUNTYFP', 'COUNTY', 'TOT_POP22',
       'NHSP_POP22', 'HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22',
       'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22', 'BLK_ALL22',
       'AIA_ALL22', 'ASN_ALL22', 'HPI_ALL22', 'OTH_ALL22'],
      dtype='object')


In [32]:
state_population = df_sc_race['TOT_POP22'].sum()

print(f"State Population: {state_population}")

State Population: 5142750


In [39]:
columns_to_sum = ['HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22', 'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22']
total_sum_from_columns = df_sc_race[columns_to_sum].sum().sum()
print(f"Total Sum from Columns: {total_sum_from_columns}")

Total Sum from Columns: 5142750


In [42]:
df_sc_race['OTH_NHSP22'] = df_sc_race['OTH_NHSP22'] + df_sc_race['2OM_NHSP22']

In [None]:
selected_columns = [
    "GEOID",
    "STATEFP",
    "STATE",
    "COUNTYFP",
    "COUNTY",
    "TOT_POP22",
    "NHSP_POP22",
    "HSP_POP22",
    "WHT_NHSP22",
    "BLK_NHSP22",
    "AIA_NHSP22",
    "ASN_NHSP22",
    "HPI_NHSP22",
    "OTH_NHSP22",
]


In [None]:
df_sc_race_filtered = df_sc_race[selected_columns]


In [None]:
path = "processed_individual/sc_race_block.csv"


### Maryland

In [74]:
df_md_race = pd.read_csv('raw/census_block/race/md_race_2022_bg/md_race_2022_bg.csv')
print(df_md_race.columns)

Index(['GEOID', 'STATEFP', 'STATE', 'COUNTYFP', 'COUNTY', 'TOT_POP22',
       'NHSP_POP22', 'HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22',
       'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22', 'BLK_ALL22',
       'AIA_ALL22', 'ASN_ALL22', 'HPI_ALL22', 'OTH_ALL22'],
      dtype='object')


In [75]:
df_md_race['OTH_NHSP22'] = df_md_race['OTH_NHSP22'] + df_md_race['2OM_NHSP22']

In [76]:
df_md_race_filtered = df_md_race[selected_columns]

In [77]:
path = "processed_individual/md_race_block.csv"
df_md_race_filtered.to_csv(path, index=False)

# Preprocess State Summary

## South Carolina:

In [757]:
party_columns = {
    'DEM': 'TOT_DEM',
    'REP': 'TOT_REP'
}
race_columns = {
    'WHITE': 'WHT_NHSP22',
    'BLACK': 'BLK_NHSP22',
    'HISPANIC': 'HSP_POP22',
    'ASIAN': 'ASN_NHSP22',
    'NATIVE_AMERICAN': 'AIA_NHSP22',
    'ISLANDER': 'HPI_NHSP22',
    'OTHER_RACE': 'OTH_NHSP22'
}
income_columns = ['0_35K', '35K_60K', '60K_100K', '100K_125K', '125K_150K', '150K_MORE']

POVERTY_LEVEL = 32470


### Functions:

In [758]:
def calculate_population_distribution(df, total_col, race_columns):
    total_population = int(df[total_col].sum())  # Convert to int
    population_summary = {'TOTAL_POPULATION': total_population}
    
    for race, col in race_columns.items():
        race_population = df[col].sum()
        race_percent = (race_population / total_population) * 100
        population_summary[f'{race}_PRECENT'] = float(race_percent)  # Convert to float
    
    return population_summary


In [759]:
def calculate_election_distribution(df, total_votes_col, party_columns):
    total_votes = int(df[total_votes_col].sum()) 
    election_summary = {'TOTAL_VOTES': total_votes}
    
    for party, col in party_columns.items():
        party_votes = int(df[col].sum())  
        election_summary[f'{party}_VOTES'] = party_votes
        party_percent = (party_votes / total_votes) * 100
        election_summary[f'{party}_PERCENT'] = float(party_percent) 
    
    return election_summary


In [760]:
def calculate_income_distribution(df, total_household_col, income_columns):
    income_totals = df[income_columns].sum().astype(int) 
    total_households = int(df[total_household_col].sum()) 
    income_distribution = (income_totals / total_households) * 100
    income_distribution = income_distribution.astype(float) 
    
    # Prepare summary
    household_income_summary = {
        'TOTAL_HOUSEHOLDS': total_households,
        'HOUSEHOLD_INCOME_DISTRIBUTION': income_distribution.to_dict()
    }
    return household_income_summary


In [761]:
def calculate_region_distribution(df, population_col, region_col):
    rural_population = int(df.loc[df[region_col] == "rural", population_col].sum())
    suburban_population = int(df.loc[df[region_col] == "suburban", population_col].sum())
    urban_population = int(df.loc[df[region_col] == "urban", population_col].sum())
    
    total_population = rural_population + suburban_population + urban_population
    
    rural_percent = (rural_population / total_population) * 100 if total_population > 0 else 0
    suburban_percent = (suburban_population / total_population) * 100 if total_population > 0 else 0
    urban_percent = (urban_population / total_population) * 100 if total_population > 0 else 0
    
    region_summary = {
        'RURAL_PERCENT': float(rural_percent),       # Convert to float
        'SUBURBAN_PERCENT': float(suburban_percent), # Convert to float
        'URBAN_PERCENT': float(urban_percent)        # Convert to float
    }
    return region_summary


### Import data

In [762]:
sc_race_df = pd.read_json("states/south_carolina/demographics/south_carolina_precincts_racial_population.json")
sc_econ_df = pd.read_json("states/south_carolina/economic/south_carolina_precincts_household_income.json")
sc_election_df = pd.read_json("states/south_carolina/election/sc_election.json")
sc_region_type_df = pd.read_json("states/south_carolina/geodata/south_carolina_precincts_region_type.json")

### State and racial population

In [763]:
population_summary = calculate_population_distribution(sc_race_df, 'TOT_POP22', race_columns)
election_summary = calculate_election_distribution(sc_election_df, 'TOT_VOT', party_columns)
income_summary = calculate_income_distribution(sc_econ_df, 'TOT_HOUS22', income_columns)
region_summary = calculate_region_distribution(sc_region_type_df, "TOT_POP22", "region_type")


### Create Data Summary for State

In [764]:
sc_summary_data = {
    "NAME": "South Carolina",
    **population_summary,
    **election_summary,
    **income_summary,
    **region_summary,
    "POVERTY_LEVEL": float(POVERTY_LEVEL) 
}


In [765]:
with open("states/south_carolina/summary/south_carolina_summary.json", "w") as file:
    json.dump(sc_summary_data, file, indent=4)

## Maryland

In [683]:
md_race_df = pd.read_json("states/maryland/demographics/maryland_precincts_racial_population.json")
md_econ_df = pd.read_json("states/maryland/economic/maryland_precincts_household_income.json")
md_election_df = pd.read_json("states/maryland/election/md_election_cd.json")
md_region_type_df = pd.read_json("states/maryland/geodata/maryland_precincts_region_type.json")

In [684]:
md_population_summary = calculate_population_distribution(md_race_df, 'TOT_POP22', race_columns)
md_election_summary = calculate_election_distribution(md_election_df, 'TOT_VOT', party_columns)
md_income_summary = calculate_income_distribution(md_econ_df, 'TOT_HOUS22', income_columns)
md_region_summary = calculate_region_distribution(md_region_type_df, "TOT_POP22", "region_type")

In [685]:
md_summary_data = {
    "NAME": "Maryland",
    **md_population_summary,
    **md_election_summary,
    **md_income_summary,
    **md_region_summary,
    "POVERTY_LEVEL": float(POVERTY_LEVEL) 
}

In [686]:
with open("states/maryland/summary/maryland_summary.json", "w") as file:
    json.dump(md_summary_data, file, indent=4)

# Preprocess Congressional District Summary

## Preprocess Congressional District Summary South Carolina

In [766]:
rep_mapping = {
    "GCON01RMAC": {"Representative": "Nancy Mace", "Party": "REP", "Race": "White", "District": 1},
    "GCON02RWIL": {"Representative": "Joe Wilson", "Party": "REP", "Race": "White", "District": 2},
    "GCON03RDUN": {"Representative": "Jeff Duncan", "Party": "REP", "Race": "White", "District": 3},
    "GCON04RTIM": {"Representative": "William Timmons", "Party": "REP", "Race": "White", "District": 4},
    "GCON05RNOR": {"Representative": "Ralph Norman", "Party": "REP", "Race": "White", "District": 5},
    "GCON06DCLY": {"Representative": "James E. Clyburn", "Party": "DEM", "Race": "White", "District": 6},
    "GCON07RFRY": {"Representative": "Russell Fry", "Party": "REP", "Race": "White", "District": 7}
}


### Functions

In [767]:
def add_representative_party_info(df, mapping, district_col="CONG_DIST"):
    df["Representative"] = None
    df["Party"] = None
    df["Race"] = None

    for index, row in df.iterrows():
        district = row[district_col]

        representative = None
        party = None
        race = None
        for key, rep in mapping.items():
            if rep["District"] == district:
                representative = rep["Representative"]
                party = rep["Party"]
                race = rep["Race"]
                break  
    
        df.at[index, "Representative"] = representative
        df.at[index, "Party"] = party
        df.at[index, "Race"] = race

    return df


In [768]:
def add_household_income_info(df, precinct_data, district_col="CONG_DIST"):
    """
    Adds 'Household Income' and 'Pct_Below_Poverty' 
    based on district-level aggregated precinct data.

    :param df: DataFrame containing district data without income information
    :param precinct_data: DataFrame containing precinct-level data
    :param district_col: District identifiers
    :return: Modified DataFrame with 'Household Income' and 'Pct_Below_Poverty' columns added
    """
    # Calculate metrics for each district
    district_metrics = precinct_data.groupby(district_col).apply(lambda group: {
        "Household Income": (group["MEDN_INC22"] * group["TOT_HOUS22"]).sum() / group["TOT_HOUS22"].sum() if group["TOT_HOUS22"].sum() > 0 else 0,
        "Pct_Below_Poverty": (group["0_35K"].sum() / group["TOT_HOUS22"].sum() * 100) if group["TOT_HOUS22"].sum() > 0 else 0
    }).to_dict()

    df["Household Income"] = None
    df["Pct_Below_Poverty"] = None

    for index, row in df.iterrows():
        district = row[district_col]
        metrics = district_metrics.get(district, {"Household Income": None, "Pct_Below_Poverty": None})
        df.at[index, "Household Income"] = metrics["Household Income"]
        df.at[index, "Pct_Below_Poverty"] = metrics["Pct_Below_Poverty"]

    return df


In [769]:
def add_region_population_percentages(df, precinct_data, district_col="CONG_DIST"):
    district_metrics = precinct_data.groupby([district_col, "region_type"])["TOT_POP22"].sum().unstack(fill_value=0)

    district_metrics["Total_Pop"] = district_metrics.sum(axis=1)

    district_metrics["urban_pct"] = (district_metrics.get("urban", 0) / district_metrics["Total_Pop"]) * 100
    district_metrics["suburban_pct"] = (district_metrics.get("suburban", 0) / district_metrics["Total_Pop"]) * 100
    district_metrics["rural_pct"] = (district_metrics.get("rural", 0) / district_metrics["Total_Pop"]) * 100

    district_metrics_dict = district_metrics[["urban_pct", "suburban_pct", "rural_pct"]].to_dict("index")

    df["urban_pct"] = None
    df["suburban_pct"] = None
    df["rural_pct"] = None

    for index, row in df.iterrows():
        district = row[district_col]
        metrics = district_metrics_dict.get(district, {"urban_pct": None, "suburban_pct": None, "rural_pct": None})
        df.at[index, "urban_pct"] = metrics["urban_pct"]
        df.at[index, "suburban_pct"] = metrics["suburban_pct"]
        df.at[index, "rural_pct"] = metrics["rural_pct"]

    return df


In [770]:
def process_election_data(election_df, econ_df, region_type_df, rep_mapping, district_col="CONG_DIST"):
    # Group election data by congressional district and sum votes
    election_each_district_df = election_df.groupby(district_col).agg({
        "TOT_REP": "sum",
        "TOT_DEM": "sum",
        "TOT_VOT": "sum",
    }).reset_index()

    election_each_district_df = add_representative_party_info(election_each_district_df, rep_mapping)

    region_type_with_district = pd.merge(
        region_type_df,
        econ_df[['UNIQUE_ID', 'CONG_DIST']],
        on='UNIQUE_ID',
        how='left' 
    )

    election_each_district_df = add_household_income_info(election_each_district_df, econ_df, district_col)

    election_each_district_df = add_region_population_percentages(election_each_district_df, region_type_with_district, district_col)

    election_each_district_df['Vote_Pct_Rep'] = (election_each_district_df['TOT_REP'] / election_each_district_df['TOT_VOT']) * 100
    election_each_district_df['Vote_Pct_Dem'] = (election_each_district_df['TOT_DEM'] / election_each_district_df['TOT_VOT']) * 100

    election_each_district_df.drop(columns=['TOT_REP', 'TOT_DEM', 'TOT_VOT'], inplace=True)

    return election_each_district_df


### Main Script

In [771]:
sc_election_each_district_df = process_election_data(sc_election_df, sc_econ_df, sc_region_type_df, rep_mapping)

  district_metrics = precinct_data.groupby(district_col).apply(lambda group: {


In [772]:
# save to json
sc_election_each_district_df.to_json('states/south_carolina/congressional_districts/summary/sc_congressional_districts_summary.json', orient='records', indent=4)

## Maryland

In [755]:
md_rep_mapping = {
    "GCON01RAH": {"Representative": "Andy Harris", "Party": "REP", "Race": "White", "District": 1},
    "GCON02DCDR": {"Representative": "Dutch Ruppersberger", "Party": "DEM", "Race": "White", "District": 2},
    "GCON03DJS": {"Representative": "John Sarbanes", "Party": "DEM", "Race": "White", "District": 3},
    "GCON04DAGB": {"Representative": "Anthony G. Brown", "Party": "DEM", "Race": "African American", "District": 4},
    "GCON05DSHH": {"Representative": "Steny H. Hoyer", "Party": "DEM", "Race": "White", "District": 5},
    "GCON06DDJT": {"Representative": "David Trone", "Party": "DEM", "Race": "White", "District": 6},
    "GCON07DKM": {"Representative": "Kweisi Mfume", "Party": "DEM", "Race": "African American", "District": 7},
    "GCON08DJR": {"Representative": "Jamie Raskin", "Party": "DEM", "Race": "White", "District": 8}
}

In [756]:
print(md_election_df.columns.values)

['UNIQUE_ID' 'district' 'TOT_REP' 'TOT_DEM' 'TOT_VOT' 'LEAN' 'GCON01RAH'
 'GCON02RJRS' 'GCON03RCA' 'GCON04RGEM' 'GCON05RCP' 'GCON06RNCP'
 'GCON07RKK' 'GCON08RGTC' 'GCON01DMM' 'GCON02DCDR' 'GCON03DJS'
 'GCON04DAGB' 'GCON05DSHH' 'GCON06DDJT' 'GCON07DKM' 'GCON08DJR']


In [705]:
print(md_election_df['district'].value_counts())

district
7    315
1    297
4    263
3    251
6    243
5    235
2    224
8    207
Name: count, dtype: int64


In [706]:
md_election_cong_df = md_election_df.groupby("district").agg({
    "TOT_REP": "sum",
    "TOT_DEM": "sum",
    "TOT_VOT": "sum",
}).reset_index()
md_election_cong_df = add_representative_party_info(md_election_cong_df, md_rep_mapping, district_col="district")

In [707]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       Representative Party  \
0         1   250901   143877   394778          Andy Harris   REP   
1         2   106355   224836   331191  Dutch Ruppersberger   DEM   
2         3   112117   260358   372475        John Sarbanes   DEM   
3         4    71671   282119   353790     Anthony G. Brown   DEM   
4         5   123525   274210   397735       Steny H. Hoyer   DEM   
5         6   143599   215540   359139          David Trone   DEM   
6         7    92825   237084   329909         Kweisi Mfume   DEM   
7         8   127157   274716   401873         Jamie Raskin   DEM   

               Race  
0             White  
1             White  
2             White  
3  African American  
4             White  
5             White  
6  African American  
7             White  


In [708]:
print(md_region_type_df.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22']


In [709]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       Representative Party  \
0         1   250901   143877   394778          Andy Harris   REP   
1         2   106355   224836   331191  Dutch Ruppersberger   DEM   
2         3   112117   260358   372475        John Sarbanes   DEM   
3         4    71671   282119   353790     Anthony G. Brown   DEM   
4         5   123525   274210   397735       Steny H. Hoyer   DEM   
5         6   143599   215540   359139          David Trone   DEM   
6         7    92825   237084   329909         Kweisi Mfume   DEM   
7         8   127157   274716   401873         Jamie Raskin   DEM   

               Race  
0             White  
1             White  
2             White  
3  African American  
4             White  
5             White  
6  African American  
7             White  


In [710]:
print("Columns in md_econ_df:", md_econ_df.columns)
print("Columns in md_election_df:", md_election_df.columns)


Columns in md_econ_df: Index(['NAME', 'NUMBER', 'UNIQUE_ID', 'MEDN_INC22', 'TOT_HOUS22', '0_35K',
       '35K_60K', '60K_100K', '100K_125K', '125K_150K', '150K_MORE'],
      dtype='object')
Columns in md_election_df: Index(['UNIQUE_ID', 'district', 'TOT_REP', 'TOT_DEM', 'TOT_VOT', 'LEAN',
       'GCON01RAH', 'GCON02RJRS', 'GCON03RCA', 'GCON04RGEM', 'GCON05RCP',
       'GCON06RNCP', 'GCON07RKK', 'GCON08RGTC', 'GCON01DMM', 'GCON02DCDR',
       'GCON03DJS', 'GCON04DAGB', 'GCON05DSHH', 'GCON06DDJT', 'GCON07DKM',
       'GCON08DJR'],
      dtype='object')


In [711]:
md_econ_df.columns = md_econ_df.columns.str.strip()
md_election_df.columns = md_election_df.columns.str.strip()

In [None]:
md_election_each_district_df = process_election_data(md_election_df, md_econ_df, md_region_type_df, md_rep_mapping, district_col="district")

In [712]:
md_election_merged_df = pd.merge(
    md_econ_df,
    md_election_df[['UNIQUE_ID', 'district']],
    on='UNIQUE_ID',
    how='left'
)


In [713]:
print(md_election_merged_df['district'].value_counts())

district
7.0    315
4.0    263
3.0    251
1.0    243
5.0    235
2.0    224
6.0    217
8.0    207
Name: count, dtype: int64


In [714]:
md_election_cong_df = add_household_income_info(md_election_cong_df, md_election_merged_df, district_col="district")

  district_metrics = precinct_data.groupby(district_col).apply(lambda group: {


In [715]:
md_region_type_with_cong_df = pd.merge(
    md_region_type_df,
    md_election_df[['UNIQUE_ID', 'district']],  # Ensure UNIQUE_ID is included
    on='UNIQUE_ID',
    how='left'
)

In [716]:
sc_election_cong_df = add_region_population_percentages(md_election_cong_df, md_region_type_with_cong_df, district_col="district")

In [717]:
print(sc_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       Representative Party  \
0         1   250901   143877   394778          Andy Harris   REP   
1         2   106355   224836   331191  Dutch Ruppersberger   DEM   
2         3   112117   260358   372475        John Sarbanes   DEM   
3         4    71671   282119   353790     Anthony G. Brown   DEM   
4         5   123525   274210   397735       Steny H. Hoyer   DEM   
5         6   143599   215540   359139          David Trone   DEM   
6         7    92825   237084   329909         Kweisi Mfume   DEM   
7         8   127157   274716   401873         Jamie Raskin   DEM   

               Race Household Income Pct_Below_Poverty  urban_pct  \
0             White     96314.100471         17.671644    38.4567   
1             White     86826.122014         19.816719  42.653239   
2             White    110717.108875         15.320607  29.620697   
3  African American    106543.858785         14.130405  39.005547   
4             White    120658.082

In [718]:
md_election_cong_df['Vote_Pct_Rep'] = (md_election_cong_df['TOT_REP'] / md_election_cong_df['TOT_VOT']) * 100
md_election_cong_df['Vote_Pct_Dem'] = (md_election_cong_df['TOT_DEM'] / md_election_cong_df['TOT_VOT']) * 100

In [719]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       Representative Party  \
0         1   250901   143877   394778          Andy Harris   REP   
1         2   106355   224836   331191  Dutch Ruppersberger   DEM   
2         3   112117   260358   372475        John Sarbanes   DEM   
3         4    71671   282119   353790     Anthony G. Brown   DEM   
4         5   123525   274210   397735       Steny H. Hoyer   DEM   
5         6   143599   215540   359139          David Trone   DEM   
6         7    92825   237084   329909         Kweisi Mfume   DEM   
7         8   127157   274716   401873         Jamie Raskin   DEM   

               Race Household Income Pct_Below_Poverty  urban_pct  \
0             White     96314.100471         17.671644    38.4567   
1             White     86826.122014         19.816719  42.653239   
2             White    110717.108875         15.320607  29.620697   
3  African American    106543.858785         14.130405  39.005547   
4             White    120658.082

In [720]:
# rename district to CONG_DIST
md_election_cong_df.rename(columns={'district': 'CONG_DIST'}, inplace=True)

In [721]:
# save to json
md_election_cong_df.to_json('states/maryland/congressional_districts/summary/md_congressional_districts_summary.json', orient='records', indent=4)