In [8]:
import json
import pandas as pd
import geopandas as gpd

## Block Level demographics data

In [15]:
def preprocess_state_summary(df, file_path):
    state_population = df['TOT_POP22'].sum()
    df['OTH_NHSP22'] = df['OTH_NHSP22'] + df['2OM_NHSP22']
    columns_to_sum = ['HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22', 'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22']
    total_sum_from_columns = df[columns_to_sum].sum().sum()
    print(f"Total Sum from Columns: {total_sum_from_columns}")
    selected_columns = [
    "GEOID",
    "STATEFP",
    "STATE",
    "COUNTYFP",
    "COUNTY",
    "TOT_POP22",
    "NHSP_POP22",
    "HSP_POP22",
    "WHT_NHSP22",
    "BLK_NHSP22",
    "AIA_NHSP22",
    "ASN_NHSP22",
    "HPI_NHSP22",
    "OTH_NHSP22",
]
    df_filtered = df[selected_columns]
    df_filtered.to_csv(file_path, index=False)

### South Carolina

In [16]:
df_sc_race = pd.read_csv('raw/census_block/race/sc_race_2022_bg/sc_race_2022_bg.csv')


In [17]:
state_population = df_sc_race['TOT_POP22'].sum()

print(f"State Population: {state_population}")

State Population: 5142750


In [18]:
columns_to_sum = ['HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22', 'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22']
total_sum_from_columns = df_sc_race[columns_to_sum].sum().sum()
print(f"Total Sum from Columns: {total_sum_from_columns}")

Total Sum from Columns: 5142750


In [19]:
df_sc_race['OTH_NHSP22'] = df_sc_race['OTH_NHSP22'] + df_sc_race['2OM_NHSP22'] + df_sc_race['AIA_NHSP22'] + df_sc_race['HPI_NHSP22']

In [20]:
selected_columns = [
    "GEOID",
    "STATEFP",
    "STATE",
    "COUNTYFP",
    "COUNTY",
    "TOT_POP22",
    "NHSP_POP22",
    "HSP_POP22",
    "WHT_NHSP22",
    "BLK_NHSP22",
    "ASN_NHSP22",
    "OTH_NHSP22",
]


In [21]:
df_sc_race_filtered = df_sc_race[selected_columns]


In [22]:
path = "processed_individual/sc_race_block.csv"
df_sc_race_filtered.to_csv(path, index=False)


### Maryland

In [74]:
df_md_race = pd.read_csv('raw/census_block/race/md_race_2022_bg/md_race_2022_bg.csv')
print(df_md_race.columns)

Index(['GEOID', 'STATEFP', 'STATE', 'COUNTYFP', 'COUNTY', 'TOT_POP22',
       'NHSP_POP22', 'HSP_POP22', 'WHT_NHSP22', 'BLK_NHSP22', 'AIA_NHSP22',
       'ASN_NHSP22', 'HPI_NHSP22', 'OTH_NHSP22', '2OM_NHSP22', 'BLK_ALL22',
       'AIA_ALL22', 'ASN_ALL22', 'HPI_ALL22', 'OTH_ALL22'],
      dtype='object')


In [None]:
df_md_race['OTH_NHSP22'] = df_md_race['OTH_NHSP22'] + df_md_race['2OM_NHSP22'] + df_md_race['AIA_NHSP22'] + df_md_race['HPI_NHSP22']

In [76]:
df_md_race_filtered = df_md_race[selected_columns]

In [77]:
path = "processed_individual/md_race_block.csv"
df_md_race_filtered.to_csv(path, index=False)

# Preprocess State Summary

## South Carolina:

In [210]:
party_columns = {
    'DEM': 'TOT_DEM',
    'REP': 'TOT_REP'
}
race_columns = {
    'WHITE': 'WHT_NHSP22',
    'BLACK': 'BLK_NHSP22',
    'HISPANIC': 'HSP_POP22',
    'ASIAN': 'ASN_NHSP22',
    # 'NATIVE_AMERICAN': 'AIA_NHSP22',
    # 'ISLANDER': 'AIA_NHSP22',
    'OTHER_RACE': 'OTH_NHSP22'
}
income_columns = ['0_35K', '35K_60K', '60K_100K', '100K_125K', '125K_150K', '150K_MORE']

POVERTY_LEVEL = 32470 # Federal poverty level (FPL) for a family of five in 2022


### Functions:

In [211]:
def calculate_population_distribution(df, total_col, race_columns):
    total_population = int(df[total_col].sum()) 
    population_summary = {'TOTAL_POPULATION': total_population}
    
    race_percentages = []
    for race, col in race_columns.items():
        race_total_population = df[col].sum()
        race_percent = (race_total_population / total_population) * 100
        race_percentages.append(round(race_percent))
    
    # Adjust the last percentage to ensure the sum equals 100
    adjustment = 100 - sum(race_percentages)
    race_percentages[-1] += adjustment
    
    # Build the summary dictionary
    for i, (race, _) in enumerate(race_columns.items()):
        population_summary[f'{race}_PERCENT'] = race_percentages[i]
    
    return population_summary


In [212]:
def calculate_election_distribution(df, total_votes_col, party_columns):
    total_votes = int(df[total_votes_col].sum()) 
    election_summary = {'TOTAL_VOTES': total_votes}
    
    party_percentages = []
    for party, col in party_columns.items():
        party_votes = int(df[col].sum())
        election_summary[f'{party}_VOTES'] = party_votes
        party_percentages.append((party, (party_votes / total_votes) * 100))
    
    # Round percentages and adjust the last one
    rounded_percentages = [round(percent) for _, percent in party_percentages]
    adjustment = 100 - sum(rounded_percentages)
    rounded_percentages[-1] += adjustment  # Adjust the last percentage
    
    for i, (party, _) in enumerate(party_percentages):
        election_summary[f'{party}_PERCENT'] = rounded_percentages[i]
    
    return election_summary

In [213]:
def calculate_income_distribution(df, total_household_col, income_columns):
    income_totals = df[income_columns].sum().astype(int)
    total_households = int(df[total_household_col].sum())
    
    # Calculate initial percentages
    income_percentages = (income_totals / total_households) * 100
    
    # Round percentages and adjust the last one
    rounded_percentages = income_percentages.round().astype(int).tolist()
    adjustment = 100 - sum(rounded_percentages)
    rounded_percentages[-1] += adjustment  # Adjust the last percentage
    
    # Create the dictionary with adjusted percentages
    income_distribution = {
        col: rounded_percentages[i] for i, col in enumerate(income_columns)
    }
    
    household_income_summary = {
        'TOTAL_HOUSEHOLDS': total_households,
        'HOUSEHOLD_INCOME_DISTRIBUTION': income_distribution
    }
    return household_income_summary


In [214]:
def calculate_region_distribution(df, population_col, region_col):
    # Calculate population totals for each region
    rural_population = int(df.loc[df[region_col] == "rural", population_col].sum())
    suburban_population = int(df.loc[df[region_col] == "suburban", population_col].sum())
    urban_population = int(df.loc[df[region_col] == "urban", population_col].sum())
    
    total_population = rural_population + suburban_population + urban_population
    
    # Calculate percentages
    if total_population > 0:
        rural_percent = (rural_population / total_population) * 100
        suburban_percent = (suburban_population / total_population) * 100
        urban_percent = (urban_population / total_population) * 100

        # Round percentages
        percentages = [round(rural_percent), round(suburban_percent), round(urban_percent)]
        
        # Adjust the last percentage to ensure the total sums to 100
        adjustment = 100 - sum(percentages)
        percentages[-1] += adjustment

        # Assign the adjusted percentages
        rural_percent, suburban_percent, urban_percent = percentages
    else:
        rural_percent = suburban_percent = urban_percent = 0
    
    # Create summary dictionary
    region_summary = {
        'RURAL_PERCENT': rural_percent,
        'SUBURBAN_PERCENT': suburban_percent,
        'URBAN_PERCENT': urban_percent
    }
    
    return region_summary


### Import data

In [215]:
sc_race_df = pd.read_json("states/south_carolina/demographics/south_carolina_precincts_racial_population.json")
sc_econ_df = pd.read_json("states/south_carolina/economic/south_carolina_precincts_household_income.json")
sc_election_df = pd.read_json("states/south_carolina/election/sc_election.json")
sc_region_type_df = pd.read_json("states/south_carolina/geodata/south_carolina_precincts_region_type.json")

In [216]:
sc_race_df['OTH_NHSP22'] = sc_race_df['OTH_NHSP22'] + sc_race_df['AIA_NHSP22'] + sc_race_df['AIA_NHSP22']

### State and racial population

In [217]:
population_summary = calculate_population_distribution(sc_race_df, 'TOT_POP22', race_columns)
election_summary = calculate_election_distribution(sc_election_df, 'TOT_VOT', party_columns)
income_summary = calculate_income_distribution(sc_econ_df, 'TOT_HOUS22', income_columns)
region_summary = calculate_region_distribution(sc_region_type_df, "TOT_POP22", "region_type")


### Create Data Summary for State

In [218]:
sc_summary_data = {
    "NAME": "South Carolina",
    **population_summary,
    **election_summary,
    **income_summary,
    **region_summary,
    "POVERTY_LEVEL": int(POVERTY_LEVEL) 
}


In [219]:
with open("states/south_carolina/summary/south_carolina_summary.json", "w") as file:
    json.dump(sc_summary_data, file, indent=4)

## Maryland

In [92]:
md_race_df = pd.read_json("states/maryland/demographics/maryland_precincts_racial_population.json")
md_econ_df = pd.read_json("states/maryland/economic/maryland_precincts_household_income.json")
md_election_df = pd.read_json("states/maryland/election/md_election_cd.json")
md_region_type_df = pd.read_json("states/maryland/geodata/maryland_precincts_region_type.json")

In [93]:
md_population_summary = calculate_population_distribution(md_race_df, 'TOT_POP22', race_columns)
md_election_summary = calculate_election_distribution(md_election_df, 'TOT_VOT', party_columns)
md_income_summary = calculate_income_distribution(md_econ_df, 'TOT_HOUS22', income_columns)
md_region_summary = calculate_region_distribution(md_region_type_df, "TOT_POP22", "region_type")

In [94]:
md_summary_data = {
    "NAME": "Maryland",
    **md_population_summary,
    **md_election_summary,
    **md_income_summary,
    **md_region_summary,
    "POVERTY_LEVEL": float(POVERTY_LEVEL) 
}

In [95]:
with open("states/maryland/summary/maryland_summary.json", "w") as file:
    json.dump(md_summary_data, file, indent=4)

# Preprocess Congressional District Summary

## Preprocess Congressional District Summary South Carolina

In [220]:
rep_mapping = {
    "GCON01RMAC": {"REPRESENTATIVE": "Nancy Mace", "PARTY": "Republican", "RACE": "White", "District": 1},
    "GCON02RWIL": {"REPRESENTATIVE": "Joe Wilson", "PARTY": "Republican", "RACE": "White", "District": 2},
    "GCON03RDUN": {"REPRESENTATIVE": "Jeff Duncan", "PARTY": "Republican", "RACE": "White", "District": 3},
    "GCON04RTIM": {"REPRESENTATIVE": "William Timmons", "PARTY": "Republican", "RACE": "White", "District": 4},
    "GCON05RNOR": {"REPRESENTATIVE": "Ralph Norman", "PARTY": "Republican", "RACE": "White", "District": 5},
    "GCON06DCLY": {"REPRESENTATIVE": "James E. Clyburn", "PARTY": "Democratic", "RACE": "White", "District": 6},
    "GCON07RFRY": {"REPRESENTATIVE": "Russell Fry", "PARTY": "Republican", "RACE": "White", "District": 7}
}


In [221]:
def add_representative_party_info(df, mapping, district_col="CONG_DIST"):
    """
    Adds 'Representative' and 'Party' columns to a DataFrame based on a district-to-representative mapping.

    :param df: pandas DataFrame containing district-level data
    :param mapping: Dictionary with representative and party information
    :param district_col: Name of the column in the DataFrame containing district identifiers
    :return: Modified DataFrame with 'Representative' and 'Party' columns added
    """
    df["REPRESENTATIVE"] = None
    df["PARTY"] = None
    df["RACE"] = None

    for index, row in df.iterrows():
        district = row[district_col]

        representative = None
        party = None
        race = None
        for key, rep in mapping.items():
            if rep["District"] == district:
                representative = rep["REPRESENTATIVE"]
                party = rep["PARTY"]
                race = rep["RACE"]
                break  
    
        df.at[index, "REPRESENTATIVE"] = representative
        df.at[index, "PARTY"] = party
        df.at[index, "RACE"] = race

    return df


In [222]:
def add_household_income_info(df, precinct_data, district_col="CONG_DIST"):
    """
    Adds 'Household Income' and 'Pct_Below_Poverty' 
    based on district-level aggregated precinct data.

    :param df: DataFrame containing district data without income information
    :param precinct_data: DataFrame containing precinct-level data
    :param district_col: District identifiers
    :return: Modified DataFrame with 'Household Income' and 'Pct_Below_Poverty' columns added
    """
    # Calculate metrics for each district
    district_metrics = precinct_data.groupby(district_col).apply(lambda group: {
        "AVERAGE_HOUSEHOLD_INCOME": round((group["MEDN_INC22"] * group["TOT_HOUS22"]).sum() / group["TOT_HOUS22"].sum()) if group["TOT_HOUS22"].sum() > 0 else 0,
        "PERCENT_BELOW_POVERTY": round((group["0_35K"].sum() / group["TOT_HOUS22"].sum()) * 100) if group["TOT_HOUS22"].sum() > 0 else 0
    }).to_dict()

    # Add new columns to the DataFrame
    df["AVERAGE_HOUSEHOLD_INCOME"] = None
    df["PERCENT_BELOW_POVERTY"] = None

    # Populate the new columns with district metrics
    for index, row in df.iterrows():
        district = row[district_col]
        metrics = district_metrics.get(district, {"AVERAGE_HOUSEHOLD_INCOME": None, "PERCENT_BELOW_POVERTY": None})
        df.at[index, "AVERAGE_HOUSEHOLD_INCOME"] = int(metrics["AVERAGE_HOUSEHOLD_INCOME"]) if metrics["AVERAGE_HOUSEHOLD_INCOME"] is not None else None
        df.at[index, "PERCENT_BELOW_POVERTY"] = int(metrics["PERCENT_BELOW_POVERTY"]) if metrics["PERCENT_BELOW_POVERTY"] is not None else None

    return df



In [223]:
def add_region_population_percentages(df, precinct_data, district_col="CONG_DIST"):
    """
    Adds 'RURAL_PERCENT', 'SUBURBAN_PERCENT', and 'URBAN_PERCENT' columns 
    based on district-level aggregated precinct data.

    :param df: DataFrame containing district data without region population percentages
    :param precinct_data: DataFrame containing precinct-level data
    :param district_col: District identifiers
    :return: Modified DataFrame with 'RURAL_PERCENT', 'SUBURBAN_PERCENT', and 'URBAN_PERCENT' columns added
    """
    # Calculate metrics for each district
    district_metrics = precinct_data.groupby([district_col, "region_type"])["TOT_POP22"].sum().unstack(fill_value=0)

    # Add total population for each district
    district_metrics["Total_Pop"] = district_metrics.sum(axis=1)

    # Calculate percentages and round
    district_metrics["URBAN_PERCENT"] = round((district_metrics.get("urban", 0) / district_metrics["Total_Pop"]) * 100).astype(int)
    district_metrics["SUBURBAN_PERCENT"] = round((district_metrics.get("suburban", 0) / district_metrics["Total_Pop"]) * 100).astype(int)
    district_metrics["RURAL_PERCENT"] = round((district_metrics.get("rural", 0) / district_metrics["Total_Pop"]) * 100).astype(int)

    # Adjust the last percentage to ensure the sum is exactly 100
    district_metrics["RURAL_PERCENT"] += 100 - (
        district_metrics["URBAN_PERCENT"] + district_metrics["SUBURBAN_PERCENT"] + district_metrics["RURAL_PERCENT"]
    )

    # Convert metrics to a dictionary
    district_metrics_dict = district_metrics[["URBAN_PERCENT", "SUBURBAN_PERCENT", "RURAL_PERCENT"]].to_dict("index")

    # Initialize new columns in the DataFrame
    df["RURAL_PERCENT"] = None
    df["SUBURBAN_PERCENT"] = None
    df["URBAN_PERCENT"] = None

    # Assign metrics to each district in the DataFrame
    for index, row in df.iterrows():
        district = row[district_col]
        metrics = district_metrics_dict.get(district, {"URBAN_PERCENT": None, "SUBURBAN_PERCENT": None, "RURAL_PERCENT": None})
        df.at[index, "RURAL_PERCENT"] = metrics["RURAL_PERCENT"]
        df.at[index, "SUBURBAN_PERCENT"] = metrics["SUBURBAN_PERCENT"]
        df.at[index, "URBAN_PERCENT"] = metrics["URBAN_PERCENT"]

    return df


In [224]:
sc_election_cong_df = sc_election_df.groupby("CONG_DIST").agg({
    "TOT_REP": "sum",
    "TOT_DEM": "sum",
    "TOT_VOT": "sum",
}).reset_index()
sc_election_cong_df = add_representative_party_info(sc_election_cong_df, rep_mapping)

In [225]:
print(sc_region_type_df.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22']


In [226]:
sc_region_type_with_cong = pd.merge(
    sc_region_type_df,
    sc_econ_df[['UNIQUE_ID', 'CONG_DIST']],
    on='UNIQUE_ID',
    how='left' 
)

In [227]:
print(sc_region_type_with_cong.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22' 'CONG_DIST']


In [228]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    REPRESENTATIVE       PARTY   RACE
0          1   153319   115463   268782        Nancy Mace  Republican  White
1          2   147171    97771   244942        Joe Wilson  Republican  White
2          3   189130        0   189130       Jeff Duncan  Republican  White
3          4   164618        0   164618   William Timmons  Republican  White
4          5   154047    82958   237005      Ralph Norman  Republican  White
5          6    79688   130473   210161  James E. Clyburn  Democratic  White
6          7   164284    88911   253195       Russell Fry  Republican  White


In [229]:
sc_election_cong_df = add_household_income_info(sc_election_cong_df, sc_econ_df)

  district_metrics = precinct_data.groupby(district_col).apply(lambda group: {


In [230]:
sc_election_cong_df = add_region_population_percentages(sc_election_cong_df, sc_region_type_with_cong)

In [231]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    REPRESENTATIVE       PARTY   RACE  \
0          1   153319   115463   268782        Nancy Mace  Republican  White   
1          2   147171    97771   244942        Joe Wilson  Republican  White   
2          3   189130        0   189130       Jeff Duncan  Republican  White   
3          4   164618        0   164618   William Timmons  Republican  White   
4          5   154047    82958   237005      Ralph Norman  Republican  White   
5          6    79688   130473   210161  James E. Clyburn  Democratic  White   
6          7   164284    88911   253195       Russell Fry  Republican  White   

  AVERAGE_HOUSEHOLD_INCOME PERCENT_BELOW_POVERTY RURAL_PERCENT  \
0                    88931                    18            34   
1                    72580                    24            31   
2                    60055                    30            37   
3                    70803                    25            16   
4                    69585   

In [232]:
sc_election_cong_df['REP_PERCENT'] = (sc_election_cong_df['TOT_REP'] / sc_election_cong_df['TOT_VOT']) * 100
sc_election_cong_df['DEM_PERCENT'] = (sc_election_cong_df['TOT_DEM'] / sc_election_cong_df['TOT_VOT']) * 100
# Round percentages to integers
sc_election_cong_df['REP_PERCENT'] = sc_election_cong_df['REP_PERCENT'].round().astype(int)
sc_election_cong_df['DEM_PERCENT'] = sc_election_cong_df['DEM_PERCENT'].round().astype(int)

# Adjust the last percentage to ensure no rounding error if needed
sc_election_cong_df['REP_PERCENT'] += 100 - (sc_election_cong_df['REP_PERCENT'] + sc_election_cong_df['DEM_PERCENT'])

In [233]:
print(sc_election_cong_df)

   CONG_DIST  TOT_REP  TOT_DEM  TOT_VOT    REPRESENTATIVE       PARTY   RACE  \
0          1   153319   115463   268782        Nancy Mace  Republican  White   
1          2   147171    97771   244942        Joe Wilson  Republican  White   
2          3   189130        0   189130       Jeff Duncan  Republican  White   
3          4   164618        0   164618   William Timmons  Republican  White   
4          5   154047    82958   237005      Ralph Norman  Republican  White   
5          6    79688   130473   210161  James E. Clyburn  Democratic  White   
6          7   164284    88911   253195       Russell Fry  Republican  White   

  AVERAGE_HOUSEHOLD_INCOME PERCENT_BELOW_POVERTY RURAL_PERCENT  \
0                    88931                    18            34   
1                    72580                    24            31   
2                    60055                    30            37   
3                    70803                    25            16   
4                    69585   

In [234]:
# drop TOT_REP  TOT_DEM  TOT_VOT 
sc_election_cong_df.drop(columns=['TOT_REP', 'TOT_DEM', 'TOT_VOT'], inplace=True)

In [235]:
import json

# Define the election metadata
election_info = "2022 United States House of Representatives"

# Convert DataFrame to JSON and structure it
election_summary = {
    "NAME": "South Carolina",
    "election": election_info,
    "data": sc_election_cong_df.to_dict(orient='records')  # Convert DataFrame to a list of dictionaries
}

# Save to JSON file with the desired format
with open('states/south_carolina/congressional_districts/summary/sc_congressional_districts_summary.json', 'w') as file:
    json.dump(election_summary, file, indent=4)


## Maryland

In [191]:
md_rep_mapping = {
    "GCON01RAH": {"REPRESENTATIVE": "Andy Harris", "PARTY": "Republican", "RACE": "White", "District": 1},
    "GCON02DCDR": {"REPRESENTATIVE": "Dutch Ruppersberger", "PARTY": "Democratic", "RACE": "White", "District": 2},
    "GCON03DJS": {"REPRESENTATIVE": "John Sarbanes", "PARTY": "Democratic", "RACE": "White", "District": 3},
    "GCON04DAGB": {"REPRESENTATIVE": "Anthony G. Brown", "PARTY": "Democratic", "RACE": "African American", "District": 4},
    "GCON05DSHH": {"REPRESENTATIVE": "Steny H. Hoyer", "PARTY": "Democratic", "RACE": "White", "District": 5},
    "GCON06DDJT": {"REPRESENTATIVE": "David Trone", "PARTY": "Democratic", "RACE": "White", "District": 6},
    "GCON07DKM": {"REPRESENTATIVE": "Kweisi Mfume", "PARTY": "Democratic", "RACE": "African American", "District": 7},
    "GCON08DJR": {"REPRESENTATIVE": "Jamie Raskin", "PARTY": "Democratic", "RACE": "White", "District": 8}
}

In [192]:
print(md_election_df.columns.values)

['UNIQUE_ID' 'district' 'TOT_REP' 'TOT_DEM' 'TOT_VOT' 'LEAN' 'GCON01RAH'
 'GCON02RJRS' 'GCON03RCA' 'GCON04RGEM' 'GCON05RCP' 'GCON06RNCP'
 'GCON07RKK' 'GCON08RGTC' 'GCON01DMM' 'GCON02DCDR' 'GCON03DJS'
 'GCON04DAGB' 'GCON05DSHH' 'GCON06DDJT' 'GCON07DKM' 'GCON08DJR']


In [193]:
print(md_election_df['district'].value_counts())

district
7    315
1    297
4    263
3    251
6    243
5    235
2    224
8    207
Name: count, dtype: int64


In [194]:
md_election_cong_df = md_election_df.groupby("district").agg({
    "TOT_REP": "sum",
    "TOT_DEM": "sum",
    "TOT_VOT": "sum",
}).reset_index()
md_election_cong_df = add_representative_party_info(md_election_cong_df, md_rep_mapping, district_col="district")

In [195]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       REPRESENTATIVE       PARTY  \
0         1   250901   143877   394778          Andy Harris  Republican   
1         2   106355   224836   331191  Dutch Ruppersberger  Democratic   
2         3   112117   260358   372475        John Sarbanes  Democratic   
3         4    71671   282119   353790     Anthony G. Brown  Democratic   
4         5   123525   274210   397735       Steny H. Hoyer  Democratic   
5         6   143599   215540   359139          David Trone  Democratic   
6         7    92825   237084   329909         Kweisi Mfume  Democratic   
7         8   127157   274716   401873         Jamie Raskin  Democratic   

               RACE  
0             White  
1             White  
2             White  
3  African American  
4             White  
5             White  
6  African American  
7             White  


In [196]:
print(md_region_type_df.columns.values)

['UNIQUE_ID' 'region_type' 'TOT_POP22']


In [197]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       REPRESENTATIVE       PARTY  \
0         1   250901   143877   394778          Andy Harris  Republican   
1         2   106355   224836   331191  Dutch Ruppersberger  Democratic   
2         3   112117   260358   372475        John Sarbanes  Democratic   
3         4    71671   282119   353790     Anthony G. Brown  Democratic   
4         5   123525   274210   397735       Steny H. Hoyer  Democratic   
5         6   143599   215540   359139          David Trone  Democratic   
6         7    92825   237084   329909         Kweisi Mfume  Democratic   
7         8   127157   274716   401873         Jamie Raskin  Democratic   

               RACE  
0             White  
1             White  
2             White  
3  African American  
4             White  
5             White  
6  African American  
7             White  


In [198]:
print("Columns in md_econ_df:", md_econ_df.columns)
print("Columns in md_election_df:", md_election_df.columns)


Columns in md_econ_df: Index(['NAME', 'NUMBER', 'JURSCODE', 'VOTESPRE', 'G20PREDBID', 'G20PRERTRU',
       'G20PRELJOR', 'G20PREGHAW', 'G20PREBSEG', 'G20PREOWRI', 'UNIQUE_ID',
       'MEDN_INC22', 'TOT_HOUS22', '0_35K', '35K_60K', '60K_100K', '100K_125K',
       '125K_150K', '150K_MORE'],
      dtype='object')
Columns in md_election_df: Index(['UNIQUE_ID', 'district', 'TOT_REP', 'TOT_DEM', 'TOT_VOT', 'LEAN',
       'GCON01RAH', 'GCON02RJRS', 'GCON03RCA', 'GCON04RGEM', 'GCON05RCP',
       'GCON06RNCP', 'GCON07RKK', 'GCON08RGTC', 'GCON01DMM', 'GCON02DCDR',
       'GCON03DJS', 'GCON04DAGB', 'GCON05DSHH', 'GCON06DDJT', 'GCON07DKM',
       'GCON08DJR'],
      dtype='object')


In [199]:
md_econ_df.columns = md_econ_df.columns.str.strip()
md_election_df.columns = md_election_df.columns.str.strip()

In [200]:
md_election_merged_df = pd.merge(
    md_econ_df,
    md_election_df[['UNIQUE_ID', 'district']],
    on='UNIQUE_ID',
    how='left'
)


In [201]:
print(md_election_merged_df['district'].value_counts())

district
7.0    315
4.0    263
3.0    251
1.0    243
5.0    235
2.0    224
6.0    217
8.0    207
Name: count, dtype: int64


In [202]:
md_election_cong_df = add_household_income_info(md_election_cong_df, md_election_merged_df, district_col="district")

  district_metrics = precinct_data.groupby(district_col).apply(lambda group: {


In [203]:
md_region_type_with_cong_df = pd.merge(
    md_region_type_df,
    md_election_df[['UNIQUE_ID', 'district']],  # Ensure UNIQUE_ID is included
    on='UNIQUE_ID',
    how='left'
)

In [204]:
sc_election_cong_df = add_region_population_percentages(md_election_cong_df, md_region_type_with_cong_df, district_col="district")

In [205]:
print(sc_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       REPRESENTATIVE       PARTY  \
0         1   250901   143877   394778          Andy Harris  Republican   
1         2   106355   224836   331191  Dutch Ruppersberger  Democratic   
2         3   112117   260358   372475        John Sarbanes  Democratic   
3         4    71671   282119   353790     Anthony G. Brown  Democratic   
4         5   123525   274210   397735       Steny H. Hoyer  Democratic   
5         6   143599   215540   359139          David Trone  Democratic   
6         7    92825   237084   329909         Kweisi Mfume  Democratic   
7         8   127157   274716   401873         Jamie Raskin  Democratic   

               RACE AVERAGE_HOUSEHOLD_INCOME PERCENT_BELOW_POVERTY  \
0             White                    96314                    18   
1             White                    86826                    20   
2             White                   110717                    15   
3  African American                   106544

In [206]:
md_election_cong_df['REP_PERCENT'] = (md_election_cong_df['TOT_REP'] / md_election_cong_df['TOT_VOT']) * 100
md_election_cong_df['DEM_PERCENT'] = (md_election_cong_df['TOT_DEM'] / md_election_cong_df['TOT_VOT']) * 100

# Round percentages to integers
md_election_cong_df['REP_PERCENT'] = md_election_cong_df['REP_PERCENT'].round().astype(int)
md_election_cong_df['DEM_PERCENT'] = md_election_cong_df['DEM_PERCENT'].round().astype(int)

# Adjust the last percentage to ensure no rounding error if needed
md_election_cong_df['REP_PERCENT'] += 100 - (md_election_cong_df['REP_PERCENT'] + md_election_cong_df['DEM_PERCENT'])

In [207]:
print(md_election_cong_df)

   district  TOT_REP  TOT_DEM  TOT_VOT       REPRESENTATIVE       PARTY  \
0         1   250901   143877   394778          Andy Harris  Republican   
1         2   106355   224836   331191  Dutch Ruppersberger  Democratic   
2         3   112117   260358   372475        John Sarbanes  Democratic   
3         4    71671   282119   353790     Anthony G. Brown  Democratic   
4         5   123525   274210   397735       Steny H. Hoyer  Democratic   
5         6   143599   215540   359139          David Trone  Democratic   
6         7    92825   237084   329909         Kweisi Mfume  Democratic   
7         8   127157   274716   401873         Jamie Raskin  Democratic   

               RACE AVERAGE_HOUSEHOLD_INCOME PERCENT_BELOW_POVERTY  \
0             White                    96314                    18   
1             White                    86826                    20   
2             White                   110717                    15   
3  African American                   106544

In [236]:
# rename district to CONG_DIST
md_election_cong_df.rename(columns={'district': 'CONG_DIST'}, inplace=True)

In [237]:
## drop TOT_REP  TOT_DEM  TOT_VOT
md_election_cong_df.drop(columns=['TOT_REP', 'TOT_DEM', 'TOT_VOT'], inplace=True)

In [239]:
election_info = "2022 United States House of Representatives"

# Convert DataFrame to JSON and structure it
election_summary = {
    "NAME": "Maryland",
    "election": election_info,
    "data": md_election_cong_df.to_dict(orient='records')  # Convert DataFrame to a list of dictionaries
}

# Save to JSON file with the desired format
with open('states/maryland/congressional_districts/summary/md_congressional_districts_summary.json', 'w') as file:
    json.dump(election_summary, file, indent=4)