In [1]:
import os

import pandas as pd
from census import Census
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from us import states

In [2]:
# Location of the district-level training table, relative to this notebook.
file_path = '../data/project_data.csv'
# Read the whole CSV into memory; later cells derive features and target from it.
data = pd.read_csv(filepath_or_buffer=file_path)


In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,Year,District,Total_Population,18-34,35-64,65 and older,Male_Population,Female_Population,Male_Percentage,Female_Percentage,...,Asian_Percentage,Native_Hawaiian_Percentage,Other_Race_Percentage,Two_or_More_Races_Percentage,Median_Household_Income,Educational_Attainment,Unemployment,Presidential_Winner,Representative_Winner,Turnout
0,2008,Iowa's first,586388,65700,113554,36456,287150,299238,48.969283,51.030717,...,1.125705,0.04468,0.843639,1.736393,46902,511197.6667,19399.66667,D,D,0.7669
1,2012,Iowa's first,761813,86187,147494,50775,375743,386070,49.322209,50.677791,...,1.134793,0.050669,1.066535,1.92672,51678,502929.0,22978.0,D,D,0.7072
2,2016,Iowa's first,769896,88535,146537,55975,381574,388322,49.561759,50.438241,...,1.4915,0.116639,1.096122,1.927793,55366,511564.0,18758.0,R,R,0.702
3,2020,Iowa's first,772873,87156,144053,62285,382939,389934,49.547468,50.452532,...,1.704032,0.187224,0.880352,2.796578,61857,519100.0,16463.0,R,R,0.7381
4,2008,MO 4th,676106,78447,135570,43852,335027,341079,49.552437,50.447563,...,0.909916,0.015826,0.750474,1.901033,43427,473697.3333,28115.33333,R,D,0.7028


In [4]:
# Remove the identifier columns (district, year) and the categorical election
# outcomes; what remains is the numeric predictors plus the target.
data_cleaned = data.drop(
    ['Presidential_Winner', 'Representative_Winner', 'District', 'Year'],
    axis=1,
)

# Target: turnout. Features: every other remaining column.
y = data_cleaned['Turnout']
X = data_cleaned.drop('Turnout', axis=1)


In [5]:
# Learn per-feature mean and variance from the training features, then apply
# the same standardization to them. The fitted scaler is reused later to
# transform the district being predicted.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [6]:
# Base estimator for the search; the fixed seed makes every candidate fit
# reproducible.
mlp = MLPRegressor(max_iter=1000, random_state=42)

# 5 architectures x 2 activations x 2 solvers x 2 schedules x 3 penalties
# = 120 candidates.
param_grid = {
    'hidden_layer_sizes': [(16,), (32,), (64,), (128,), (64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01],
}

# NOTE(review): X_scaled was standardized using all rows before this
# cross-validation, so each validation fold contributed to the scaling
# statistics. Wrapping the scaler and the MLP in a Pipeline would remove that
# leakage, but later cells rely on the standalone `scaler` / `X_scaled`.
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,  # candidate fits are independent, so run them on all cores
    verbose=1,
)
grid_search.fit(X_scaled, y)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# With the default refit=True, GridSearchCV has already trained a clone of
# `mlp` (same max_iter and random_state) with the best parameters on the full
# X_scaled / y, so no separate model needs to be rebuilt and refitted here.
final_model = grid_search.best_estimator_
final_model

Fitting 3 folds for each of 120 candidates, totalling 360 fits
Best Parameters: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (128,), 'learning_rate': 'adaptive', 'solver': 'sgd'}


### Getting current district data to predict 2024 turnout

In [7]:
# The Census API key is read from the environment so the secret never lives in
# the notebook. Export CENSUS_API_KEY before starting the kernel. Any key that
# was previously committed in this cell should be treated as leaked and rotated.
API_KEY = os.environ.get('CENSUS_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key "
        "before running this notebook."
    )
c = Census(API_KEY)

# ACS variable code -> column name used by the model.
_ACS_RENAMES = {
    'NAME': 'District',
    'B01003_001E': 'Total_Population',
    'B02001_002E': 'White_Alone',
    'B02001_003E': 'Black_or_African_American_Alone',
    'B02001_004E': 'American_Indian_and_Alaska_Native_Alone',
    'B02001_005E': 'Asian_Alone',
    'B02001_006E': 'Native_Hawaiian_and_Other_Pacific_Islander_Alone',
    'B02001_007E': 'Some_Other_Race_Alone',
    'B02001_008E': 'Two_or_More_Races',
    'B01001_002E': 'Male_Population',
    'B01001_026E': 'Female_Population',
    'B19013_001E': 'Median_Household_Income',
    'B15003_001E': 'Educational_Attainment',
    'B23025_005E': 'Unemployment',
}

# Age-bucket column -> B01001 codes summed to build it.
# NOTE(review): in the Census "Sex by Age" table (B01001), rows 007-025 are the
# male age brackets (female brackets are 031-049), so these buckets look like
# male-only counts. The training CSV's age columns appear to follow the same
# convention, so the codes are kept unchanged for consistency; confirm before
# changing either side.
_ACS_AGE_CODES = {
    '18-34': ('B01001_007E', 'B01001_008E', 'B01001_009E',
              'B01001_010E', 'B01001_011E', 'B01001_012E'),
    '35-64': ('B01001_013E', 'B01001_014E', 'B01001_015E', 'B01001_016E',
              'B01001_017E', 'B01001_018E', 'B01001_019E'),
    '65 and older': ('B01001_020E', 'B01001_021E', 'B01001_022E',
                     'B01001_023E', 'B01001_024E', 'B01001_025E'),
}

# Percentage column -> count column divided by Total_Population.
_PERCENTAGE_SOURCES = {
    'Male_Percentage': 'Male_Population',
    'Female_Percentage': 'Female_Population',
    'White_Percentage': 'White_Alone',
    'Black_Percentage': 'Black_or_African_American_Alone',
    'American_Indian_Percentage': 'American_Indian_and_Alaska_Native_Alone',
    'Asian_Percentage': 'Asian_Alone',
    'Native_Hawaiian_Percentage': 'Native_Hawaiian_and_Other_Pacific_Islander_Alone',
    'Other_Race_Percentage': 'Some_Other_Race_Alone',
    'Two_or_More_Races_Percentage': 'Two_or_More_Races',
}

# Columns returned to the caller, in this order.
_FEATURE_COLUMNS = [
    'Total_Population', '18-34', '35-64', '65 and older',
    'Male_Population', 'Female_Population',
    'Male_Percentage', 'Female_Percentage',
    'White_Percentage', 'Black_Percentage',
    'American_Indian_Percentage', 'Asian_Percentage',
    'Native_Hawaiian_Percentage', 'Other_Race_Percentage',
    'Two_or_More_Races_Percentage', 'Median_Household_Income',
    'Educational_Attainment', 'Unemployment',
]


def _sum_columns(frame, codes):
    """Add the named columns of ``frame`` element-wise with ``+``."""
    total = frame[codes[0]]
    for code in codes[1:]:
        total = total + frame[code]
    return total


def get_demographics_data(state, district, year):
    """Fetch ACS 5-year estimates for one congressional district as model features.

    Uses the module-level Census client ``c``.

    Parameters
    ----------
    state : str
        State identifier understood by ``us.states.lookup`` (e.g. ``'IA'``).
    district : str
        Congressional district code passed to the Census API (e.g. ``'01'``).
    year : int
        ACS 5-year vintage to query.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the API, holding the 18 feature columns
        listed in ``_FEATURE_COLUMNS``, in that order.

    Raises
    ------
    ValueError
        If ``state`` cannot be resolved, or the API returns no rows for the
        requested state / district / year.
    """
    state_info = states.lookup(state)
    if state_info is None:
        raise ValueError(f"Unknown state: {state!r}")

    age_codes = [code for codes in _ACS_AGE_CODES.values() for code in codes]
    records = c.acs5.state_congressional_district(
        tuple(_ACS_RENAMES) + tuple(age_codes),
        state_info.fips, district, year=year)
    if not records:
        raise ValueError(
            f"No ACS data returned for state={state!r}, "
            f"district={district!r}, year={year!r}"
        )

    df = pd.DataFrame(records).rename(columns=_ACS_RENAMES)

    # Collapse the fine-grained age brackets into the three model buckets.
    for bucket, codes in _ACS_AGE_CODES.items():
        df[bucket] = _sum_columns(df, codes)

    # Express the sex and race counts as a percentage of total population.
    for pct_column, count_column in _PERCENTAGE_SOURCES.items():
        df[pct_column] = (df[count_column] / df['Total_Population']) * 100

    return df[_FEATURE_COLUMNS]


# Most recent demographics for Iowa's 1st district (2022 ACS 5-year vintage).
current_data = get_demographics_data(state='IA', district='01', year=2022)

In [8]:
# Random Forest Regressor
# Seeded so the forest (and therefore its prediction) is identical on every
# Restart & Run All; the seed matches the one used for the MLP.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_scaled, y)





In [9]:
# Predictions
# Standardize the current district with the scaler fitted on the training
# features, so both models see inputs on the scale they were trained on.
current_data_scaled = scaler.transform(current_data)

# Predict turnout for the current district
current_turnout_prediction_NN = final_model.predict(current_data_scaled)
current_turnout_prediction_RF = rf_model.predict(current_data_scaled)

# predict() returns one value per input row; the first row is reported.
print("Neural Network's Predicted Voter Turnout:", current_turnout_prediction_NN[0])
print("Random Forest Regressor's Predicted Voter Turnout:", current_turnout_prediction_RF[0])

Neural Netowrk's Predicted Voter Turnout: 0.7838694626006255
Random Forst Regressor's Predicted Voter Turnout: 0.6951750000000005
