In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from data_prep import county_info_2016, county_info_2018, results_info, create_targets
from model_prep import run_model
from visualizations import display_results

np.random.seed(2020)

## Data

The demographic information was obtained from the U.S. Cencus website. The 5 year study was used in order to obtain the most accurate information for smaller countues. The results data was obtained from the MIT Election Data and Science Lab.

In [2]:
features_2016 = county_info_2016()
features_2018 = county_info_2018()
results_2016 = results_info(2016)
# results_2012 = results_info(2012)

In [3]:
df = pd.merge(features_2016, results_2016, on='County')
df = create_targets(df)

## Methedology

In [4]:
y_R = df['Percent Republican']
y_D = df['Percent Democrat']
y_O = df['Percent Republican']

dropped_features = ['County', 'State', 'Republican', 'Democrat', 'Other',
                    'Total population', 'Percent Republican',
                    'Percent Democrat', 'Percent Other']

X = df.drop(columns=dropped_features)

In [5]:
X_train, X_test, y_train_R, y_test_R = train_test_split(X, y_R, random_state=2020)
X_train, X_test, y_train_D, y_test_D = train_test_split(X, y_D, random_state=2020)
X_train, X_test, y_train_O, y_test_O = train_test_split(X, y_O, random_state=2020)

In [6]:
# Scale data
scaler = StandardScaler()
# # mmscaler = MinMaxScaler()
scaler.fit(X_train)
# X_train, X_test, y_train, y_test = train_test_split(X, y_R)

targets = [y_R, y_D, y_O]

# for target in targets:

# linreg = LinearRegression()
# linreg.fit(X_train, y_train)

# run_model(linreg, X_train, X_test, y_train, y_test)

NameError: name 'y_train' is not defined

In [7]:
from sklearn.pipeline import Pipeline

pipe_scl_lin = Pipeline([('scl', StandardScaler()), ('lin', LinearRegression())])
pipe_mms_lin = Pipeline([('mms', MinMaxScaler()), ('lin', LinearRegression())])

pipelines = [pipe_scl_lin, pipe_mms_lin]
pipeline_names = ['Standard Scalar Linear','Min Max Linear']

for target in targets:
    for pipe in pipelines:
        print(pipe)
        pipe.fit(target, X_train, y_train)
    
# Compare accuracies
for index, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipeline_names[index], val.score(X_test, y_test)))

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lin',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)
Pipeline(memory=None,
         steps=[('mms', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('lin',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)
Standard Scalar Linear pipeline test accuracy: 0.499
Min Max Linear pipeline test accuracy: 0.499


## Evaluate model

In [7]:
cv_results = cross_validate(linreg, X_train, y_train, cv=3)
cv_results['test_score']

array([0.47340205, 0.48222382, 0.49402013])

In [8]:
R_score = linreg.score(X_train, y_train)
R_score_adj = 1 - (1-R_score)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
R_score, R_score_adj

(0.5007163542092132, 0.4966880684072258)

In [9]:
# Run feature selection
rfe = RFECV(LinearRegression(),cv=5)
X_rfe_train = rfe.fit_transform(X_train, y_train)
X_rfe_test = rfe.transform(X_test)
lm = LinearRegression().fit(X_rfe_train, y_train)
run_model(lm, X_rfe_train, X_rfe_test, y_train, y_test)
print ('The optimal number of features is: ', rfe.n_features_)

Training R^2 : 0.5004194112607195
Training Root Mean Square Error 0.05889046221323055

----------------

Testing R^2 : 0.4983977248615823
Testing Root Mean Square Error 0.06044273150173074
The optimal number of features is:  17


In [10]:
# Calculates a Pearson correlation coefficient and the p-value for testing non-correlation.
for column in X_train.columns.tolist():
    print(scipy.stats.pearsonr(X_train[column], y_train), column)

(-0.28181079020950217, 2.4011139641929113e-42) Percent Less than 9th grade
(-0.2260358652191395, 1.8539187422309877e-27) Percent 9th to 12th grade, no diploma
(0.3224882957064082, 1.2904552138944084e-55) Percent High school graduate
(0.2081323932773334, 1.9307173494920278e-23) Percent Some college, no degree
(0.2073429016714844, 2.8489022812289295e-23) Percent Associate's degree
(-0.09358880736573988, 8.723918845290635e-06) Percent Bachelor's degree
(-0.2950135855476507, 2.021395424694845e-46) Percent Graduate or professional degree
(0.5870174126624991, 1.6522454247205585e-208) Median age
(-0.008782749015473122, 0.6771327730251022) Sex ratio (males per 100 females)
(0.5927263139382977, 1.4849130554733257e-213) Male Median age
(0.5471927119330278, 6.05329407713593e-176) Female Median age
(0.058290718385103156, 0.005678563239162909) Percent White
(-0.07272588524788848, 0.000555732972079907) Percent Black or African American
(0.010775052825567627, 0.6094655900693202) Percent American Indi

In [11]:
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

y_train_df = pd.DataFrame(y_train)
train_actual = y_train_df.merge(df['Republican'], left_index=True, right_index=True)
train_actual = train_actual.merge(df['Total population'],
                                  left_index=True, right_index=True)

y_test_df = pd.DataFrame(y_test)
test_actual = y_test_df.merge(df['Republican'], left_index=True, right_index=True)
test_actual = test_actual.merge(df['Total population'],
                                  left_index=True, right_index=True)

In [12]:
train_df = display_results(train_actual, y_hat_train, df['County'])
test_df = display_results(test_actual, y_hat_test, df['County'])

639.5210503309995 639.5210503309996 17.43887510387618
209.32997235653346 212.2582480071143 17.596085114372567


In [13]:
train_df.head()

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
0,"Wilcox County, Alabama, 2016",0.156219,1737,11119,0.254548,-38.6288,38.6288,2830.315598
1,"Winston County, Alabama, 2016",0.384292,9228,24013,0.263344,45.9277,45.9277,6323.681365
2,"Apache County, Arizona, 2016",0.113897,8240,72346,0.220976,-48.4572,48.4572,15986.718882
3,"Cochise County, Arizona, 2016",0.219166,28092,128177,0.26131,-16.128,16.128,33493.898017
5,"Gila County, Arizona, 2016",0.266684,14182,53179,0.317587,-16.0279,16.0279,16888.940083


In [14]:
test_df.head()

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
4,"Coconino County, Arizona, 2016",0.152886,21108,138064,0.162364,-5.83796,5.83796,22416.676828
8,"La Paz County, Arizona, 2016",0.197153,4003,20304,0.363013,-45.6898,45.6898,7370.616974
10,"Mohave County, Arizona, 2016",0.286217,58282,203629,0.356485,-19.7114,19.7114,72590.654559
11,"Navajo County, Arizona, 2016",0.19016,20577,108209,0.226597,-16.0802,16.0802,24519.852584
12,"Pima County, Arizona, 2016",0.166871,167428,1003338,0.215052,-22.4044,22.4044,215769.991302


In [15]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=3)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [16]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

# Make predictions and evaluate 
y_pred = regressor.predict(X_test)
print('MSE score:', mse(y_test, y_pred))
print('R-sq score:', r2_score(y_test,y_pred))

MSE score: 0.004686905131005424
R-sq score: 0.3564867470519061


In [17]:
y_check = regressor.predict(X_train)

In [18]:
# X_grid = np.arange(min(X), max(X), 0.01)
# X_grid = X_grid.reshape((len(X_grid), 1))
# plt.figure(figsize=(15,6))
# plt.scatter(X, y, color = 'red', label='data')
# plt.plot(X_grid, regressor.predict(X_grid), color = 'green', label='Regression function')
# plt.title('Decision Tree Regression')
# plt.xlabel('Features')
# plt.ylabel('Target')
# plt.legend()
# plt.show()

In [19]:
max(X)

'Sex ratio (males per 100 females)'

In [20]:
train2_df = display_results(train_actual, y_check, df['County'])
test2_df = display_results(test_actual, y_pred, df['County'])

639.5210503309995 639.5210503309996 18.164264831319187
209.32997235653346 211.28739613340971 19.27501234546834


In [21]:
train2_df.head()

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
0,"Wilcox County, Alabama, 2016",0.156219,1737,11119,0.26153,-40.2672,40.2672,2907.951732
1,"Winston County, Alabama, 2016",0.384292,9228,24013,0.308488,24.5726,24.5726,7407.730014
2,"Apache County, Arizona, 2016",0.113897,8240,72346,0.159919,-28.778,28.778,11569.466746
3,"Cochise County, Arizona, 2016",0.219166,28092,128177,0.308488,-28.9549,28.9549,39541.107318
5,"Gila County, Arizona, 2016",0.266684,14182,53179,0.357632,-25.4306,25.4306,19018.515807


In [22]:
test2_df.head()

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
4,"Coconino County, Arizona, 2016",0.152886,21108,138064,0.212979,-28.2155,28.2155,29404.689655
8,"La Paz County, Arizona, 2016",0.197153,4003,20304,0.357632,-44.8726,44.8726,7261.361532
10,"Mohave County, Arizona, 2016",0.286217,58282,203629,0.357632,-19.969,19.969,72824.260614
11,"Navajo County, Arizona, 2016",0.19016,20577,108209,0.26153,-27.2895,27.2895,28299.896481
12,"Pima County, Arizona, 2016",0.166871,167428,1003338,0.211523,-21.1096,21.1096,212228.668926


# 2020 Predictions

In [23]:
# Rio Arriba County, New Mexico does not have a reproted Household Median or Mean income 
Rio_2016_Median = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Median income']
Rio_2016_Mean = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Mean income']

In [24]:
# Rio Arriba County, New Mexico is missing mean and median income info. Replacing with 2016 info.
values = {'Households Median income': Rio_2016_Median.values[0],
          'Households Mean income': Rio_2016_Mean.values[0]}

features_2018 = features_2018.fillna(value=values)
features_2020 = features_2018.drop(columns=['County','Total population'])

In [25]:
predictions_2020 = linreg.predict(features_2020)

In [26]:
R_2020_percent = pd.DataFrame(predictions_2020)

In [27]:
_ = features_2018['County'].str.split(',', expand=True)
_ = _.rename(columns={1: 'State'})

In [28]:
results_2020 = pd.DataFrame.merge(features_2018['County'],
                                  _['State'],
                                  left_index=True, right_index=True)

results_2020 = results_2020.merge(features_2018['Total population'],
                                  left_index=True, right_index=True)

results_2020 = results_2020.merge(R_2020_percent, left_index=True, right_index=True)
results_2020 = results_2020.rename(columns={0: 'R Percent'})
results_2020['R Votes'] = (results_2020['R Percent'] * results_2020['Total population']).round()

In [29]:
results_2020

Unnamed: 0,County,State,Total population,R Percent,R Votes
0,"Autauga County, Alabama",Alabama,55200,0.235508,13000.0
1,"Baldwin County, Alabama",Alabama,208107,0.282138,58715.0
2,"Barbour County, Alabama",Alabama,25782,0.206376,5321.0
3,"Bibb County, Alabama",Alabama,22527,0.285384,6429.0
4,"Blount County, Alabama",Alabama,57645,0.271991,15679.0
...,...,...,...,...,...
3215,"Vega Baja Municipio, Puerto Rico",Puerto Rico,53371,0.206291,11010.0
3216,"Vieques Municipio, Puerto Rico",Puerto Rico,8771,0.272371,2389.0
3217,"Villalba Municipio, Puerto Rico",Puerto Rico,22993,0.266773,6134.0
3218,"Yabucoa Municipio, Puerto Rico",Puerto Rico,34149,0.255644,8730.0
