In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from data_prep import county_info_2016, county_info_2018, results_info, create_targets
from model_prep import run_model
from visualizations import display_results

np.random.seed(2020)

## Data

The demographic information was obtained from the U.S. Cencus website. The 5 year study was used in order to obtain the most accurate information for smaller countues. The results data was obtained from the MIT Election Data and Science Lab.

In [2]:
features_2016 = county_info_2016()
features_2018 = county_info_2018()
results_2016 = results_info(2016)
# results_2012 = results_info(2012)

In [3]:
# results_2016

In [4]:
df = pd.merge(features_2016, results_2016, on='County')

In [5]:
df = create_targets(df)

In [6]:
# Create targets
# df['Percent Republican'] = df['Republican']/df['Total population']
# df['Percent Democrat'] = df['Democrat']/df['Total population']
# df['Percent Other'] = df['Other']/df['Total population']

In [7]:
df.head()

Unnamed: 0,County,Percent Less than 9th grade,"Percent 9th to 12th grade, no diploma",Percent High school graduate,"Percent Some college, no degree",Percent Associate's degree,Percent Bachelor's degree,Percent Graduate or professional degree,Total population,Median age,...,Percent Hispanic or Latino,Households Median income,Households Mean income,State,Republican,Democrat,Other,Percent Republican,Percent Democrat,Percent Other
0,"Wilcox County, Alabama, 2016",6.3,13.8,44.9,17.4,6.3,7.3,4.0,11119,40.2,...,2.6,24442,37636,Alabama,1737,4339,42,0.156219,0.390233,0.00377732
1,"Winston County, Alabama, 2016",9.2,14.8,34.3,20.3,8.2,8.0,5.1,24013,44.1,...,3.8,33896,45481,Alabama,9228,872,213,0.384292,0.0363137,0.0088702
2,"Apache County, Arizona, 2016",11.3,10.5,33.5,26.5,7.2,7.2,3.9,72346,33.5,...,2.1,32460,42584,Arizona,8240,17083,2338,0.113897,0.236129,0.0323169
3,"Cochise County, Arizona, 2016",6.2,7.2,23.2,29.2,11.0,14.8,8.4,128177,40.3,...,2.9,45383,58772,Arizona,28092,17450,4473,0.219166,0.13614,0.0348971
4,"Coconino County, Arizona, 2016",4.2,6.9,21.6,23.7,9.4,19.9,14.3,138064,30.7,...,3.9,51106,69267,Arizona,21108,32404,6272,0.152886,0.234703,0.0454282


In [8]:
y = df['Percent Republican']
dropped_features = ['County', 'State', 'Republican', 'Democrat', 'Other',
                    'Total population', 'Percent Republican',
                    'Percent Democrat', 'Percent Other']
X = df.drop(columns=dropped_features)

In [9]:
# Scale data
scaler = StandardScaler()
# mmscaler = MinMaxScaler()
scaler.fit(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

run_model(linreg, X_train, X_test, y_train, y_test)

Training R^2 : 0.5007163542092132
Training Root Mean Square Error 0.05887295782337749

----------------

Testing R^2 : 0.4992196954893293
Testing Root Mean Square Error 0.06039318774712403


## Evaluate model

In [11]:
cv_results = cross_validate(linreg, X_train, y_train, cv=3)
cv_results['test_score']

array([0.47340205, 0.48222382, 0.49402013])

In [12]:
R_score = linreg.score(X_train, y_train)
R_score_adj = 1 - (1-R_score)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
R_score, R_score_adj

(0.5007163542092132, 0.4966880684072258)

In [13]:
# Run feature selection
rfe = RFECV(LinearRegression(),cv=5)
X_rfe_train = rfe.fit_transform(X_train, y_train)
X_rfe_test = rfe.transform(X_test)
lm = LinearRegression().fit(X_rfe_train, y_train)
run_model(lm, X_rfe_train, X_rfe_test, y_train, y_test)
print ('The optimal number of features is: ', rfe.n_features_)

Training R^2 : 0.5004194112607195
Training Root Mean Square Error 0.05889046221323055

----------------

Testing R^2 : 0.4983977248615823
Testing Root Mean Square Error 0.06044273150173074
The optimal number of features is:  17


In [14]:
# Calculates a Pearson correlation coefficient and the p-value for testing non-correlation.
for column in X_train.columns.tolist():
    print(scipy.stats.pearsonr(X_train[column], y_train), column)

(-0.28181079020950217, 2.4011139641929113e-42) Percent Less than 9th grade
(-0.2260358652191395, 1.8539187422309877e-27) Percent 9th to 12th grade, no diploma
(0.3224882957064082, 1.2904552138944084e-55) Percent High school graduate
(0.2081323932773334, 1.9307173494920278e-23) Percent Some college, no degree
(0.2073429016714844, 2.8489022812289295e-23) Percent Associate's degree
(-0.09358880736573988, 8.723918845290635e-06) Percent Bachelor's degree
(-0.2950135855476507, 2.021395424694845e-46) Percent Graduate or professional degree
(0.5870174126624991, 1.6522454247205585e-208) Median age
(-0.008782749015473122, 0.6771327730251022) Sex ratio (males per 100 females)
(0.5927263139382977, 1.4849130554733257e-213) Male Median age
(0.5471927119330278, 6.05329407713593e-176) Female Median age
(0.058290718385103156, 0.005678563239162909) Percent White
(-0.07272588524788848, 0.000555732972079907) Percent Black or African American
(0.010775052825567627, 0.6094655900693202) Percent American Indi

In [15]:
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

y_train_df = pd.DataFrame(y_train)
train_actual = y_train_df.merge(df['Republican'], left_index=True, right_index=True)
train_actual = train_actual.merge(df['Total population'],
                                  left_index=True, right_index=True)

y_test_df = pd.DataFrame(y_test)
test_actual = y_test_df.merge(df['Republican'], left_index=True, right_index=True)
test_actual = test_actual.merge(df['Total population'],
                                  left_index=True, right_index=True)

In [16]:
train_df = display_results(train_actual, y_hat_train, df['County'])
test_df = display_results(test_actual, y_hat_test, df['County'])

639.5210503309995 639.5210503309996 17.43887510387618
209.32997235653346 212.2582480071143 17.596085114372567


In [17]:
train_df

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
0,"Wilcox County, Alabama, 2016",0.156219,1737,11119,0.254548,-38.6288,38.6288,2830.315598
1,"Winston County, Alabama, 2016",0.384292,9228,24013,0.263344,45.9277,45.9277,6323.681365
2,"Apache County, Arizona, 2016",0.113897,8240,72346,0.220976,-48.4572,48.4572,15986.718882
3,"Cochise County, Arizona, 2016",0.219166,28092,128177,0.261310,-16.128,16.128,33493.898017
5,"Gila County, Arizona, 2016",0.266684,14182,53179,0.317587,-16.0279,16.0279,16888.940083
...,...,...,...,...,...,...,...,...
2992,"Platte County, Wyoming, 2016",0.393249,3437,8740,0.353386,11.2803,11.2803,3088.597305
2994,"Sublette County, Wyoming, 2016",0.339813,3409,10032,0.337185,0.779394,0.779394,3382.635945
2995,"Sweetwater County, Wyoming, 2016",0.271222,12154,44812,0.281473,-3.64184,3.64184,12613.357770
2998,"Washakie County, Wyoming, 2016",0.348581,2911,8351,0.326640,6.71722,6.71722,2727.769737


In [18]:
test_df

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
4,"Coconino County, Arizona, 2016",0.152886,21108,138064,0.162364,-5.83796,5.83796,22416.676828
8,"La Paz County, Arizona, 2016",0.197153,4003,20304,0.363013,-45.6898,45.6898,7370.616974
10,"Mohave County, Arizona, 2016",0.286217,58282,203629,0.356485,-19.7114,19.7114,72590.654559
11,"Navajo County, Arizona, 2016",0.19016,20577,108209,0.226597,-16.0802,16.0802,24519.852584
12,"Pima County, Arizona, 2016",0.166871,167428,1003338,0.215052,-22.4044,22.4044,215769.991302
...,...,...,...,...,...,...,...,...
2984,"Goshen County, Wyoming, 2016",0.326148,4418,13546,0.301920,8.02462,8.02462,4089.808489
2991,"Park County, Wyoming, 2016",0.382182,11115,29083,0.303337,25.9926,25.9926,8821.948514
2993,"Sheridan County, Wyoming, 2016",0.343069,10266,29924,0.296652,15.647,15.647,8877.012783
2996,"Teton County, Wyoming, 2016",0.173319,3921,22623,0.258552,-32.9655,32.9655,5849.230735


In [19]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=3)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [20]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

# Make predictions and evaluate 
y_pred = regressor.predict(X_test)
print('MSE score:', mse(y_test, y_pred))
print('R-sq score:', r2_score(y_test,y_pred))

MSE score: 0.004686905131005424
R-sq score: 0.3564867470519061


In [21]:
y_check = regressor.predict(X_train)

In [22]:
# X_grid = np.arange(min(X), max(X), 0.01)
# X_grid = X_grid.reshape((len(X_grid), 1))
# plt.figure(figsize=(15,6))
# plt.scatter(X, y, color = 'red', label='data')
# plt.plot(X_grid, regressor.predict(X_grid), color = 'green', label='Regression function')
# plt.title('Decision Tree Regression')
# plt.xlabel('Features')
# plt.ylabel('Target')
# plt.legend()
# plt.show()

In [23]:
max(X)

'Sex ratio (males per 100 females)'

In [24]:
train2_df = display_results(train_actual, y_check, df['County'])
test2_df = display_results(test_actual, y_pred, df['County'])

639.5210503309995 639.5210503309996 18.164264831319187
209.32997235653346 211.28739613340971 19.27501234546834


In [26]:
train2_df.head()

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
0,"Wilcox County, Alabama, 2016",0.156219,1737,11119,0.26153,-40.2672,40.2672,2907.951732
1,"Winston County, Alabama, 2016",0.384292,9228,24013,0.308488,24.5726,24.5726,7407.730014
2,"Apache County, Arizona, 2016",0.113897,8240,72346,0.159919,-28.778,28.778,11569.466746
3,"Cochise County, Arizona, 2016",0.219166,28092,128177,0.308488,-28.9549,28.9549,39541.107318
5,"Gila County, Arizona, 2016",0.266684,14182,53179,0.357632,-25.4306,25.4306,19018.515807


In [27]:
test2_df.head()

Unnamed: 0,County,Percent Republican,Republican,Total population,Predictions,Percent off,Percent off Abs,R Votes Prediction
4,"Coconino County, Arizona, 2016",0.152886,21108,138064,0.212979,-28.2155,28.2155,29404.689655
8,"La Paz County, Arizona, 2016",0.197153,4003,20304,0.357632,-44.8726,44.8726,7261.361532
10,"Mohave County, Arizona, 2016",0.286217,58282,203629,0.357632,-19.969,19.969,72824.260614
11,"Navajo County, Arizona, 2016",0.19016,20577,108209,0.26153,-27.2895,27.2895,28299.896481
12,"Pima County, Arizona, 2016",0.166871,167428,1003338,0.211523,-21.1096,21.1096,212228.668926


# 2020 Predictions

In [28]:
features_2020 = features_2018.drop(columns=['County','Total population'])

In [29]:
features_2020.head()

Unnamed: 0,Percent Less than 9th grade,"Percent 9th to 12th grade, no diploma",Percent High school graduate,"Percent Some college, no degree",Percent Associate's degree,Percent Bachelor's degree,Percent Graduate or professional degree,Median age,Sex ratio (males per 100 females),Male Median age,Female Median age,Percent White,Percent Black or African American,Percent American Indian and Alaska Native,Percent Asian,Percent Hispanic or Latino,Households Median income,Households Mean income
0,2.6,8.7,32.6,20.3,8.1,15.9,11.8,37.8,94.9,36.9,38.9,76.9,19.1,0.3,1.0,2.8,58786.0,75515.0
1,2.7,7.0,27.6,22.0,9.4,20.7,10.6,42.8,94.6,41.8,44.1,86.3,9.5,0.7,0.8,4.5,55962.0,77212.0
2,8.2,18.8,35.7,18.1,7.0,7.8,4.4,39.9,113.3,38.1,43.4,47.4,47.6,0.3,0.4,4.3,34186.0,47909.0
3,5.7,11.1,47.3,18.6,5.8,7.6,3.9,39.9,117.1,37.5,43.1,76.7,22.3,0.0,0.2,2.4,45340.0,58529.0
4,7.5,12.4,34.0,21.4,12.0,8.1,4.5,40.8,97.3,40.2,41.7,95.5,1.5,0.2,0.3,9.1,48695.0,60646.0


In [30]:
predictions_2020 = linreg.predict(features_2020)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [44]:
features_2018.isna().sum()

County                                       0
Percent Less than 9th grade                  0
Percent 9th to 12th grade, no diploma        0
Percent High school graduate                 0
Percent Some college, no degree              0
Percent Associate's degree                   0
Percent Bachelor's degree                    0
Percent Graduate or professional degree      0
Total population                             0
Median age                                   0
Sex ratio (males per 100 females)            0
Male Median age                              0
Female Median age                            0
Percent White                                0
Percent Black or African American            0
Percent American Indian and Alaska Native    0
Percent Asian                                0
Percent Hispanic or Latino                   0
Households Median income                     1
Households Mean income                       1
dtype: int64

In [45]:
features_2018[features_2018.isnull().any(axis=1)]

Unnamed: 0,County,Percent Less than 9th grade,"Percent 9th to 12th grade, no diploma",Percent High school graduate,"Percent Some college, no degree",Percent Associate's degree,Percent Bachelor's degree,Percent Graduate or professional degree,Total population,Median age,Sex ratio (males per 100 females),Male Median age,Female Median age,Percent White,Percent Black or African American,Percent American Indian and Alaska Native,Percent Asian,Percent Hispanic or Latino,Households Median income,Households Mean income
1816,"Rio Arriba County, New Mexico",4.8,9.0,31.3,27.9,8.7,12.5,6.0,39307,40.6,96.0,40.9,40.3,54.4,0.6,16.3,0.5,71.4,,


In [None]:
# Rio Arriba County, New Mexico does not have a reproted Household Median or Mean income 

In [69]:
Rio_2016_Median = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Median income']
Rio_2016_Mean = features_2016[features_2016['County'] == 'Rio Arriba County, New Mexico, 2016']['Households Mean income']

In [68]:
Rio_2016_Median

1936    33972
Name: Households Median income, dtype: int64

In [73]:
Rio_2016_Mean

1936    49556
Name: Households Mean income, dtype: int64

In [79]:
features_2018.iloc[1816]['Households Median income'] = Rio_2016_Median
features_2018.iloc[1816]['Households Mean income'] = Rio_2016_Mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [80]:
features_2018.iloc[1816]['Households Median income']

nan