In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import metrics


In [2]:
# Load the merged county-level training data and preview the first five rows.
df = pd.read_csv("merged_train.csv")
df.head()

Unnamed: 0,State,County,FIPS,Total Population,"Percent White, not Hispanic or Latino","Percent Black, not Hispanic or Latino",Percent Hispanic or Latino,Percent Foreign Born,Percent Female,Percent Age 29 and Under,Percent Age 65 and Older,Median Household Income,Percent Unemployed,Percent Less than High School Degree,Percent Less than Bachelor's Degree,Percent Rural,Democratic,Republican,Party
0,AZ,apache,4001,72346,18.571863,0.486551,5.947806,1.719515,50.598513,45.854643,13.322091,32460,15.807433,21.758252,88.941063,74.061076,16298,7810,1
1,AZ,cochise,4003,128177,56.299492,3.714395,34.403208,11.458374,49.069646,37.902276,19.756275,45383,8.567108,13.409171,76.837055,36.301067,17383,26929,0
2,AZ,coconino,4005,138064,54.619597,1.342855,13.711033,4.825298,50.581614,48.946141,10.873943,51106,8.238305,11.085381,65.791439,31.466066,34240,19249,1
3,AZ,gila,4007,53179,63.222325,0.55285,18.548675,4.249798,50.29617,32.23829,26.397638,40593,12.129932,15.729958,82.262624,41.062,7643,12180,0
4,AZ,graham,4009,37529,51.461536,1.811932,32.097844,4.385942,46.313518,46.393456,12.315809,47422,14.424104,14.580797,86.675944,46.437399,3368,6870,0


***Task 1***

In [3]:
# Task 1: holdout split.
# The last three columns (Democratic, Republican, Party) are the targets;
# every column before them is kept as a feature. 23% of rows go to the test set.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :-3],
    df.iloc[:, -3:],
    test_size=0.23,
    random_state=1,
)


In [4]:
# Summarise the training features: column names, dtypes and non-null counts.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 191 to 1061
Data columns (total 16 columns):
State                                    920 non-null object
County                                   920 non-null object
FIPS                                     920 non-null int64
Total Population                         920 non-null int64
Percent White, not Hispanic or Latino    920 non-null float64
Percent Black, not Hispanic or Latino    920 non-null float64
Percent Hispanic or Latino               920 non-null float64
Percent Foreign Born                     920 non-null float64
Percent Female                           920 non-null float64
Percent Age 29 and Under                 920 non-null float64
Percent Age 65 and Older                 920 non-null float64
Median Household Income                  920 non-null int64
Percent Unemployed                       920 non-null float64
Percent Less than High School Degree     920 non-null float64
Percent Less than Bachelor's Deg

In [5]:
# Summarise the test features: column names, dtypes and non-null counts.
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275 entries, 49 to 180
Data columns (total 16 columns):
State                                    275 non-null object
County                                   275 non-null object
FIPS                                     275 non-null int64
Total Population                         275 non-null int64
Percent White, not Hispanic or Latino    275 non-null float64
Percent Black, not Hispanic or Latino    275 non-null float64
Percent Hispanic or Latino               275 non-null float64
Percent Foreign Born                     275 non-null float64
Percent Female                           275 non-null float64
Percent Age 29 and Under                 275 non-null float64
Percent Age 65 and Older                 275 non-null float64
Median Household Income                  275 non-null int64
Percent Unemployed                       275 non-null float64
Percent Less than High School Degree     275 non-null float64
Percent Less than Bachelor's Degre

***Task 2***

In [6]:
# Task 2: standardise the numeric features.
# The first three columns (State, County, FIPS) are identifiers and are skipped.
# The scaler learns its mean/variance from the training rows only and the same
# fitted scaler is then applied to the test rows.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train.iloc[:, 3:])
x_test_scaled = scaler.transform(x_test.iloc[:, 3:])



***Task 3: Regression models***

In [93]:
# Linear regression on all scaled features, predicting Democratic votes.

model = linear_model.LinearRegression().fit(x_train_scaled, y_train['Democratic'])
y_pred = model.predict(x_test_scaled)
r2 = metrics.r2_score(y_test['Democratic'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
# n (test rows) and p (predictors) are read from the test matrix instead of
# being hard-coded, so the value stays correct if the split or features change.
n_obs, n_feat = x_test_scaled.shape
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t",model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)




Coefficients: 	 [65180.35305484 -3445.88365077  -829.88729492 -7093.73389513
  3103.84879685  -226.64749932 -3658.42000593  -856.54320693
   428.57929225   685.6592625   3574.19800722 -9492.84900608
   568.81147654]
Intercept: 	 26459.871739130427
R-squared: 	 0.9066241634739943
Adjusted R-squared: 	 0.9019732597389826


In [94]:
# Linear regression on all scaled features, predicting Republican votes.

model = linear_model.LinearRegression().fit(x_train_scaled, y_train['Republican'])
y_pred = model.predict(x_test_scaled)
r2 = metrics.r2_score(y_test['Republican'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
# n (test rows) and p (predictors) are read from the test matrix instead of
# being hard-coded, so the value stays correct if the split or features change.
n_obs, n_feat = x_test_scaled.shape
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)


Coefficients: 	 [39554.93330223  1936.35120859 -2549.42696013   983.35778642
 -4728.0802429   -628.76439648  -807.93106198  2393.54933962
  5332.70154246  2022.6990234   2935.10256247 -2703.77747028
 -5904.11707453]
Intercept: 	 21058.953260869566
R-squared: 	 0.8204008463950343
Adjusted R-squared: 	 0.8114552946829096


In [95]:
# Lasso regression (alpha=1) on all scaled features, predicting Democratic votes.

model = linear_model.Lasso(alpha=1).fit(x_train_scaled, y_train['Democratic'])
y_pred = model.predict(x_test_scaled)
r2 = metrics.r2_score(y_test['Democratic'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
# n (test rows) and p (predictors) are read from the test matrix instead of
# being hard-coded, so the value stays correct if the split or features change.
n_obs, n_feat = x_test_scaled.shape
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)




Coefficients: 	 [65179.58429607 -3432.21060658  -820.69664388 -7080.49766461
  3104.35235033  -226.200928   -3650.7653268   -850.38104973
   428.74654506   685.67904905  3567.34876125 -9487.66345589
   567.2483701 ]
Intercept: 	 26459.871739130427
R-squared: 	 0.9066358324394637
Adjusted R-squared: 	 0.9019855099172913


In [96]:
# Lasso regression (alpha=1) on all scaled features, predicting Republican votes.

model = linear_model.Lasso(alpha=1).fit(x_train_scaled, y_train['Republican'])
y_pred = model.predict(x_test_scaled)
r2 = metrics.r2_score(y_test['Republican'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
# n (test rows) and p (predictors) are read from the test matrix instead of
# being hard-coded, so the value stays correct if the split or features change.
n_obs, n_feat = x_test_scaled.shape
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)


Coefficients: 	 [39551.13609361  1930.04425597 -2549.39478081   976.47849193
 -4719.38496994  -626.34790664  -808.32303017  2389.73750211
  5328.74147455  2020.21972591  2927.89556845 -2698.59094515
 -5901.28777988]
Intercept: 	 21058.953260869566
R-squared: 	 0.82037207773618
Adjusted R-squared: 	 0.8114250931023499


From the results above, Lasso regression does not set any coefficients to 0, which might indicate that all variables are useful in predicting the result (although with alpha=1 the penalty is very small relative to the size of the coefficients, so this is only weak evidence). Both linear regression and Lasso regression have similar R-squared and adjusted R-squared values. However, we can still experiment with different combinations of variables to see whether any subset performs as well as, or better than, using all variables.

In [97]:
# Lasso regression for Democratic votes using total population, ethnicity,
# education and % rural features only.

# Column positions inside the scaled matrix (identifier columns already dropped):
# 0 = total population, 1-3 = ethnicity, 10-11 = education, 12 = % rural.
cols = [0,1,2,3,10,11,12]

model = linear_model.Lasso(alpha=1).fit(x_train_scaled[:, cols], y_train['Democratic'])
y_pred = model.predict(x_test_scaled[:, cols])
r2 = metrics.r2_score(y_test['Democratic'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n taken from
# the test matrix and p from the selected columns rather than hard-coded.
n_obs = x_test_scaled.shape[0]
n_feat = len(cols)
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)

Coefficients: 	 [ 66468.36469058  -2835.10254871   -340.50358531  -5893.09326654
   3962.74735093 -10090.03936004   1280.20928827]
Intercept: 	 26459.871739130427
R-squared: 	 0.9077410782516582
Adjusted R-squared: 	 0.9053223050223009


In [98]:
# Lasso regression for Republican votes using total population, ethnicity,
# education and % rural features only.

# Column positions inside the scaled matrix (identifier columns already dropped):
# 0 = total population, 1-3 = ethnicity, 10-11 = education, 12 = % rural.
cols = [0,1,2,3,10,11,12]

model = linear_model.Lasso(alpha=1).fit(x_train_scaled[:, cols], y_train['Republican'])
y_pred = model.predict(x_test_scaled[:, cols])
r2 = metrics.r2_score(y_test['Republican'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n taken from
# the test matrix and p from the selected columns rather than hard-coded.
n_obs = x_test_scaled.shape[0]
n_feat = len(cols)
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)

Coefficients: 	 [38416.73437929  3367.89386346 -1646.38789433   268.19841996
   180.58887006 -2912.17794442 -4488.39026087]
Intercept: 	 21058.953260869566
R-squared: 	 0.8040982146474943
Adjusted R-squared: 	 0.79896221278432


In [99]:
# Lasso regression for Democratic votes using total population, ethnicity,
# age and median household income.

# Column positions inside the scaled matrix (identifier columns already dropped):
# 0 = total population, 1-3 = ethnicity, 6-7 = age groups,
# 8 = median household income (note: this is not an age feature).
cols = [0,1,2,3,6,7,8]

model = linear_model.Lasso(alpha=1).fit(x_train_scaled[:, cols], y_train['Democratic'])
y_pred = model.predict(x_test_scaled[:, cols])
r2 = metrics.r2_score(y_test['Democratic'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n taken from
# the test matrix and p from the selected columns rather than hard-coded.
n_obs = x_test_scaled.shape[0]
n_feat = len(cols)
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)

Coefficients: 	 [ 6.78800747e+04 -5.39046680e+03 -7.68959219e+00 -6.14312062e+03
 -3.92120632e+02  1.94163995e+03  5.19953941e+03]
Intercept: 	 26459.871739130434
R-squared: 	 0.9022677349940688
Adjusted R-squared: 	 0.8997054658740631


In [100]:
# Lasso regression for Republican votes using total population, ethnicity,
# age and median household income.

# Column positions inside the scaled matrix (identifier columns already dropped):
# 0 = total population, 1-3 = ethnicity, 6-7 = age groups,
# 8 = median household income (note: this is not an age feature).
cols = [0,1,2,3,6,7,8]

model = linear_model.Lasso(alpha=1).fit(x_train_scaled[:, cols], y_train['Republican'])
y_pred = model.predict(x_test_scaled[:, cols])
r2 = metrics.r2_score(y_test['Republican'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n taken from
# the test matrix and p from the selected columns rather than hard-coded.
n_obs = x_test_scaled.shape[0]
n_feat = len(cols)
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)

Coefficients: 	 [39992.45174544  2189.17010832  -938.00766816   337.82540405
   995.03343305  1506.47451008  5006.53370505]
Intercept: 	 21058.953260869566
R-squared: 	 0.8027955282636687
Adjusted R-squared: 	 0.7976253735739521


In [101]:
# All features except %female, %foreign born and %unemployed
# using Lasso for Democratic votes

# Column positions inside the scaled matrix; indices 4 (% foreign born),
# 5 (% female) and 9 (% unemployed) are the ones left out.
cols = [0,1,2,3,6,7,8,10,11,12]

model = linear_model.Lasso(alpha=1).fit(x_train_scaled[:, cols], y_train['Democratic'])
y_pred = model.predict(x_test_scaled[:, cols])
r2 = metrics.r2_score(y_test['Democratic'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n taken from
# the test matrix and p from the selected columns rather than hard-coded.
n_obs = x_test_scaled.shape[0]
n_feat = len(cols)
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)

Coefficients: 	 [ 6.60304196e+04 -4.49357971e+03 -1.20217111e+03 -6.56464100e+03
 -4.24321391e+03 -1.11765974e+03  6.21279244e+02  4.79205671e+03
 -1.02998143e+04  1.93350324e+01]
Intercept: 	 26459.871739130427
R-squared: 	 0.9088724499515641
Adjusted R-squared: 	 0.9054206488133658


In [102]:
# All features except %female, %foreign born and %unemployed
# using Lasso for Republican votes

# Column positions inside the scaled matrix; indices 4 (% foreign born),
# 5 (% female) and 9 (% unemployed) are the ones left out.
cols = [0,1,2,3,6,7,8,10,11,12]

model = linear_model.Lasso(alpha=1).fit(x_train_scaled[:, cols], y_train['Republican'])
y_pred = model.predict(x_test_scaled[:, cols])
r2 = metrics.r2_score(y_test['Republican'], y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n taken from
# the test matrix and p from the selected columns rather than hard-coded.
n_obs = x_test_scaled.shape[0]
n_feat = len(cols)
radj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print ("Coefficients: \t", model.coef_)
print ("Intercept: \t",model.intercept_)
print ("R-squared: \t",r2)
print("Adjusted R-squared: \t", radj)

Coefficients: 	 [38289.87057722  1918.94147029 -2170.81703375 -1241.08657453
  -851.65073067  2001.10788055  3943.25424507  1761.99202865
 -1144.30127907 -5694.18878151]
Intercept: 	 21058.953260869566
R-squared: 	 0.8025381328054344
Adjusted R-squared: 	 0.7950585166238221


From our results, we can see that using all variables is clearly better than using the various subsets for predicting Republican votes (adjusted R-squared of about 0.811 versus at most 0.799), while for Democratic votes the subsets perform about the same as the full model (adjusted R-squared between about 0.900 and 0.905 versus 0.902). That is, taken together, all variables play a role in the final vote tally. When using all variables, Lasso and linear regression have similar scores; however, it would still be advisable to go with Lasso regression to shrink the coefficients and help the model generalize better.

***Task 4: Classification models***

In [8]:
# Classification model #1: K-NN
# Fits one K-NN classifier per candidate neighbor count on the scaled training
# features and reports per-class test-set metrics for each.
print("Model: K-Nearest Neighbors")
num_neighbors = [3,4,5]
knn_accuracy = {}  # n_neighbors -> test accuracy, used to pick the best model below

# create models for each number of nearest neighbors
for n in num_neighbors:
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(x_train_scaled, y_train['Party'])

    y_pred = classifier.predict(x_test_scaled)

    accuracy = metrics.accuracy_score(y_test['Party'], y_pred)
    error = 1 - accuracy
    precision = metrics.precision_score(y_test['Party'], y_pred, average = None)
    recall = metrics.recall_score(y_test['Party'], y_pred, average = None)
    # Library F1 instead of the manual 2PR/(P+R): the manual form yields nan
    # for a class whose precision and recall are both 0.
    F1_score = metrics.f1_score(y_test['Party'], y_pred, average = None)
    knn_accuracy[n] = accuracy
    print("Number of neighbors: ", n, " ------------")
    print('Accuracy: \t', round(accuracy,3))
    print('Precision: \t', precision)
    print('Recall: \t', recall)
    print('F1 Score: \t', F1_score)

# Report the neighbor count with the highest test accuracy instead of a
# hard-coded value, so the message cannot drift from the measured results.
best_n = max(knn_accuracy, key=knn_accuracy.get)
print('\nBest K-NN model: nn =', best_n)

Model: K-Nearest Neighbors
Number of neighbors:  3  ------------
Accuracy: 	 0.782
Precision: 	 [0.81278539 0.66071429]
Recall: 	 [0.9035533  0.47435897]
F1 Score: 	 [0.85576923 0.55223881]
Number of neighbors:  4  ------------
Accuracy: 	 0.796
Precision: 	 [0.80257511 0.76190476]
Recall: 	 [0.94923858 0.41025641]
F1 Score: 	 [0.86976744 0.53333333]
Number of neighbors:  5  ------------
Accuracy: 	 0.785
Precision: 	 [0.81944444 0.66101695]
Recall: 	 [0.89847716 0.5       ]
F1 Score: 	 [0.85714286 0.56934307]

Best K-NN model: nn = 4


In [34]:
# Classification model #2: Naive Bayes
# Fits a Gaussian Naive Bayes classifier on the scaled training features and
# reports per-class test-set metrics.
print("Model: Naive Bayes")

classifier = GaussianNB()
classifier.fit(x_train_scaled, y_train['Party'])

y_pred = classifier.predict(x_test_scaled)

accuracy = metrics.accuracy_score(y_test['Party'], y_pred)
error = 1 - accuracy
precision = metrics.precision_score(y_test['Party'], y_pred, average = None)
recall = metrics.recall_score(y_test['Party'], y_pred, average = None)
# Library F1 instead of the manual 2PR/(P+R): the manual form yields nan
# for a class whose precision and recall are both 0.
F1_score = metrics.f1_score(y_test['Party'], y_pred, average = None)
print('Accuracy: \t', round(accuracy,3))
print('Precision: \t', precision)
print('Recall: \t', recall)
print('F1 Score: \t', F1_score)

Model: Naive Bayes
Accuracy: 	 0.753
Precision: 	 [0.79723502 0.5862069 ]
Recall: 	 [0.87817259 0.43589744]
F1 Score: 	 [0.83574879 0.5       ]


In [37]:
# Classification model #3: SVM
# Fits one SVC per kernel on the scaled training features and reports
# per-class test-set metrics for each.
print("Model: Support Vector Machines")
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
svm_accuracy = {}  # kernel name -> test accuracy, used to pick the best model below

for k in kernels:
    print("Kernel: ", k)
    classifier = SVC(kernel=k)
    classifier.fit(x_train_scaled, y_train['Party'])

    y_pred = classifier.predict(x_test_scaled)

    accuracy = metrics.accuracy_score(y_test['Party'], y_pred)
    error = 1 - accuracy
    precision = metrics.precision_score(y_test['Party'], y_pred, average = None)
    recall = metrics.recall_score(y_test['Party'], y_pred, average = None)
    # Library F1 instead of the manual 2PR/(P+R): the manual form yields nan
    # for a class whose precision and recall are both 0.
    F1_score = metrics.f1_score(y_test['Party'], y_pred, average = None)
    svm_accuracy[k] = accuracy
    print('Accuracy: \t', round(accuracy,3))
    print('Precision: \t', precision)
    print('Recall: \t', recall)
    print('F1 Score: \t', F1_score)

# Report the kernel with the highest test accuracy instead of a hard-coded
# name, so the message cannot drift from the measured results.
best_kernel = max(svm_accuracy, key=svm_accuracy.get)
print('\nBest SVM model:', best_kernel)

Model: Support Vector Machines
Kernel:  linear
Accuracy: 	 0.789
Precision: 	 [0.79079498 0.77777778]
Recall: 	 [0.95939086 0.35897436]
F1 Score: 	 [0.86697248 0.49122807]
Kernel:  poly
Accuracy: 	 0.778
Precision: 	 [0.77868852 0.77419355]
Recall: 	 [0.96446701 0.30769231]
F1 Score: 	 [0.861678   0.44036697]
Kernel:  rbf
Accuracy: 	 0.815
Precision: 	 [0.8173913 0.8      ]
Recall: 	 [0.95431472 0.46153846]
F1 Score: 	 [0.88056206 0.58536585]
Kernel:  sigmoid
Accuracy: 	 0.662
Precision: 	 [0.73853211 0.36842105]
Recall: 	 [0.81725888 0.26923077]
F1 Score: 	 [0.77590361 0.31111111]

Best SVM model: rbf


In [None]:
# Metrics without feature selection:
                Accuracy    Precision        Recall         F1 Score
KNN - 3         0.782       [0.813 0.661]   [0.904 0.474]   [0.856 0.552]
KNN - 4         0.796       [0.803 0.762]   [0.949 0.410]   [0.870 0.533]
KNN - 5         0.785       [0.819 0.661]   [0.898 0.5  ]   [0.857 0.569]
NB              0.753       [0.797 0.586]   [0.878 0.436]   [0.836 0.5  ]
SVM - linear    0.789       [0.791 0.778]   [0.959 0.359]   [0.867 0.491]
SVM - poly      0.778       [0.779 0.774]   [0.964 0.308]   [0.862 0.440]
SVM - rbf       0.815       [0.817 0.8  ]   [0.954 0.462]   [0.881 0.585]
SVM - sigmoid   0.662       [0.739 0.368]   [0.817 0.269]   [0.776 0.311]

**Task 4 Analysis:**
*Best model:* SVM using 'rbf' as the kernel. It gave us the highest accuracy, precision, and F1-score. To select the parameters of the models, we tried 3 different numbers of nearest neighbors for the K-NN model and all the possible kernels (except for 'precomputed') for the SVM model. In terms of features, we used all 13 scaled numeric features (the 16 columns of `x_train` minus State, County, and FIPS).

***Task 5: Clustering models***

In [None]:
# Task 5

***Task 6: Map***

In [None]:
#Task 6

***Task 7: Prediction***

In [None]:
# Task 7