In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import metrics


In [2]:
# Load the merged county-level training data and preview the first five rows
df = pd.read_csv("merged_train.csv")
df.head()

Unnamed: 0,State,County,FIPS,Total Population,"Percent White, not Hispanic or Latino","Percent Black, not Hispanic or Latino",Percent Hispanic or Latino,Percent Foreign Born,Percent Female,Percent Age 29 and Under,Percent Age 65 and Older,Median Household Income,Percent Unemployed,Percent Less than High School Degree,Percent Less than Bachelor's Degree,Percent Rural,Democratic,Republican,Party
0,AZ,apache,4001,72346,18.571863,0.486551,5.947806,1.719515,50.598513,45.854643,13.322091,32460,15.807433,21.758252,88.941063,74.061076,16298,7810,1
1,AZ,cochise,4003,128177,56.299492,3.714395,34.403208,11.458374,49.069646,37.902276,19.756275,45383,8.567108,13.409171,76.837055,36.301067,17383,26929,0
2,AZ,coconino,4005,138064,54.619597,1.342855,13.711033,4.825298,50.581614,48.946141,10.873943,51106,8.238305,11.085381,65.791439,31.466066,34240,19249,1
3,AZ,gila,4007,53179,63.222325,0.55285,18.548675,4.249798,50.29617,32.23829,26.397638,40593,12.129932,15.729958,82.262624,41.062,7643,12180,0
4,AZ,graham,4009,37529,51.461536,1.811932,32.097844,4.385942,46.313518,46.393456,12.315809,47422,14.424104,14.580797,86.675944,46.437399,3368,6870,0


***Task 1***

In [3]:
# Task 1: hold-out split of the dataset into training and test sets.
# The last three columns (Democratic, Republican, Party) are the targets;
# every column before them is kept on the feature side.
features = df.iloc[:, :-3]
targets = df.iloc[:, -3:]

x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.23,
    random_state=1,  # fixed seed so the split is reproducible
)


In [4]:
# Sanity-check the training features: column names, dtypes, and non-null counts
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 191 to 1061
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   State                                  920 non-null    object 
 1   County                                 920 non-null    object 
 2   FIPS                                   920 non-null    int64  
 3   Total Population                       920 non-null    int64  
 4   Percent White, not Hispanic or Latino  920 non-null    float64
 5   Percent Black, not Hispanic or Latino  920 non-null    float64
 6   Percent Hispanic or Latino             920 non-null    float64
 7   Percent Foreign Born                   920 non-null    float64
 8   Percent Female                         920 non-null    float64
 9   Percent Age 29 and Under               920 non-null    float64
 10  Percent Age 65 and Older               920 non-null    float64
 11  Med

In [5]:
# Sanity-check the test features: column names, dtypes, and non-null counts
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275 entries, 49 to 180
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   State                                  275 non-null    object 
 1   County                                 275 non-null    object 
 2   FIPS                                   275 non-null    int64  
 3   Total Population                       275 non-null    int64  
 4   Percent White, not Hispanic or Latino  275 non-null    float64
 5   Percent Black, not Hispanic or Latino  275 non-null    float64
 6   Percent Hispanic or Latino             275 non-null    float64
 7   Percent Foreign Born                   275 non-null    float64
 8   Percent Female                         275 non-null    float64
 9   Percent Age 29 and Under               275 non-null    float64
 10  Percent Age 65 and Older               275 non-null    float64
 11  Media

***Task 2***

In [6]:
# Task 2: standardize the x_train and x_test sets.
# Columns 0-2 (State, County, FIPS) are identifiers, so only the columns from
# position 3 onward are scaled.
train_numeric = x_train.iloc[:, 3:]
test_numeric = x_test.iloc[:, 3:]

# Fit on the training split only, then apply the same transform to both
# splits so no test-set statistics leak into the scaler.
scaler = StandardScaler()
scaler.fit(train_numeric)
x_train_scaled = scaler.transform(train_numeric)
x_test_scaled = scaler.transform(test_numeric)



***Task 4: Classification models***

In [33]:
# Classification model #1: K-NN
# Fits one K-NN classifier per candidate number of neighbors on the scaled
# training features, prints hold-out metrics for each, and finally reports the
# candidate with the highest test accuracy.
print("Model: K-Nearest Neighbors")
num_neighbors = [1,3,5]
knn_accuracy = {}  # n_neighbors -> hold-out accuracy, used to pick the best model

# create models for each number of nearest neighbors
for n in num_neighbors:
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(x_train_scaled, y_train['Party'])

    y_pred = classifier.predict(x_test_scaled)

    accuracy = metrics.accuracy_score(y_test['Party'], y_pred)
    # average=None returns one score per class instead of a single average
    precision = metrics.precision_score(y_test['Party'], y_pred, average = None)
    recall = metrics.recall_score(y_test['Party'], y_pred, average = None)
    # Use the library F1 rather than 2PR/(P+R) by hand: the manual formula
    # evaluates 0/0 (nan + RuntimeWarning) for a class with zero precision
    # and zero recall.
    F1_score = metrics.f1_score(y_test['Party'], y_pred, average = None)
    knn_accuracy[n] = accuracy

    print("Number of neighbors: ", n, " ------------")
    print('Accuracy: \t', round(accuracy,3))
    print('Precision: \t', precision)
    print('Recall: \t', recall)
    print('F1 Score: \t', F1_score)

# Derive the winner from the measured accuracies instead of hard-coding it,
# so the message stays correct if the data or candidate list changes.
# On ties, the first (smallest) candidate wins.
best_n = max(knn_accuracy, key=knn_accuracy.get)
print('\nBest K-NN model: nn =', best_n)

Model: K-Nearest Neighbors
Number of neighbors:  1  ------------
Accuracy: 	 0.76
Precision: 	 [0.82587065 0.58108108]
Recall: 	 [0.84263959 0.55128205]
F1 Score: 	 [0.83417085 0.56578947]
Number of neighbors:  3  ------------
Accuracy: 	 0.782
Precision: 	 [0.81278539 0.66071429]
Recall: 	 [0.9035533  0.47435897]
F1 Score: 	 [0.85576923 0.55223881]
Number of neighbors:  5  ------------
Accuracy: 	 0.785
Precision: 	 [0.81944444 0.66101695]
Recall: 	 [0.89847716 0.5       ]
F1 Score: 	 [0.85714286 0.56934307]

Best K-NN model: nn = 5


In [34]:
# Classification model #2: Naive Bayes
# Fits a Gaussian Naive Bayes classifier on the scaled training features and
# prints its hold-out accuracy plus per-class precision, recall, and F1.
print("Model: Naive Bayes")

classifier = GaussianNB()
classifier.fit(x_train_scaled, y_train['Party'])

y_pred = classifier.predict(x_test_scaled)

accuracy = metrics.accuracy_score(y_test['Party'], y_pred)
# average=None returns one score per class instead of a single average
precision = metrics.precision_score(y_test['Party'], y_pred, average = None)
recall = metrics.recall_score(y_test['Party'], y_pred, average = None)
# Use the library F1 rather than 2PR/(P+R) by hand: the manual formula
# evaluates 0/0 (nan + RuntimeWarning) for a class with zero precision and
# zero recall.
F1_score = metrics.f1_score(y_test['Party'], y_pred, average = None)
print('Accuracy: \t', round(accuracy,3))
print('Precision: \t', precision)
print('Recall: \t', recall)
print('F1 Score: \t', F1_score)

Model: Naive Bayes
Accuracy: 	 0.753
Precision: 	 [0.79723502 0.5862069 ]
Recall: 	 [0.87817259 0.43589744]
F1 Score: 	 [0.83574879 0.5       ]


In [37]:
# Classification model #3: SVM
# Fits one support vector classifier per kernel on the scaled training
# features, prints hold-out metrics for each, and finally reports the kernel
# with the highest test accuracy.
print("Model: Support Vector Machines")
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
svm_accuracy = {}  # kernel name -> hold-out accuracy, used to pick the best model

for k in kernels:
    print("Kernel: ", k)
    classifier = SVC(kernel=k)
    classifier.fit(x_train_scaled, y_train['Party'])

    y_pred = classifier.predict(x_test_scaled)

    accuracy = metrics.accuracy_score(y_test['Party'], y_pred)
    # average=None returns one score per class instead of a single average
    precision = metrics.precision_score(y_test['Party'], y_pred, average = None)
    recall = metrics.recall_score(y_test['Party'], y_pred, average = None)
    # Use the library F1 rather than 2PR/(P+R) by hand: the manual formula
    # evaluates 0/0 (nan + RuntimeWarning) for a class with zero precision
    # and zero recall.
    F1_score = metrics.f1_score(y_test['Party'], y_pred, average = None)
    svm_accuracy[k] = accuracy

    print('Accuracy: \t', round(accuracy,3))
    print('Precision: \t', precision)
    print('Recall: \t', recall)
    print('F1 Score: \t', F1_score)

# Derive the winner from the measured accuracies instead of hard-coding it,
# so the message stays correct if the data or kernel list changes.
# On ties, the kernel listed first wins.
best_kernel = max(svm_accuracy, key=svm_accuracy.get)
print('\nBest SVM model:', best_kernel)

Model: Support Vector Machines
Kernel:  linear
Accuracy: 	 0.789
Precision: 	 [0.79079498 0.77777778]
Recall: 	 [0.95939086 0.35897436]
F1 Score: 	 [0.86697248 0.49122807]
Kernel:  poly
Accuracy: 	 0.778
Precision: 	 [0.77868852 0.77419355]
Recall: 	 [0.96446701 0.30769231]
F1 Score: 	 [0.861678   0.44036697]
Kernel:  rbf
Accuracy: 	 0.815
Precision: 	 [0.8173913 0.8      ]
Recall: 	 [0.95431472 0.46153846]
F1 Score: 	 [0.88056206 0.58536585]
Kernel:  sigmoid
Accuracy: 	 0.662
Precision: 	 [0.73853211 0.36842105]
Recall: 	 [0.81725888 0.26923077]
F1 Score: 	 [0.77590361 0.31111111]

Best SVM model: rbf


In [None]:
# Metrics without feature selection:
                Accuracy    Precision        Recall         F1 Score
KNN - 1         0.76        [0.826 0.581]   [0.843 0.551]   [0.834 0.566]
KNN - 3         0.782       [0.813 0.661]   [0.904 0.474]   [0.856 0.552]
KNN - 5         0.785       [0.819 0.661]   [0.898 0.5  ]   [0.857 0.569]
NB              0.753       [0.797 0.586]   [0.878 0.436]   [0.836 0.5  ]
SVM - linear    0.789       [0.791 0.778]   [0.959 0.359]   [0.867 0.491]
SVM - poly      0.778       [0.779 0.774]   [0.964 0.308]   [0.862 0.440]
SVM - rbf       0.815       [0.817 0.8  ]   [0.954 0.462]   [0.881 0.585]
SVM - sigmoid   0.662       [0.739 0.368]   [0.817 0.269]   [0.776 0.311]

**Task 4 Analysis:**
*Best model:* SVM using 'rbf' as the kernel. It gave us the highest accuracy, precision, and F1-score. To select the parameters of the models, we chose 3 different numbers of nearest neighbors for the K-NN model and all the possible kernels (except for 'precomputed') for the SVM model. In terms of features, we used all 13 numeric features, i.e. every column except the State, County, and FIPS identifiers, with no feature selection (!!!)

***Task 5: Clustering models***

In [None]:
# Task 5

***Task 6: Map***

In [None]:
#Task 6

***Task 7: Prediction***

In [None]:
# Task 7