In [15]:
from pydataset import data


import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [10]:


data('voteincome', show_doc=True)

#data('voteincome')

voteincome

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Sample Turnout and Demographic Data from the 2000 Current Population Survey

### Description

This data set contains turnout and demographic data from a sample of
respondents to the 2000 Current Population Survey (CPS). The states
represented are South Carolina and Arkansas. The data represent only a sample
and results from this example should not be used in publication.

### Usage

    data(voteincome)

### Format

A data frame containing 7 variables ("state", "year", "vote", "income",
"education", "age", "female") and 1500 observations.

`state`

a factor variable with levels equal to "AR" (Arkansas) and "SC" (South
Carolina)

`year`

an integer vector

`vote`

an integer vector taking on values "1" (Voted) and "0" (Did Not Vote)

`income`

an integer vector ranging from "4" (Less than \$5000) to "17" (Greater than
\$75000) denoting family income. See the CPS codebook for more info

In [11]:
df = data('voteincome')

In [12]:
df = df.drop(columns = ['state', 'year'])

In [13]:
df

Unnamed: 0,vote,income,education,age,female
1,1,9,2,73,0
2,1,11,2,24,0
3,0,12,2,24,1
4,1,16,4,40,0
5,1,10,4,85,1
6,1,12,3,78,1
7,0,14,4,31,0
8,1,10,1,75,0
9,1,17,2,54,0
10,1,8,1,78,0


In [16]:
df.dropna(inplace=True) # handle missing age values

X = df[['income','education','age','female']]
y = df[['vote']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()


Unnamed: 0,income,education,age,female
892,17,4,38,1
1386,15,3,47,1
1187,17,2,85,1
490,11,3,76,1
1237,12,2,19,0


## K = 4

In [17]:
knn = KNeighborsClassifier(n_neighbors=4, weights='uniform')

In [18]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [19]:
y_pred = knn.predict(X_train)

In [20]:
y_pred_proba = knn.predict_proba(X_train)

In [21]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.88


In [22]:
print(confusion_matrix(y_train, y_pred))

[[ 99  55]
 [ 69 827]]


In [24]:
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

              precision    recall  f1-score   support

           0       0.59      0.64      0.61       154
           1       0.94      0.92      0.93       896

    accuracy                           0.88      1050
   macro avg       0.76      0.78      0.77      1050
weighted avg       0.89      0.88      0.88      1050

Accuracy of KNN classifier on test set: 0.81


## K = 3

In [26]:
knn3 = KNeighborsClassifier(n_neighbors=3, weights='uniform')

In [27]:
knn3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [28]:
y_pred = knn3.predict(X_train)
y_pred_proba = knn3.predict_proba(X_train)

In [29]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn3.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.91


In [30]:
print(confusion_matrix(y_train, y_pred))

[[ 87  67]
 [ 24 872]]


In [38]:
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn3.score(X_test, y_test)))

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       154
           1       1.00      0.92      0.96       896

    accuracy                           0.93      1050
   macro avg       0.84      0.96      0.88      1050
weighted avg       0.95      0.93      0.93      1050

Accuracy of KNN classifier on test set: 0.82


## K = 2

In [32]:
knn2 = KNeighborsClassifier(n_neighbors=2, weights='uniform')

In [33]:
knn2.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [34]:
y_pred = knn2.predict(X_train)
y_pred_proba = knn2.predict_proba(X_train)

In [35]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn2.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.93


In [36]:
print(confusion_matrix(y_train, y_pred))

[[153   1]
 [ 74 822]]


In [39]:
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn2.score(X_test, y_test)))

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       154
           1       1.00      0.92      0.96       896

    accuracy                           0.93      1050
   macro avg       0.84      0.96      0.88      1050
weighted avg       0.95      0.93      0.93      1050

Accuracy of KNN classifier on test set: 0.79


## K = 1

In [40]:
knn1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')

In [41]:
knn1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [42]:
y_pred = knn1.predict(X_train)
y_pred_proba = knn1.predict_proba(X_train)

In [43]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn1.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))

Accuracy of KNN classifier on training set: 0.98
[[144  10]
 [ 13 883]]


In [44]:
print(classification_report(y_train, y_pred))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn1.score(X_test, y_test)))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       154
           1       0.99      0.99      0.99       896

    accuracy                           0.98      1050
   macro avg       0.95      0.96      0.96      1050
weighted avg       0.98      0.98      0.98      1050

Accuracy of KNN classifier on test set: 0.84


In [48]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       154
           1       0.99      0.99      0.99       896

    accuracy                           0.98      1050
   macro avg       0.95      0.96      0.96      1050
weighted avg       0.98      0.98      0.98      1050



In [50]:
from sklearn import __version__
__version__

'0.21.2'