# Classification - K Nearest Neighbors

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import neighbors

df_census = pd.read_csv('census1994.csv')
print(df_census);

             Date  Age          WorkClass  fnlwgt      education  \
0       3/20/1994   39          State-gov   77516      Bachelors   
1       1/14/1994   50   Self-emp-not-inc   83311      Bachelors   
2       8/14/1994   38            Private  215646        HS-grad   
3       3/17/1994   53            Private  234721           11th   
4       9/20/1994   28            Private  338409      Bachelors   
5      11/28/1994   37            Private  284582        Masters   
6        3/2/1994   49            Private  160187            9th   
7      11/27/1994   52   Self-emp-not-inc  209642        HS-grad   
8      12/25/1994   31            Private   45781        Masters   
9      10/10/1994   42            Private  159449      Bachelors   
10     10/29/1994   37            Private  280464   Some-college   
11     11/10/1994   30          State-gov  141297      Bachelors   
12       1/4/1994   23            Private  122272      Bachelors   
13      2/15/1994   32            Private  20501

In [3]:
first_five_recs = df_census.head();
print(first_five_recs);

        Date  Age          WorkClass  fnlwgt   education  education-num  \
0  3/20/1994   39          State-gov   77516   Bachelors             13   
1  1/14/1994   50   Self-emp-not-inc   83311   Bachelors             13   
2  8/14/1994   38            Private  215646     HS-grad              9   
3  3/17/1994   53            Private  234721        11th              7   
4  9/20/1994   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race   gender  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country   class  
0 

In [4]:
last_five_recs = df_census.tail();
print(last_five_recs);

             Date  Age      WorkClass  fnlwgt    education  education-num  \
32556  12/29/1994   27        Private  257302   Assoc-acdm             12   
32557   1/30/1994   40        Private  154374      HS-grad              9   
32558   9/26/1994   58        Private  151910      HS-grad              9   
32559    4/5/1994   22        Private  201490      HS-grad              9   
32560    3/5/1994   52   Self-emp-inc  287927      HS-grad              9   

            marital-status          occupation relationship    race   gender  \
32556   Married-civ-spouse        Tech-support         Wife   White   Female   
32557   Married-civ-spouse   Machine-op-inspct      Husband   White     Male   
32558              Widowed        Adm-clerical    Unmarried   White   Female   
32559        Never-married        Adm-clerical    Own-child   White     Male   
32560   Married-civ-spouse     Exec-managerial         Wife   White   Female   

       capital-gain  capital-loss  hours-per-week  nativ

In [5]:
for col in df_census.columns: 
    print(col)

Date
Age
WorkClass
fnlwgt
education
education-num
marital-status
occupation
relationship
race
gender
capital-gain
capital-loss
hours-per-week
native-country
class


In [6]:
#Splitting the dataset 70% for training, and 30% for testing the classifier.

df_c = df_census[['Age', 'education-num', 'class']].copy()
df_c['class']=df_c['class'].map({' <=50K': 0, ' >50K': 1, ' <=50K.': 0, ' >50K.': 1})

X = df_c.iloc[:, 0:3];
print(X);
Y = df_c.iloc[:, 0];
print(Y);
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3);
print (X_train);
print (Y_test);

       Age  education-num  class
0       39             13      0
1       50             13      0
2       38              9      0
3       53              7      0
4       28             13      0
5       37             14      0
6       49              5      0
7       52              9      1
8       31             14      1
9       42             13      1
10      37             10      1
11      30             13      1
12      23             13      0
13      32             12      0
14      40             11      1
15      34              4      0
16      25              9      0
17      32              9      0
18      38              7      0
19      43             14      1
20      40             16      1
21      54              9      0
22      35              5      0
23      43              7      0
24      59              9      0
25      56             13      1
26      19              9      0
27      54             10      1
28      39              9      0
29      49

In [7]:
array = df_c.values
X = array[:,0:16]
Y = array[:,2]
test = SelectKBest(score_func=chi2, k=3)
fit = test.fit(X_train, Y_train)
features = fit.transform(X_train)

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train,Y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [8]:
knn.predict(X_test)[0:5]

array([39, 53, 39, 30, 72], dtype=int64)

In [9]:
knn.score(X_test, Y_test)

0.9915037363087317

In [10]:
knn_cv = KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_cv, X, Y, cv=5)
print(cv_scores)
mean = np.mean(cv_scores)
print('mean:', mean)

[0.99416552 0.995086   0.99401106 0.99554668 0.99662162]
mean: 0.9950861742777912


In [11]:
knn_cv = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(knn_cv, X, Y, cv=5)
print(cv_scores)
mean = np.mean(cv_scores)
print('mean:', mean)

[0.99216951 0.99370393 0.99109337 0.99308968 0.99401106]
mean: 0.9928135083075202


In [15]:
knn_cv = KNeighborsClassifier(n_neighbors=4)
cv_scores = cross_val_score(knn_cv, X, Y, cv=5)
print(cv_scores)
mean = np.mean(cv_scores)
print('mean:', mean)

[0.99155535 0.99324324 0.99047912 0.99355037 0.99401106]
mean: 0.9925678269241145


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

knn2 = neighbors.KNeighborsClassifier()
print('KNN score: %f' % knn2.fit(X_train, Y_train).score(X_test, Y_test))

KNN score: 0.990992


In [17]:
knn.predict(X_test)[0:5]

array([39, 53, 39, 30, 72], dtype=int64)

In [18]:
knn.score(X_test, Y_test)

0.9915037363087317