##KNN

Importing libraries

In [3]:
import pandas as pd
import requests
import json
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import statistics
#https://archive.ics.uci.edu/ml/datasets/bank+marketing

Obtaining Data

In [4]:
url = 'https://api.apispreadsheets.com/api/dataset/bank-marketing/'

In [5]:
response = requests.get(url)
dataset = json.loads(response.text)

In [6]:
dataset["data"][0].keys()

dict_keys(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'])

In [7]:
df = pd.json_normalize(dataset["data"]) #json to table
# df = df.rename(columns={"balance": "y"}) For other datasets w/o y as final col name

In [8]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79.0,1,-1.0,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220.0,1,339.0,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185.0,1,330.0,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199.0,4,-1.0,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226.0,1,-1.0,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329.0,5,-1.0,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153.0,1,-1.0,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151.0,11,-1.0,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129.0,4,211.0,3,other,no


Features and labels

In [9]:
feature_df = df.drop(columns=["y"])
y = df['y']
y

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no
...,...
4516,no
4517,no
4518,no
4519,no


In [10]:
X = pd.get_dummies(feature_df)
X

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79.0,1,-1.0,0,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,33,4789,11,220.0,1,339.0,4,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,35,1350,16,185.0,1,330.0,1,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,30,1476,3,199.0,4,-1.0,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
4,59,0,5,226.0,1,-1.0,0,False,True,False,...,False,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,-333,30,329.0,5,-1.0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4517,57,-3313,9,153.0,1,-1.0,0,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4518,57,295,19,151.0,11,-1.0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4519,28,1137,6,129.0,4,211.0,3,False,True,False,...,False,False,False,False,False,False,False,True,False,False


80-20 Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Model

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [12]:
for nn_neighbors in range(30):
  if nn_neighbors == 0:
    continue
  knn = KNeighborsClassifier(n_neighbors=nn_neighbors)
  knn.fit(X_train, y_train)
  y_pred = knn.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  print("Accuracy (%) for kNN with ", nn_neighbors, " nearest neighbors is: ", round(accuracy * 100,2))

Accuracy (%) for kNN with  1  nearest neighbors is:  84.31
Accuracy (%) for kNN with  2  nearest neighbors is:  87.4
Accuracy (%) for kNN with  3  nearest neighbors is:  85.75
Accuracy (%) for kNN with  4  nearest neighbors is:  88.18
Accuracy (%) for kNN with  5  nearest neighbors is:  87.96
Accuracy (%) for kNN with  6  nearest neighbors is:  88.18
Accuracy (%) for kNN with  7  nearest neighbors is:  88.07
Accuracy (%) for kNN with  8  nearest neighbors is:  88.84
Accuracy (%) for kNN with  9  nearest neighbors is:  88.29
Accuracy (%) for kNN with  10  nearest neighbors is:  88.84
Accuracy (%) for kNN with  11  nearest neighbors is:  89.17
Accuracy (%) for kNN with  12  nearest neighbors is:  88.95
Accuracy (%) for kNN with  13  nearest neighbors is:  89.17
Accuracy (%) for kNN with  14  nearest neighbors is:  88.73
Accuracy (%) for kNN with  15  nearest neighbors is:  88.62
Accuracy (%) for kNN with  16  nearest neighbors is:  88.84
Accuracy (%) for kNN with  17  nearest neighbors i

Cross-Validation

In [13]:
from sklearn.model_selection import cross_val_score

for nn_neighbors in range(30):
  if nn_neighbors == 0:
    continue
  knn = KNeighborsClassifier(n_neighbors=nn_neighbors)
  accuracy = cross_val_score(knn, X, y, cv=5)

  print("Accuracy (%) for kNN with ", nn_neighbors, " nearest neighbors is: ",round(statistics.mean(accuracy)*100, 2))

Accuracy (%) for kNN with  1  nearest neighbors is:  85.09
Accuracy (%) for kNN with  2  nearest neighbors is:  87.97
Accuracy (%) for kNN with  3  nearest neighbors is:  86.68
Accuracy (%) for kNN with  4  nearest neighbors is:  87.81
Accuracy (%) for kNN with  5  nearest neighbors is:  87.3
Accuracy (%) for kNN with  6  nearest neighbors is:  87.59
Accuracy (%) for kNN with  7  nearest neighbors is:  87.28
Accuracy (%) for kNN with  8  nearest neighbors is:  87.86
Accuracy (%) for kNN with  9  nearest neighbors is:  87.79
Accuracy (%) for kNN with  10  nearest neighbors is:  88.21
Accuracy (%) for kNN with  11  nearest neighbors is:  88.1
Accuracy (%) for kNN with  12  nearest neighbors is:  88.3
Accuracy (%) for kNN with  13  nearest neighbors is:  88.1
Accuracy (%) for kNN with  14  nearest neighbors is:  88.48
Accuracy (%) for kNN with  15  nearest neighbors is:  88.43
Accuracy (%) for kNN with  16  nearest neighbors is:  88.54
Accuracy (%) for kNN with  17  nearest neighbors is: 

Normalizing a feature

In [14]:
# Scale the training data using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[-0.86564103, -0.41832772,  1.0998847 , ..., -0.21304657,
        -0.17037239,  0.46813499],
       [-0.11376402, -0.41344086, -0.94816038, ..., -0.21304657,
        -0.17037239, -2.13613598],
       [-0.30173327, -0.34535063, -1.06863362, ..., -0.21304657,
        -0.17037239,  0.46813499],
       ...,
       [-0.58368715,  0.45413945,  0.4975185 , ..., -0.21304657,
        -0.17037239,  0.46813499],
       [ 0.35615911, -0.39617396,  0.25657202, ..., -0.21304657,
        -0.17037239,  0.46813499],
       [ 1.29600537,  0.48443798, -0.1048477 , ..., -0.21304657,
        -0.17037239,  0.46813499]])

In [15]:
# Fit a knn model on the scaled data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

In [16]:
# Scale the test data
X_test_scaled = scaler.transform(X_test)
print("accuracy with scaling: ", knn.score(X_test_scaled, y_test)*100)

accuracy with scaling:  88.06629834254144


In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.9027624309392265