# Importing libraries

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Import the dataset

Link: https://raw.githubusercontent.com/omairaasim/machine_learning/master/project_11_k_nearest_neighbor/iphone_purchase_records.csv

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/omairaasim/machine_learning/master/project_11_k_nearest_neighbor/iphone_purchase_records.csv')

In [5]:
df.head()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [6]:
df.shape

(400, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   Salary           400 non-null    int64 
 3   Purchase Iphone  400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


In [8]:
df.describe()

Unnamed: 0,Age,Salary,Purchase Iphone
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [9]:
df.Gender.value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [10]:
df.loc[df['Purchase Iphone']==1,"Gender"].value_counts()

Female    77
Male      66
Name: Gender, dtype: int64

# Spliting of data

In [13]:
df.head(2)

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0


In [14]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

# Label Encoding

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
enc = LabelEncoder()

In [20]:
X.Gender = enc.fit_transform(X.Gender)

In [21]:
X

Unnamed: 0,Gender,Age,Salary
0,1,19,19000
1,1,35,20000
2,0,26,43000
3,0,27,57000
4,1,19,76000
...,...,...,...
395,0,46,41000
396,1,51,23000
397,0,50,20000
398,1,36,33000


In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Gender  400 non-null    int64
 1   Age     400 non-null    int64
 2   Salary  400 non-null    int64
dtypes: int64(3)
memory usage: 9.5 KB


# Spliting the data into sets

In [23]:
skf = StratifiedKFold(n_splits=5)

In [45]:
for train_index,test_index in skf.split(X,y):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Feature Scaling

In [46]:
scale = StandardScaler()

In [47]:
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)

# Model selection

In [62]:
log = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)

# Training the model

In [63]:
log.fit(X_train, y_train)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# Test the model

In [71]:
y_log_pred = log.predict(X_test)

In [64]:
y_knn_pred = knn.predict(X_test)

In [65]:
newdf = pd.DataFrame({"Actual":y_test, "Predicted":y_knn_pred})

In [66]:
newdf.head()

Unnamed: 0,Actual,Predicted
266,0,0
267,0,0
269,0,0
270,0,1
276,0,0


In [67]:
confusion_matrix(y_test, y_knn_pred)

array([[46,  5],
       [ 7, 22]])

In [None]:
Accuracy = (46+22)/80 = 68/80 = 86%

In [57]:
from sklearn.metrics import accuracy_score

In [72]:
accuracy_score(y_test, y_log_pred)

0.725

In [68]:
lis = [i for i in range(2,101) if i%2==0]

In [69]:
acc=[]
dic = {}
for i in lis:
  knn = KNeighborsClassifier(n_neighbors=i)
  knn.fit(X_train, y_train)
  y_knn_pred = knn.predict(X_test)
  acc.append(accuracy_score(y_test,y_knn_pred))
  # dic[i] = accuracy_score(y_test,y_knn_pred)

print(max(acc))
# 0.8875 = 89%
# 0.875 = 88%

0.875


In [70]:
dic

{2: 0.8125,
 4: 0.85,
 6: 0.85,
 8: 0.85,
 10: 0.875,
 12: 0.8625,
 14: 0.85,
 16: 0.85,
 18: 0.8,
 20: 0.775,
 22: 0.7625,
 24: 0.7375,
 26: 0.7375,
 28: 0.7375,
 30: 0.7375,
 32: 0.7375,
 34: 0.7375,
 36: 0.7375,
 38: 0.7375,
 40: 0.7375,
 42: 0.7375,
 44: 0.7375,
 46: 0.7375,
 48: 0.7375,
 50: 0.7375,
 52: 0.7375,
 54: 0.7375,
 56: 0.725,
 58: 0.7375,
 60: 0.7375,
 62: 0.725,
 64: 0.725,
 66: 0.7125,
 68: 0.7125,
 70: 0.725,
 72: 0.725,
 74: 0.725,
 76: 0.725,
 78: 0.725,
 80: 0.725,
 82: 0.725,
 84: 0.725,
 86: 0.725,
 88: 0.7125,
 90: 0.7,
 92: 0.6875,
 94: 0.6875,
 96: 0.6875,
 98: 0.6875,
 100: 0.6875}