## Importing the libraries

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

## Importing the Datasets

Links : https://raw.githubusercontent.com/omairaasim/machine_learning/master/project_13_kernel_svm/iphone_purchase_records.csv

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/omairaasim/machine_learning/master/project_13_kernel_svm/iphone_purchase_records.csv')

In [4]:
df.head()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [5]:
df.shape

(400, 4)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   Salary           400 non-null    int64 
 3   Purchase Iphone  400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


In [7]:
df.describe()

Unnamed: 0,Age,Salary,Purchase Iphone
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [11]:
df.Gender.value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [17]:
df.Gender.groupby(df['Purchase Iphone']).value_counts()

Purchase Iphone  Gender
0                Male      130
                 Female    127
1                Female     77
                 Male       66
Name: Gender, dtype: int64

In [22]:
df.rename({"Purchase Iphone":'Purchase_Iphone'}, axis=1, inplace = True)

## Spliting of data

In [24]:
df.head(3)

Unnamed: 0,Gender,Age,Salary,Purchase_Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0


In [25]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [26]:
X

Unnamed: 0,Gender,Age,Salary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000
...,...,...,...
395,Female,46,41000
396,Male,51,23000
397,Female,50,20000
398,Male,36,33000


In [27]:
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchase_Iphone, Length: 400, dtype: int64

## Label Encoding

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
enc = LabelEncoder()
X.Gender =enc.fit_transform(X.Gender)

In [30]:
X

Unnamed: 0,Gender,Age,Salary
0,1,19,19000
1,1,35,20000
2,0,26,43000
3,0,27,57000
4,1,19,76000
...,...,...,...
395,0,46,41000
396,1,51,23000
397,0,50,20000
398,1,36,33000


In [31]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Gender  400 non-null    int32
 1   Age     400 non-null    int64
 2   Salary  400 non-null    int64
dtypes: int32(1), int64(2)
memory usage: 7.9 KB


## Spliting the data into sets

In [32]:
skf = StratifiedKFold(n_splits=5)

In [33]:
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

## Feature scaling

In [37]:
scale =  StandardScaler()

In [52]:
X_train= scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)

In [55]:
X_train

array([[ 1.02532046, -1.61062735, -1.48261454],
       [ 1.02532046, -0.10854866, -1.45444971],
       [-0.97530483, -0.95346792, -0.80665849],
       [-0.97530483, -0.859588  , -0.41235079],
       [ 1.02532046, -1.61062735,  0.12278108],
       [ 1.02532046, -0.859588  , -0.38418596],
       [-0.97530483, -0.859588  ,  0.34809976],
       [-0.97530483, -0.39018841,  2.20697891],
       [ 1.02532046, -1.04734784, -1.08830685],
       [-0.97530483, -0.10854866, -0.18703211],
       [-0.97530483, -0.95346792,  0.23544042],
       [-0.97530483, -0.95346792, -0.55317497],
       [ 1.02532046, -1.51674743,  0.40442943],
       [ 1.02532046, -0.39018841, -1.51077938],
       [ 1.02532046, -1.70450727,  0.29177009],
       [ 1.02532046, -0.67182817,  0.23544042],
       [ 1.02532046,  1.01801037, -1.31362553],
       [ 1.02532046,  0.83025053, -1.28546069],
       [ 1.02532046,  0.92413045, -1.22913102],
       [-0.97530483,  1.11189029, -1.20096619],
       [ 1.02532046,  0.83025053, -1.398

In [56]:
X_test

array([[ 1.        , -0.51982647,  0.49135807],
       [ 1.        , -0.94708111,  0.45312009],
       [ 1.        , -0.51982647, -0.04397368],
       [-1.        , -0.09257184,  2.70916104],
       [ 1.        , -0.8046629 ,  0.33840614],
       [ 1.        , -1.23191753, -0.04397368],
       [ 1.        ,  0.61951922,  3.0150649 ],
       [-1.        , -0.94708111, -0.0057357 ],
       [ 1.        , -0.37740826,  0.64431   ],
       [ 1.        , -0.94708111,  0.56783404],
       [-1.        , -1.23191753, -0.19692561],
       [-1.        , -1.08949932,  0.03250228],
       [ 1.        ,  0.19226459,  0.64431   ],
       [-1.        , -0.51982647, -0.08221166],
       [ 1.        , -0.23499005, -0.31163955],
       [-1.        ,  1.04677386,  2.74739902],
       [-1.        , -0.8046629 , -0.46459148],
       [-1.        , -0.23499005,  0.30016816],
       [-1.        , -0.8046629 , -0.46459148],
       [-1.        , -0.66224469,  0.64431   ],
       [ 1.        , -1.23191753, -0.273

## Model Selection

In [88]:
log = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)

## Trainig the Model

In [89]:
print(log.fit(X_train, y_train))
knn.fit(X_train, y_train)

LogisticRegression()


## Testing the model

In [107]:
y_log_pred = log.predict(X_test)

In [108]:
y_log_pred

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [90]:
y_knn_pred = knn.predict(X_test)

In [91]:
newdf = pd.DataFrame({"Actual":y_test, "Predicted": y_knn_pred})

In [92]:
newdf.head()

Unnamed: 0,Actual,Predicted
266,0,0
267,0,0
269,0,0
270,0,1
276,0,0


In [93]:
confusion_matrix(y_test, y_knn_pred)

array([[46,  5],
       [ 7, 22]], dtype=int64)

In [94]:
Accuracy = (46+22)/(46+7+5+22)
Accuracy

0.85

In [85]:
from sklearn.metrics import accuracy_score

In [109]:
accuracy_score(y_test, y_log_pred)

0.725

## Hyperparameter tuning

In [102]:
lis = [i for i in range (3,100) if i % 2 != 0]
# lis = [i for i in range (2,101) if i % 2 == 0]
acc = [ ]
dic = { }
for i in lis:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_knn_pred = knn.predict(X_test)
    acc.append(accuracy_score(y_test, y_knn_pred))
    dic [i] = accuracy_score(y_test, y_knn_pred)

print(max(acc))

# for odd number = 11, 0.8875
# for even numbers = 10, 0.875

0.8875


In [103]:
dic

{3: 0.8375,
 5: 0.85,
 7: 0.85,
 9: 0.8625,
 11: 0.8875,
 13: 0.8875,
 15: 0.875,
 17: 0.85,
 19: 0.8,
 21: 0.7875,
 23: 0.775,
 25: 0.7625,
 27: 0.7375,
 29: 0.7375,
 31: 0.7375,
 33: 0.7375,
 35: 0.7375,
 37: 0.7375,
 39: 0.7375,
 41: 0.7375,
 43: 0.7375,
 45: 0.7375,
 47: 0.7375,
 49: 0.7375,
 51: 0.7375,
 53: 0.7375,
 55: 0.7375,
 57: 0.7375,
 59: 0.7375,
 61: 0.7375,
 63: 0.725,
 65: 0.725,
 67: 0.725,
 69: 0.725,
 71: 0.725,
 73: 0.725,
 75: 0.725,
 77: 0.725,
 79: 0.725,
 81: 0.725,
 83: 0.725,
 85: 0.725,
 87: 0.725,
 89: 0.7125,
 91: 0.7,
 93: 0.7,
 95: 0.7,
 97: 0.6875,
 99: 0.6875}