In [1]:
# import the required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split

In [3]:
# Render matplotlib figures inline in the notebook

In [4]:
%matplotlib inline

In [5]:
# Read the social-network ads CSV into a DataFrame and preview the first rows
csv_path = './datasets/Social_Network_Ads.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [6]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [7]:
# Separate the independent variables (feature matrix X) from the dependent variable (target vector y)

In [8]:
# Feature matrix: the Gender, Age and EstimatedSalary columns as a NumPy array
feature_columns = ['Gender', 'Age', 'EstimatedSalary']
X = df.loc[:, feature_columns].values
X

array([['Male', 19, 19000],
       ['Male', 35, 20000],
       ['Female', 26, 43000],
       ...,
       ['Female', 50, 20000],
       ['Male', 36, 33000],
       ['Female', 49, 36000]], dtype=object)

In [9]:
# Target vector: an independent copy of the Purchased column as a NumPy array
y = df['Purchased'].values.copy()
y

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,

In [10]:
# Check for any missing Values in DataFrame

In [11]:
# Show every row that has a missing value in any of the columns of interest.
# NaN never compares equal to anything (not even itself), so `== np.NaN` is
# always False and can never find a missing entry; isna() is the correct test.
columns_to_check = ['Gender', 'Age', 'EstimatedSalary', 'Purchased']
df[df[columns_to_check].isna().any(axis=1)]

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased


In [12]:
# Check for Categorical Values and Strings

In [13]:
# Frequency of each Gender label
df.loc[:, 'Gender'].value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [14]:
# Number of distinct Gender labels
df.loc[:, 'Gender'].nunique()

2

In [15]:
# Frequency of each Purchased class
df.loc[:, 'Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [16]:
# Number of distinct Purchased classes
df.loc[:, 'Purchased'].nunique()

2

In [17]:
# Data Preprocessing for categorical values and strings

In [18]:
# Convert strings to numbers for processing with algorithms

In [19]:
# Replace the Gender strings in column 0 of X with integer codes.
# The encoder is fitted first and then applied, so `lb` keeps the learned mapping.
lb = LabelEncoder().fit(X[:, 0])
X[:, 0] = lb.transform(X[:, 0])
X

array([[1, 19, 19000],
       [1, 35, 20000],
       [0, 26, 43000],
       ...,
       [0, 50, 20000],
       [1, 36, 33000],
       [0, 49, 36000]], dtype=object)

In [20]:
# No need for OneHotEncoder: Gender has only two categories, so a single 0/1 column is enough

In [21]:
# Split data into training and test sets

In [22]:
# Hold out 20% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore every downstream result, reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
# Inspect the training features produced by the split
X_train

array([[0, 44, 39000],
       [0, 58, 95000],
       [0, 26, 15000],
       [0, 46, 96000],
       [0, 47, 30000],
       [1, 35, 88000],
       [0, 36, 50000],
       [0, 30, 62000],
       [1, 41, 79000],
       [0, 37, 137000],
       [0, 42, 80000],
       [1, 41, 72000],
       [0, 20, 23000],
       [1, 39, 122000],
       [1, 25, 33000],
       [1, 24, 23000],
       [1, 47, 23000],
       [1, 42, 73000],
       [0, 39, 59000],
       [1, 40, 59000],
       [1, 45, 79000],
       [1, 49, 65000],
       [0, 42, 70000],
       [0, 33, 51000],
       [0, 35, 23000],
       [1, 42, 54000],
       [0, 47, 50000],
       [0, 29, 83000],
       [0, 27, 17000],
       [0, 36, 75000],
       [0, 54, 26000],
       [0, 20, 82000],
       [1, 39, 96000],
       [0, 33, 113000],
       [1, 37, 70000],
       [0, 46, 74000],
       [1, 46, 79000],
       [0, 28, 59000],
       [0, 58, 38000],
       [1, 38, 61000],
       [1, 35, 20000],
       [0, 50, 44000],
       [0, 48, 29000],
       [

In [24]:
# Inspect the held-out test features produced by the split
X_test

array([[1, 36, 99000],
       [0, 60, 46000],
       [1, 35, 108000],
       [1, 35, 55000],
       [0, 50, 20000],
       [1, 35, 50000],
       [0, 19, 21000],
       [1, 41, 87000],
       [0, 30, 79000],
       [0, 37, 78000],
       [0, 48, 96000],
       [1, 60, 34000],
       [0, 39, 134000],
       [0, 38, 65000],
       [0, 26, 17000],
       [0, 53, 143000],
       [0, 37, 33000],
       [0, 22, 27000],
       [1, 35, 39000],
       [1, 36, 125000],
       [1, 35, 58000],
       [0, 34, 43000],
       [0, 35, 65000],
       [1, 18, 52000],
       [1, 35, 73000],
       [0, 32, 86000],
       [0, 27, 57000],
       [0, 35, 44000],
       [0, 31, 34000],
       [1, 31, 18000],
       [1, 35, 72000],
       [0, 24, 55000],
       [1, 46, 59000],
       [1, 40, 71000],
       [1, 48, 74000],
       [0, 51, 134000],
       [1, 18, 82000],
       [0, 26, 52000],
       [1, 48, 33000],
       [1, 23, 20000],
       [0, 35, 71000],
       [1, 47, 43000],
       [0, 23, 82000],
      

In [25]:
# Inspect the training labels produced by the split
y_train

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,

In [26]:
# Inspect the held-out test labels produced by the split
y_test

array([1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)

In [27]:
# Standardize the features: KNN is distance-based, so the large-valued EstimatedSalary would otherwise dominate Age and Gender

In [28]:
# Learn the scaling parameters from the training set only, then apply the
# same fitted scaler to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [29]:
# Inspect the training features after scaling
X_train

array([[-9.63177472e-01,  5.72277640e-01, -9.11506155e-01],
       [-9.63177472e-01,  1.91177807e+00,  7.00473544e-01],
       [-9.63177472e-01, -1.14993720e+00, -1.60235460e+00],
       [-9.63177472e-01,  7.63634845e-01,  7.29258896e-01],
       [-9.63177472e-01,  8.59313447e-01, -1.17057432e+00],
       [ 1.03823026e+00, -2.88829781e-01,  4.98976082e-01],
       [-9.63177472e-01, -1.93151179e-01, -5.94867285e-01],
       [-9.63177472e-01, -7.67222793e-01, -2.49443064e-01],
       [ 1.03823026e+00,  2.85241833e-01,  2.39907916e-01],
       [-9.63177472e-01, -9.74725762e-02,  1.90945832e+00],
       [-9.63177472e-01,  3.80920436e-01,  2.68693268e-01],
       [ 1.03823026e+00,  2.85241833e-01,  3.84104538e-02],
       [-9.63177472e-01, -1.72400882e+00, -1.37207178e+00],
       [ 1.03823026e+00,  9.38846286e-02,  1.47767804e+00],
       [ 1.03823026e+00, -1.24561580e+00, -1.08421827e+00],
       [ 1.03823026e+00, -1.34129441e+00, -1.37207178e+00],
       [ 1.03823026e+00,  8.59313447e-01

In [30]:
# Inspect the test features after scaling
X_test

array([[ 1.03823026e+00, -1.93151179e-01,  8.15614951e-01],
       [-9.63177472e-01,  2.10313528e+00, -7.10008692e-01],
       [ 1.03823026e+00, -2.88829781e-01,  1.07468312e+00],
       [ 1.03823026e+00, -2.88829781e-01, -4.50940526e-01],
       [-9.63177472e-01,  1.14634925e+00, -1.45842784e+00],
       [ 1.03823026e+00, -2.88829781e-01, -5.94867285e-01],
       [-9.63177472e-01, -1.81968742e+00, -1.42964249e+00],
       [ 1.03823026e+00,  2.85241833e-01,  4.70190730e-01],
       [-9.63177472e-01, -7.67222793e-01,  2.39907916e-01],
       [-9.63177472e-01, -9.74725762e-02,  2.11122564e-01],
       [-9.63177472e-01,  9.54992050e-01,  7.29258896e-01],
       [ 1.03823026e+00,  2.10313528e+00, -1.05543291e+00],
       [-9.63177472e-01,  9.38846286e-02,  1.82310226e+00],
       [-9.63177472e-01, -1.79397379e-03, -1.63087009e-01],
       [-9.63177472e-01, -1.14993720e+00, -1.54478389e+00],
       [-9.63177472e-01,  1.43338506e+00,  2.08217043e+00],
       [-9.63177472e-01, -9.74725762e-02

In [31]:
# Apply the k-nearest neighbours (KNN) algorithm

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
# 5-nearest-neighbours classifier; Minkowski metric with p=2 is Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)

In [34]:
# Fit the classifier on the scaled training features and their labels
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [35]:
# Predict a Purchased label for every row of the held-out test set
y_predicted = classifier.predict(X=X_test)
y_predicted

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [40]:
# For a classifier, also build a confusion matrix to evaluate the predictions

In [None]:
from 