In [44]:
# import the required libraries

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split

In [46]:
# Render matplotlib plots inline, directly below the cell that creates them

In [47]:
%matplotlib inline

In [48]:
# Read the Social Network Ads CSV into a DataFrame and preview its first five rows.
df = pd.read_csv('./datasets/Social_Network_Ads.csv')
df.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [49]:
# Print a summary of the DataFrame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [50]:
# Separate the independent variables (feature matrix X) from the dependent variable (target vector y)

In [51]:
# Build the feature matrix from positional columns 1-3
# (Gender, Age, EstimatedSalary), leaving out User ID and Purchased.
features = df.iloc[:, 1:4]
X = features.values
X

array([['Male', 19, 19000],
       ['Male', 35, 20000],
       ['Female', 26, 43000],
       ...,
       ['Female', 50, 20000],
       ['Male', 36, 33000],
       ['Female', 49, 36000]], dtype=object)

In [52]:
# Target vector: the Purchased column copied into a NumPy array.
y = np.array(df.loc[:, 'Purchased'])
y

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,

In [53]:
# Check for any missing Values in DataFrame

In [54]:
# Show every row that has a missing value in any of the modelling columns.
# NaN never compares equal to anything (not even itself), so an `== np.NaN`
# comparison is always False and would report "no missing values" even when
# some exist; `isna()` is the correct element-wise test.
df[df[['Gender', 'Age', 'EstimatedSalary', 'Purchased']].isna().any(axis=1)]

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased


In [55]:
# Check for Categorical Values and Strings

In [56]:
# Count how many rows fall into each Gender category.
df['Gender'].value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [57]:
# Number of distinct values in the Gender column.
df['Gender'].nunique()

2

In [58]:
# Count how many rows belong to each Purchased class (shows the class balance).
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [59]:
# Number of distinct values in the Purchased column.
df['Purchased'].nunique()

2

In [60]:
# Data Preprocessing for categorical values and strings

In [61]:
# Convert strings to numbers for processing with algorithms

In [62]:
# Replace the Gender strings in column 0 of X with integer codes
# (the displayed result shows 'Female' -> 0 and 'Male' -> 1).
lb = LabelEncoder()
lb.fit(X[:, 0])
X[:, 0] = lb.transform(X[:, 0])
X

array([[1, 19, 19000],
       [1, 35, 20000],
       [0, 26, 43000],
       ...,
       [0, 50, 20000],
       [1, 36, 33000],
       [0, 49, 36000]], dtype=object)

In [63]:
# One-hot encoding is unnecessary here: Gender has only two categories, so a single 0/1 column already represents it

In [64]:
# Split data into training and test sets

In [65]:
# Hold out 20% of the rows as a test set. random_state pins the shuffle so the
# same split (and every downstream result) is reproduced on Restart & Run All;
# without it each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [66]:
# Display the training features returned by the split.
# NOTE(review): this dumps the whole array; X_train[:5] or X_train.shape would keep the output short.
X_train

array([[0, 23, 48000],
       [1, 48, 33000],
       [0, 57, 33000],
       [1, 53, 72000],
       [0, 46, 96000],
       [1, 36, 118000],
       [1, 49, 28000],
       [1, 34, 112000],
       [1, 41, 79000],
       [0, 58, 95000],
       [1, 21, 88000],
       [0, 36, 50000],
       [0, 47, 113000],
       [1, 35, 108000],
       [1, 46, 79000],
       [1, 29, 43000],
       [1, 40, 57000],
       [1, 42, 80000],
       [1, 42, 64000],
       [1, 25, 90000],
       [0, 41, 30000],
       [0, 50, 36000],
       [0, 18, 44000],
       [1, 35, 55000],
       [0, 35, 23000],
       [0, 40, 60000],
       [0, 59, 88000],
       [1, 47, 34000],
       [1, 29, 75000],
       [1, 47, 20000],
       [1, 34, 43000],
       [1, 30, 49000],
       [1, 23, 63000],
       [0, 37, 78000],
       [0, 40, 142000],
       [0, 49, 141000],
       [1, 31, 66000],
       [0, 38, 50000],
       [1, 28, 79000],
       [1, 45, 26000],
       [1, 28, 123000],
       [0, 37, 146000],
       [1, 48, 41000],
   

In [67]:
# Display the test features returned by the split.
# NOTE(review): this dumps the whole array; X_test[:5] or X_test.shape would keep the output short.
X_test

array([[0, 53, 143000],
       [0, 59, 83000],
       [1, 60, 102000],
       [1, 35, 39000],
       [1, 26, 16000],
       [1, 22, 18000],
       [1, 57, 60000],
       [1, 58, 144000],
       [0, 42, 75000],
       [1, 39, 106000],
       [1, 49, 86000],
       [0, 47, 49000],
       [1, 54, 70000],
       [0, 26, 118000],
       [0, 46, 32000],
       [0, 27, 31000],
       [1, 47, 25000],
       [0, 36, 63000],
       [0, 22, 27000],
       [0, 32, 135000],
       [1, 40, 75000],
       [1, 23, 20000],
       [0, 26, 84000],
       [0, 49, 39000],
       [0, 39, 79000],
       [0, 29, 83000],
       [0, 47, 47000],
       [0, 59, 42000],
       [0, 57, 26000],
       [1, 38, 71000],
       [1, 35, 59000],
       [0, 28, 84000],
       [1, 29, 61000],
       [1, 41, 52000],
       [1, 25, 79000],
       [0, 47, 144000],
       [1, 26, 30000],
       [0, 39, 61000],
       [1, 30, 17000],
       [1, 46, 117000],
       [1, 37, 53000],
       [0, 54, 26000],
       [0, 52, 114000],
  

In [68]:
# Display the training labels returned by the split.
# NOTE(review): this dumps the whole array; y_train[:10] or y_train.shape would keep the output short.
y_train

array([0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,

In [69]:
# Display the test labels returned by the split.
y_test

array([1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [70]:
# Apply feature scaling: Age and EstimatedSalary are on very different scales, so standardize them to keep the larger-valued feature from dominating

In [71]:
# Standardize the features: learn the scaling statistics from the training
# set only, then apply that same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [72]:
# Display the training features after standardization.
# NOTE(review): this dumps the whole array; X_train[:5] would keep the output short.
X_train

array([[-0.96922337, -1.38023159, -0.65956503],
       [ 1.03175391,  1.03117979, -1.09848256],
       [-0.96922337,  1.89928789, -1.09848256],
       [ 1.03175391,  1.51346207,  0.04270302],
       [-0.96922337,  0.83826688,  0.74497107],
       [ 1.03175391, -0.12629767,  1.38871678],
       [ 1.03175391,  1.12763625, -1.2447884 ],
       [ 1.03175391, -0.31921058,  1.21314976],
       [ 1.03175391,  0.35598461,  0.2475312 ],
       [-0.96922337,  1.99574435,  0.7157099 ],
       [ 1.03175391, -1.5731445 ,  0.51088172],
       [-0.96922337, -0.12629767, -0.60104269],
       [-0.96922337,  0.93472334,  1.24241093],
       [ 1.03175391, -0.22275413,  1.09610509],
       [ 1.03175391,  0.83826688,  0.2475312 ],
       [ 1.03175391, -0.80149286, -0.80587087],
       [ 1.03175391,  0.25952815, -0.39621451],
       [ 1.03175391,  0.45244106,  0.27679237],
       [ 1.03175391,  0.45244106, -0.19138633],
       [ 1.03175391, -1.18731868,  0.56940405],
       [-0.96922337,  0.35598461, -1.186

In [73]:
# Display the test features after applying the scaler that was fitted on the training set.
# NOTE(review): this dumps the whole array; X_test[:5] would keep the output short.
X_test

array([[-0.96922337,  1.51346207,  2.12024599],
       [-0.96922337,  2.0922008 ,  0.36457587],
       [ 1.03175391,  2.18865726,  0.92053808],
       [ 1.03175391, -0.22275413, -0.92291555],
       [ 1.03175391, -1.09086222, -1.59592243],
       [ 1.03175391, -1.47668805, -1.53740009],
       [ 1.03175391,  1.89928789, -0.30843101],
       [ 1.03175391,  1.99574435,  2.14950716],
       [-0.96922337,  0.45244106,  0.13048652],
       [ 1.03175391,  0.16307169,  1.03758275],
       [ 1.03175391,  1.12763625,  0.45235938],
       [-0.96922337,  0.93472334, -0.63030386],
       [ 1.03175391,  1.60991852, -0.01581932],
       [-0.96922337, -1.09086222,  1.38871678],
       [-0.96922337,  0.83826688, -1.12774373],
       [-0.96922337, -0.99440577, -1.1570049 ],
       [ 1.03175391,  0.93472334, -1.33257191],
       [-0.96922337, -0.12629767, -0.2206475 ],
       [-0.96922337, -1.47668805, -1.27404957],
       [-0.96922337, -0.51212349,  1.88615664],
       [ 1.03175391,  0.25952815,  0.130