In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("Social_Network_Ads.csv")

In [3]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['Gender'] = encoder.fit_transform(df['Gender'])

In [5]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [6]:
input_features = df.iloc[:,1:-1]
labels = df.iloc[:,-1]

In [7]:
print(input_features)
print(labels)

     Gender  Age  EstimatedSalary
0         1   19            19000
1         1   35            20000
2         0   26            43000
3         0   27            57000
4         1   19            76000
..      ...  ...              ...
395       0   46            41000
396       1   51            23000
397       0   50            20000
398       1   36            33000
399       0   49            36000

[400 rows x 3 columns]
0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64


In [8]:
# Convert the data into same scale

from sklearn.preprocessing import StandardScaler, MinMaxScaler

standard_scaler = StandardScaler()

scaled_input_data = standard_scaler.fit_transform(input_features)

In [9]:
scaled_input_data

array([[ 1.02020406, -1.78179743, -1.49004624],
       [ 1.02020406, -0.25358736, -1.46068138],
       [-0.98019606, -1.11320552, -0.78528968],
       ...,
       [-0.98019606,  1.17910958, -1.46068138],
       [ 1.02020406, -0.15807423, -1.07893824],
       [-0.98019606,  1.08359645, -0.99084367]])

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(scaled_input_data, labels, random_state = 2, test_size = 0.2)


In [14]:
print(X_train[:5])
print(X_test[:5])
print(Y_train[:5])
print(Y_test[:5])

[[-0.98019606  0.89257019 -0.66783025]
 [ 1.02020406 -0.15807423  0.85914229]
 [-0.98019606 -1.01769239 -0.37418169]
 [ 1.02020406  0.98808332  0.59485858]
 [ 1.02020406 -0.92217926  0.56549373]]
[[-0.98019606 -0.82666613  0.38930459]
 [-0.98019606 -1.59077117 -1.5781408 ]
 [ 1.02020406 -0.0625611  -0.49164111]
 [ 1.02020406 -0.82666613  0.15438573]
 [ 1.02020406  1.5611621   0.00756145]]
205    0
354    1
3      0
264    1
194    0
Name: Purchased, dtype: int64
94     0
32     0
225    0
157    0
356    1
Name: Purchased, dtype: int64


In [15]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

In [17]:
clf.fit(X_train, Y_train)

In [18]:
y_predict = clf.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score

accuracy_score(Y_test, y_predict)

0.8625

In [28]:
param_dist = {
    "criterion" : ["entropy", "gini"],
    "max_depth" : [1, 2, 3, 4, 5, 6, 7, None],
}

In [29]:
from sklearn.model_selection import GridSearchCV

gscv = GridSearchCV(clf, param_grid = param_dist, cv = 10, n_jobs = -1)

In [30]:
gscv.fit(X_train, Y_train)

In [31]:
gscv.best_estimator_

In [33]:
gscv.best_score_

0.896875

In [34]:
gscv.best_params_

{'criterion': 'entropy', 'max_depth': 2}