In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset=pd.read_csv("Social_Network_Ads.csv")

In [3]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
dataset=pd.get_dummies(dataset,drop_first=True)

In [5]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [6]:
independent=dataset[['Age', 'EstimatedSalary','Gender_Male']]

In [7]:
dependent=dataset[["Purchased"]]

In [8]:
# splitting training and test dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test =train_test_split(independent, dependent ,test_size=1/3,random_state=0)



In [9]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': ['auto','sqrt','log2']}

# Create the GridSearchCV object
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit=True, verbose=3, n_jobs=-1, scoring='f1_weighted')

# Fit the model
grid.fit(x_train, y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             scoring='f1_weighted', verbose=3)

In [11]:
# print best parameter after tuning
re=grid.cv_results_
grid_predictions = grid.predict(x_test)


from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,grid_predictions)

# print classification report 
from sklearn.metrics  import classification_report
clf_report = classification_report(y_test, grid_predictions)



In [12]:

from sklearn.metrics import f1_score
f1_macro=f1_score(y_test,grid_predictions,average='weighted')
print("The f1_macro value for best parameter {}:".format(grid.best_params_),f1_macro)

The f1_macro value for best parameter {'criterion': 'entropy', 'max_features': 'auto', 'splitter': 'random'}: 0.8728513338534315


In [13]:
print("The confusion Matrix:\n",cm)

The confusion Matrix:
 [[77  8]
 [ 9 40]]


In [14]:
print("The Report:\n",clf_report)

The Report:
               precision    recall  f1-score   support

           0       0.90      0.91      0.90        85
           1       0.83      0.82      0.82        49

    accuracy                           0.87       134
   macro avg       0.86      0.86      0.86       134
weighted avg       0.87      0.87      0.87       134



In [15]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,grid.predict_proba(x_test)[:,1])

0.8611044417767106

In [16]:
table =pd.DataFrame.from_dict(re)

In [17]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003125,0.006249,0.009514,0.007769,gini,auto,best,"{'criterion': 'gini', 'max_features': 'auto', ...",0.867478,0.828959,0.832483,0.851527,0.90361,0.856811,0.027201,3
1,0.008657,0.007191,0.003128,0.006256,gini,auto,random,"{'criterion': 'gini', 'max_features': 'auto', ...",0.847141,0.850809,0.738451,0.831098,0.850543,0.823609,0.043191,9
2,0.003127,0.006254,0.0,0.0,gini,sqrt,best,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.808927,0.834012,0.814409,0.833323,0.885265,0.835187,0.026959,7
3,0.009366,0.007647,0.006244,0.007647,gini,sqrt,random,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.808927,0.813511,0.814409,0.813179,0.841025,0.81821,0.011563,10
4,0.0,0.0,0.0,0.0,gini,log2,best,"{'criterion': 'gini', 'max_features': 'log2', ...",0.808927,0.870898,0.795256,0.832483,0.886792,0.838871,0.035105,6
5,0.0,0.0,0.0,0.0,gini,log2,random,"{'criterion': 'gini', 'max_features': 'log2', ...",0.826263,0.756254,0.759244,0.849057,0.862442,0.810652,0.044727,11
6,0.009621,0.007856,0.006414,0.007856,entropy,auto,best,"{'criterion': 'entropy', 'max_features': 'auto...",0.804764,0.849057,0.831098,0.906166,0.90361,0.858939,0.040084,2
7,0.0,0.0,0.001902,0.003804,entropy,auto,random,"{'criterion': 'entropy', 'max_features': 'auto...",0.843811,0.888107,0.831098,0.887907,0.885265,0.867238,0.024668,1
8,0.005705,0.004658,0.0,0.0,entropy,sqrt,best,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.804764,0.850809,0.833323,0.832483,0.880769,0.840429,0.024983,5
9,0.002882,0.003529,0.001441,0.002882,entropy,sqrt,random,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.743022,0.793754,0.738451,0.886792,0.82314,0.797032,0.054951,12


In [21]:
age_input=float(input("Age:"))

Gender_male_input=int(input("Gender_Male 0 or 1:"))

EstimatedSalary_input=int(input("EstimatedSalary:"))


Age:49
Gender_Male 0 or 1:0
EstimatedSalary:36000


In [22]:
Future_Prediction=grid.predict([[age_input,Gender_male_input,EstimatedSalary_input]])# change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[1]
