## Classification Algorithm
## Decision Tree- Grid

In [1]:
import pandas as pd

### Data collection

In [2]:
dataset = pd.read_csv('Social_Network_Ads.csv')

In [3]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


### Data preprocessing

In [4]:
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)

In [5]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [6]:
dataset.shape

(400, 5)

In [7]:
dataset = dataset.drop('User ID', axis=1)

### Check the No of Unique classes and its count

In [8]:
dataset['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

### Input Output Split

In [9]:
indep = dataset[['Age','EstimatedSalary','Gender_Male']]
dep = dataset[['Purchased']]

### Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(indep,dep,test_size=0.20,random_state=0)

### Standardization

In [11]:
from sklearn.preprocessing import StandardScaler
SC = StandardScaler()
X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

In [12]:
X_train

array([[ 1.92295008e+00,  2.14601566e+00,  1.02532046e+00],
       [ 2.02016082e+00,  3.78719297e-01, -9.75304830e-01],
       [-1.38221530e+00, -4.32498705e-01, -9.75304830e-01],
       [-1.18779381e+00, -1.01194013e+00, -9.75304830e-01],
       [ 1.92295008e+00, -9.25023920e-01, -9.75304830e-01],
       [ 3.67578135e-01,  2.91803083e-01, -9.75304830e-01],
       [ 1.73156642e-01,  1.46942725e-01, -9.75304830e-01],
       [ 2.02016082e+00,  1.74040666e+00,  1.02532046e+00],
       [ 7.56421121e-01, -8.38107706e-01, -9.75304830e-01],
       [ 2.70367388e-01, -2.87638347e-01, -9.75304830e-01],
       [ 3.67578135e-01, -1.71750061e-01,  1.02532046e+00],
       [-1.18475597e-01,  2.20395980e+00, -9.75304830e-01],
       [-1.47942605e+00, -6.35303205e-01, -9.75304830e-01],
       [-1.28500455e+00, -1.06988428e+00,  1.02532046e+00],
       [-1.38221530e+00,  4.07691369e-01,  1.02532046e+00],
       [-1.09058306e+00,  7.55356227e-01, -9.75304830e-01],
       [-1.47942605e+00, -2.00722133e-01

### Train Set

### Model Creation in GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# param_grid - This enables searching over any sequence of parameter settings.
# Scoring - Strategy to evaluate the performance of the cross-validated model on the test set.
# n_jobs - -1 means using all processors
# CV - Determines the cross-validation splitting strategy

param_grid = {'criterion' : ['ginni', 'entropy', 'log_loss'],
              'splitter' : ['best','random'],
              'max_features': ['sqrt','log2']}

Classifier = GridSearchCV(DecisionTreeClassifier(), param_grid, refit = True,verbose = 3, n_jobs=-1,scoring = 'f1_weighted')

Classifier.fit(X_train,Y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidPar

### Test Set

### Evaluation Metrics 
### Confusion Matix
### Classification Report

In [17]:
# print the best parameter after tuning
results = Classifier.cv_results_

grid_predictions = Classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
CM = confusion_matrix(Y_test,grid_predictions)

from sklearn.metrics import classification_report
Clf_report = classification_report(Y_test,grid_predictions)

In [18]:
print("Confusion Matrix:\n",CM)

Confusion Matrix:
 [[54  4]
 [ 4 18]]


In [19]:
print("Classification Report:\n",Clf_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        58
           1       0.82      0.82      0.82        22

    accuracy                           0.90        80
   macro avg       0.87      0.87      0.87        80
weighted avg       0.90      0.90      0.90        80



### ADDITIONAL : Check for the best params & f1_score 

In [21]:
from sklearn.metrics import f1_score

# Parameters : average{‘micro’, ‘macro’, ‘samples’, ‘weighted’, ‘binary
f1_macro_ave = f1_score(Y_test,grid_predictions, average='macro')

print('f1_macro_value of best parameter{}:'.format(Classifier.best_params_),f1_macro_ave)

f1_macro_value of best parameter{'criterion': 'entropy', 'max_features': 'sqrt', 'splitter': 'best'}: 0.8746081504702194


### ADDITIONAL : Check roc_auc_score -Receiver Operating Characterstic (Area Under Curve)

In [22]:
# ROC_AUC : Is a performance metric used primarily for binary classification problems to evaluate the quality of a classification model.

from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test,Classifier.predict_proba(X_test)[:,1])

0.8746081504702196

### Tabulation of Parameters in GridSearchCV

In [23]:
Table=pd.DataFrame(results)
Table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004161,0.001403,0.0,0.0,ginni,sqrt,best,"{'criterion': 'ginni', 'max_features': 'sqrt',...",,,,,,,,9
1,0.002782,0.00089,0.0,0.0,ginni,sqrt,random,"{'criterion': 'ginni', 'max_features': 'sqrt',...",,,,,,,,9
2,0.002805,0.001722,0.0,0.0,ginni,log2,best,"{'criterion': 'ginni', 'max_features': 'log2',...",,,,,,,,9
3,0.003212,0.002297,0.0,0.0,ginni,log2,random,"{'criterion': 'ginni', 'max_features': 'log2',...",,,,,,,,9
4,0.017976,0.00475,0.032048,0.008705,entropy,sqrt,best,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.875,0.842259,0.860772,0.922651,0.83804,0.867744,0.030489,1
5,0.015251,0.00689,0.027094,0.010083,entropy,sqrt,random,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.859841,0.758892,0.764578,0.828791,0.875,0.81742,0.047879,7
6,0.011147,0.006447,0.031322,0.011396,entropy,log2,best,"{'criterion': 'entropy', 'max_features': 'log2...",0.842448,0.823187,0.875897,0.906923,0.888956,0.867482,0.030582,2
7,0.011511,0.006914,0.035486,0.01326,entropy,log2,random,"{'criterion': 'entropy', 'max_features': 'log2...",0.921572,0.858747,0.813846,0.860772,0.81071,0.85313,0.040291,3
8,0.017396,0.01167,0.042905,0.017243,log_loss,sqrt,best,"{'criterion': 'log_loss', 'max_features': 'sqr...",0.875,0.840368,0.815315,0.846096,0.872294,0.849815,0.022059,4
9,0.010249,0.004394,0.032323,0.01291,log_loss,sqrt,random,"{'criterion': 'log_loss', 'max_features': 'sqr...",0.814377,0.79104,0.858747,0.827357,0.90625,0.839554,0.039882,5


### Check the Model by getting User Input

In [24]:
Age_input = int(input("AGE: "))
Salary_input = int(input("Salary: "))
Gender_input = int(input("Gender Male 0 or 1: "))

AGE:  25
Salary:  50000
Gender Male 0 or 1:  0


### Prediction

In [25]:
Prediction=Classifier.predict([[Age_input,Salary_input,Gender_input]]) 
Prediction


array([1], dtype=int64)