In [1]:
#Import all required libraries to create a XGBoost Classifier Model
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

In [2]:
# Load the data set
Data = pd.read_csv(r"C:\Users\Vinoth\Dropbox\PC\Desktop\HOPE AI\Machine Learning\Classification_Models\DataSets\Social_Network_Ads.csv")

In [3]:
Data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
Data.shape

(400, 5)

## To know more about the data please refer the Exploratory Data Analysis on Social Network Ads DataSet.ipynb file

In [5]:
#Convert the Nominal categorical data of "Gender" column to Numeric values

Data = pd.get_dummies(Data,drop_first=True)

In [6]:
Data.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1


In [7]:
Data.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [8]:
# Arrange the columns for better visualization and also we can drop User ID colum as it doesn't makes any sense
Data = Data.loc[:,['Gender_Male','Age', 'EstimatedSalary','Purchased']]

In [9]:
Data

Unnamed: 0,Gender_Male,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0
...,...,...,...,...
395,0,46,41000,1
396,1,51,23000,1
397,0,50,20000,1
398,1,36,33000,0


In [10]:
# Let's Seperate dependent and independent variables

independent = Data[['Gender_Male','Age', 'EstimatedSalary']]
dependent = Data[['Purchased']]

In [11]:
dependent.head()

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0


In [12]:
independent.head()

Unnamed: 0,Gender_Male,Age,EstimatedSalary
0,1,19,19000
1,1,35,20000
2,0,26,43000
3,0,27,57000
4,1,19,76000


In [13]:
#Standardize the data
# Std =StandardScaler()
# independent = Std.fit_transform(independent)


In [14]:
independent.shape

(400, 3)

In [15]:
independent

Unnamed: 0,Gender_Male,Age,EstimatedSalary
0,1,19,19000
1,1,35,20000
2,0,26,43000
3,0,27,57000
4,1,19,76000
...,...,...,...
395,0,46,41000
396,1,51,23000
397,0,50,20000
398,1,36,33000


In [16]:
# Lets Split Train and Test Data

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=0.30)

In [17]:
# Lets Create the Model using training data and default parameters
XGBoost_Classifier_Model = XGBClassifier()
XGBoost_Classifier_Model.fit(X_Train,Y_Train)

In [19]:
# Lets Test the model using the Test Data
Y_Predicted = XGBoost_Classifier_Model.predict(X_Test)

In [20]:
# Lets check the performance of the model using confusion Matrix 
confusion_matrix(Y_Predicted,Y_Test)

array([[74,  6],
       [ 5, 35]], dtype=int64)

In [21]:
# importing the classification report function from sklearn.metrics module
from sklearn.metrics import classification_report

In [22]:
clf_report =classification_report(Y_Predicted,Y_Test,zero_division=0)

In [23]:
print(clf_report)

              precision    recall  f1-score   support

           0       0.94      0.93      0.93        80
           1       0.85      0.88      0.86        40

    accuracy                           0.91       120
   macro avg       0.90      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120



In [25]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [32]:
# Initializing the model params for XGBoost Classifier to pass it as input for gridsearch CV
model_Params = {"max_depth":[i for i in range(1,10,2)],'n_estimators':[50,100,300,500],"learning_rate":[0.1,0.001,0.001,1.0,10],
                "max_leaves":[i for i in range(8,32,2)],"booster":["gbtree","gblinear"],'subsample':[0.5],'gamma':[0.1,0.001,1,10]}

In [33]:
# Invoking the gridsearch CV and pasing the appropriate parameters
import multiprocessing
n_jobs = multiprocessing.cpu_count()-1
XGBoost_Classifier_Model_GridSearch = GridSearchCV(XGBClassifier(),model_Params,scoring='f1',refit=True,n_jobs=n_jobs,verbose=2)

In [34]:
# Lets fit the GRID Search CV to the training data
import warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)
XGBoost_Classifier_Model_GridSearch.fit(X_Train,Y_Train)

Fitting 5 folds for each of 9600 candidates, totalling 48000 fits


In [35]:
# Best Parameter combination that has performed high than other combinations
XGBoost_Classifier_Model_GridSearch.best_params_

{'booster': 'gbtree',
 'gamma': 10,
 'learning_rate': 1.0,
 'max_depth': 3,
 'max_leaves': 8,
 'n_estimators': 500,
 'subsample': 0.5}

In [37]:
# Best Score that training model has performed from the parameter combinations
XGBoost_Classifier_Model_GridSearch.best_score_

0.8725622074402561

In [38]:
# Loading the crsoss validation results to the dataframe
Df = pd.DataFrame.from_dict(XGBoost_Classifier_Model_GridSearch.cv_results_)

In [39]:
Df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_gamma,param_learning_rate,param_max_depth,param_max_leaves,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.945009,0.028088,0.006402,0.012804,gbtree,0.1,0.1,1,8,50,0.5,"{'booster': 'gbtree', 'gamma': 0.1, 'learning_...",0.777778,0.750000,0.731707,0.904762,0.975610,0.827971,0.095456,1105
1,0.890508,0.054889,0.001865,0.003111,gbtree,0.1,0.1,1,8,100,0.5,"{'booster': 'gbtree', 'gamma': 0.1, 'learning_...",0.789474,0.750000,0.717949,0.909091,0.950000,0.823303,0.090586,1187
2,0.906364,0.020476,0.011892,0.023784,gbtree,0.1,0.1,1,8,300,0.5,"{'booster': 'gbtree', 'gamma': 0.1, 'learning_...",0.777778,0.789474,0.731707,0.954545,0.923077,0.835316,0.087252,961
3,0.725595,0.334509,0.004799,0.003918,gbtree,0.1,0.1,1,8,500,0.5,"{'booster': 'gbtree', 'gamma': 0.1, 'learning_...",0.777778,0.789474,0.717949,0.954545,0.923077,0.832565,0.090628,1069
4,0.009601,0.003200,0.006401,0.003201,gbtree,0.1,0.1,1,10,50,0.5,"{'booster': 'gbtree', 'gamma': 0.1, 'learning_...",0.777778,0.750000,0.731707,0.904762,0.975610,0.827971,0.095456,1105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9595,0.121733,0.026489,0.006422,0.003211,gblinear,10,10,9,28,500,0.5,"{'booster': 'gblinear', 'gamma': 10, 'learning...",0.000000,0.000000,0.000000,0.545455,0.000000,0.109091,0.218182,5281
9596,0.025551,0.005992,0.003506,0.003712,gblinear,10,10,9,30,50,0.5,"{'booster': 'gblinear', 'gamma': 10, 'learning...",0.000000,0.000000,0.000000,0.545455,0.000000,0.109091,0.218182,5281
9597,0.035225,0.008184,0.006397,0.003199,gblinear,10,10,9,30,100,0.5,"{'booster': 'gblinear', 'gamma': 10, 'learning...",0.000000,0.000000,0.000000,0.545455,0.000000,0.109091,0.218182,5281
9598,0.084889,0.015649,0.003206,0.003926,gblinear,10,10,9,30,300,0.5,"{'booster': 'gblinear', 'gamma': 10, 'learning...",0.000000,0.000000,0.000000,0.545455,0.000000,0.109091,0.218182,5281


In [40]:
# Now test the model from the best hyperparameter that we have got from GRID Serach CV
Predicted_Y = XGBoost_Classifier_Model_GridSearch.predict(X_Test)

In [41]:
confusion_matrix(Y_Test,Y_Predicted)

array([[74,  5],
       [ 6, 35]], dtype=int64)

In [42]:
Final_report = classification_report(Y_Test,Y_Predicted)

In [43]:
# Final Report for the tested data against the best parameters
print(Final_report)

              precision    recall  f1-score   support

           0       0.93      0.94      0.93        79
           1       0.88      0.85      0.86        41

    accuracy                           0.91       120
   macro avg       0.90      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120



In [44]:
pickle.dump(XGBoost_Classifier_Model_GridSearch,open(r"C:\Users\Vinoth\Dropbox\PC\Desktop\HOPE AI\Machine Learning\Classification_Models\Final_Model\XGBoost_Classifier_Model_GridSearch_Final.sav",'wb'))