In [2]:
#Import all required libraries to create a AdaBoost Classifier Model
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

In [3]:
# Load the data set
Data = pd.read_csv(r"C:\Users\Vinoth\Dropbox\PC\Desktop\HOPE AI\Machine Learning\Classification_Models\DataSets\Social_Network_Ads.csv")

In [4]:
Data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
Data.shape

(400, 5)

## To know more about the data please refer the Exploratory Data Analysis on Social Network Ads DataSet.ipynb file

In [6]:
#Convert the Nominal categorical data of "Gender" column to Numeric values

Data = pd.get_dummies(Data,drop_first=True)

In [7]:
Data.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1


In [8]:
Data.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [9]:
# Arrange the columns for better visualization and also we can drop User ID colum as it doesn't makes any sense
Data = Data.loc[:,['Gender_Male','Age', 'EstimatedSalary','Purchased']]

In [10]:
Data

Unnamed: 0,Gender_Male,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0
...,...,...,...,...
395,0,46,41000,1
396,1,51,23000,1
397,0,50,20000,1
398,1,36,33000,0


In [11]:
# Let's Seperate dependent and independent variables

independent = Data[['Gender_Male','Age', 'EstimatedSalary']]
dependent = Data[['Purchased']]

In [12]:
dependent.head()

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0


In [13]:
independent.head()

Unnamed: 0,Gender_Male,Age,EstimatedSalary
0,1,19,19000
1,1,35,20000
2,0,26,43000
3,0,27,57000
4,1,19,76000


In [14]:
#Standardize the data
Std =StandardScaler()
independent = Std.fit_transform(independent)


In [15]:
independent.shape

(400, 3)

In [16]:
independent

array([[ 1.02020406, -1.78179743, -1.49004624],
       [ 1.02020406, -0.25358736, -1.46068138],
       [-0.98019606, -1.11320552, -0.78528968],
       ...,
       [-0.98019606,  1.17910958, -1.46068138],
       [ 1.02020406, -0.15807423, -1.07893824],
       [-0.98019606,  1.08359645, -0.99084367]])

In [17]:
# Lets Split Train and Test Data

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=0.30)

In [18]:
# Lets Create the Model using training data and default parameters
AdaBoost_Classifier_Model = AdaBoostClassifier()
AdaBoost_Classifier_Model.fit(X_Train,Y_Train)

  y = column_or_1d(y, warn=True)


In [19]:
# Lets Test the model using the Test Data
Y_Predicted = AdaBoost_Classifier_Model.predict(X_Test)

In [23]:
AdaBoost_Classifier_Model.predict_proba(X_Test)

array([[0.51099735, 0.48900265],
       [0.52073148, 0.47926852],
       [0.51871792, 0.48128208],
       [0.52899658, 0.47100342],
       [0.53414514, 0.46585486],
       [0.52328113, 0.47671887],
       [0.68999211, 0.31000789],
       [0.36766307, 0.63233693],
       [0.70835951, 0.29164049],
       [0.49901878, 0.50098122],
       [0.53771197, 0.46228803],
       [0.54104968, 0.45895032],
       [0.51737781, 0.48262219],
       [0.50527149, 0.49472851],
       [0.69541172, 0.30458828],
       [0.49045292, 0.50954708],
       [0.50527149, 0.49472851],
       [0.69541172, 0.30458828],
       [0.4445072 , 0.5554928 ],
       [0.51634822, 0.48365178],
       [0.53414514, 0.46585486],
       [0.34820084, 0.65179916],
       [0.52604033, 0.47395967],
       [0.46300674, 0.53699326],
       [0.69112   , 0.30888   ],
       [0.5274364 , 0.4725636 ],
       [0.51435373, 0.48564627],
       [0.52564806, 0.47435194],
       [0.52073148, 0.47926852],
       [0.52207088, 0.47792912],
       [0.

In [24]:
AdaBoost_Classifier_Model.predict_proba(X_Test)[:,1]

array([0.48900265, 0.47926852, 0.48128208, 0.47100342, 0.46585486,
       0.47671887, 0.31000789, 0.63233693, 0.29164049, 0.50098122,
       0.46228803, 0.45895032, 0.48262219, 0.49472851, 0.30458828,
       0.50954708, 0.49472851, 0.30458828, 0.5554928 , 0.48365178,
       0.46585486, 0.65179916, 0.47395967, 0.53699326, 0.30888   ,
       0.4725636 , 0.48564627, 0.47435194, 0.47926852, 0.47792912,
       0.31715934, 0.49472851, 0.5153295 , 0.48262219, 0.29442357,
       0.29442357, 0.30458828, 0.46585486, 0.48203407, 0.51041513,
       0.48900265, 0.47731024, 0.47100342, 0.48365178, 0.50029098,
       0.45895032, 0.49137062, 0.50706767, 0.30808298, 0.50966243,
       0.5554928 , 0.48632237, 0.46585486, 0.4955152 , 0.52638468,
       0.50968422, 0.47100342, 0.30175021, 0.51805996, 0.30660221,
       0.31288882, 0.67551016, 0.30458828, 0.51252441, 0.30888   ,
       0.68603918, 0.31425707, 0.46228803, 0.48262219, 0.48101491,
       0.62238246, 0.47792912, 0.29164049, 0.50794582, 0.48900

In [None]:
AdaBoost_Classifier_Model.decision_function()

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_Test,AdaBoost_Classifier_Model.predict_proba())

In [124]:
# Lets check the performance of the model using confusion Matrix 
confusion_matrix(Y_Predicted,Y_Test)

array([[75,  8],
       [ 4, 33]], dtype=int64)

In [125]:
# importing the classification report function from sklearn.metrics module
from sklearn.metrics import classification_report

In [126]:
clf_report =classification_report(Y_Predicted,Y_Test,zero_division=0)

In [127]:
print(clf_report)

              precision    recall  f1-score   support

           0       0.95      0.90      0.93        83
           1       0.80      0.89      0.85        37

    accuracy                           0.90       120
   macro avg       0.88      0.90      0.89       120
weighted avg       0.90      0.90      0.90       120



In [24]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [84]:
# import all classifier model for AdaBoost Base Estimator
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [86]:
# initialize the best parameters that we have found for logistic regression classifier
LogisticRegression_Classifier  = LogisticRegression(C=0.1,class_weight='balanced',dual=False,fit_intercept=True,max_iter=100,penalty='l1',solver='liblinear',warm_start= True)

In [128]:
# initialize the best parameters that we have found for SVC classifier
SVC_Classifier_Model = SVC(C=1.0,class_weight='balanced',decision_function_shape= 'ovo',degree=1,gamma ='scale',kernel='rbf',probability=True)

In [116]:
# Final Report for the tested data against the best parameters
print(Final_report)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91        79
           1       0.85      0.80      0.83        41

    accuracy                           0.88       120
   macro avg       0.87      0.86      0.87       120
weighted avg       0.88      0.88      0.88       120



In [89]:
# initialize the best parameters that we have found for Decision Tree classifier
DecisionTree_Classifier_Model = DecisionTreeClassifier(criterion='entropy',max_depth=100,max_features='sqrt',min_samples_split=4,splitter='random',)

In [121]:
pickle.dump(AdaBoost_Classifier_Model_GridSearch,open(r"C:\Users\Vinoth\Dropbox\PC\Desktop\HOPE AI\Machine Learning\Classification_Models\Final_Model\AdaBoost_Classifier_Model_GridSearch_Final.sav",'wb'))

In [91]:
# initialize the best parameters that we have found for RandomForest classifier
RandomForest_Classifier_Model = RandomForestClassifier(bootstrap=True,class_weight='balanced',criterion='gini',max_depth=100,max_features='sqrt',max_samples=0.6,min_samples_split=6,n_jobs=-1,oob_score=False)

In [106]:
# Initializing the model params for AdaBoost Classifier to pass it as input for gridsearch CV
model_Params = {"base_estimator":[LogisticRegression_Classifier,SVC_Classifier_Model,DecisionTree_Classifier_Model,RandomForest_Classifier_Model],'n_estimators':[50,100,300,500],"learning_rate":[0.1,0.001,0.001,1.0,10],
                "algorithm":['SAMME']}

In [107]:
# Invoking the gridsearch CV and pasing the appropriate parameters
import multiprocessing
n_jobs = multiprocessing.cpu_count()-1
AdaBoost_Classifier_Model_GridSearch = GridSearchCV(AdaBoostClassifier(),model_Params,scoring='f1',refit=True,n_jobs=n_jobs,verbose=2)

In [108]:
# Lets fit the GRID Search CV to the training data
import warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)
AdaBoost_Classifier_Model_GridSearch.fit(X_Train,Y_Train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


60 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Vinoth\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Vinoth\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 506, in fit
    return super().fit(X, y, sample_weight)
  File "C:\Users\Vinoth\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 160, in fit
    sample_weight, estimator_weight, estimator_error = self._boo

In [109]:
# Best Parameter combination that has performed high than other combinations
AdaBoost_Classifier_Model_GridSearch.best_params_

{'algorithm': 'SAMME',
 'base_estimator': RandomForestClassifier(class_weight='balanced', max_depth=100, max_samples=0.6,
                        min_samples_split=6, n_jobs=-1),
 'learning_rate': 0.001,
 'n_estimators': 100}

In [110]:
# Best Score that training model has performed from the parameter combinations
AdaBoost_Classifier_Model_GridSearch.best_score_

0.888119367240184

In [111]:
# Loading the crsoss validation results to the dataframe
Df = pd.DataFrame.from_dict(AdaBoost_Classifier_Model_GridSearch.cv_results_)

In [112]:
Df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_base_estimator,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.123231,0.006398,0.007998,1.809470e-06,SAMME,"LogisticRegression(C=0.1, class_weight='balanc...",0.1,50,"{'algorithm': 'SAMME', 'base_estimator': Logis...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,41
1,0.249748,0.005998,0.014408,3.202637e-03,SAMME,"LogisticRegression(C=0.1, class_weight='balanc...",0.1,100,"{'algorithm': 'SAMME', 'base_estimator': Logis...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,41
2,0.719132,0.015615,0.037027,3.782616e-03,SAMME,"LogisticRegression(C=0.1, class_weight='balanc...",0.1,300,"{'algorithm': 'SAMME', 'base_estimator': Logis...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,41
3,1.207537,0.020543,0.058421,6.003961e-03,SAMME,"LogisticRegression(C=0.1, class_weight='balanc...",0.1,500,"{'algorithm': 'SAMME', 'base_estimator': Logis...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,41
4,0.120035,0.005048,0.008001,9.536743e-07,SAMME,"LogisticRegression(C=0.1, class_weight='balanc...",0.001,50,"{'algorithm': 'SAMME', 'base_estimator': Logis...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,4.405975,1.889151,3.254428,3.606148e+00,SAMME,RandomForestClassifier(class_weight='balanced'...,1.0,500,"{'algorithm': 'SAMME', 'base_estimator': Rando...",0.820513,0.769231,0.844444,0.863636,0.952381,0.850041,0.060153,22
76,3.648473,1.616577,1.937983,1.390141e+00,SAMME,RandomForestClassifier(class_weight='balanced'...,10,50,"{'algorithm': 'SAMME', 'base_estimator': Rando...",0.526316,0.526316,0.603774,0.711864,0.545455,0.582745,0.070531,35
77,7.858344,1.256294,3.081086,1.002665e+00,SAMME,RandomForestClassifier(class_weight='balanced'...,10,100,"{'algorithm': 'SAMME', 'base_estimator': Rando...",0.764706,0.526316,0.653846,0.545455,0.384615,0.574988,0.127824,36
78,9.123473,0.549635,3.264003,3.612474e-01,SAMME,RandomForestClassifier(class_weight='balanced'...,10,300,"{'algorithm': 'SAMME', 'base_estimator': Rando...",0.645161,0.702703,0.629630,0.327869,0.687500,0.598572,0.137961,34


In [113]:
# Now test the model from the best hyperparameter that we have got from GRID Serach CV
Predicted_Y = AdaBoost_Classifier_Model_GridSearch.predict(X_Test)

In [None]:
confusion_matrix(Y_Test,Y_Predicted)

In [115]:
Final_report = classification_report(Y_Test,Y_Predicted)