<a href="https://colab.research.google.com/github/Titashmkhrj/Online-shopper-intention-project/blob/master/Online_shoppers_intention_HPO_MS_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries.

In [1]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
#---------------------------------------------------------------------------------------------------------------------------------------

# importing the required libraries
import numpy as np
import numpy.ma as ma
import pandas as pd
import joblib

import imblearn 
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline

from sklearn.linear_model import (LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (RandomizedSearchCV, train_test_split, cross_val_score)

from sklearn.metrics import accuracy_score

print("Finished importing the libraries.")

Finished importing the libraries.


# Models objects and their parameter grid.

In [2]:
# models as per the sequence in the parameter grid 
model_objects = [LogisticRegression(),
                 LogisticRegression(),
								 LogisticRegression(),
								 PassiveAggressiveClassifier(),
								 RidgeClassifier(),
								 KNeighborsClassifier(),
								 SVC(),
								 DecisionTreeClassifier(),
								 RandomForestClassifier()]



# hyper-parameter dictionary for the tunningof the models
parameter_grid = {'LR_l1' : {'model__penalty' : ['l1'],
                              'model__C' : [0.001, 0.01, 0.1, 1, 10, 100],
                              'model__random_state' : [42],
                              'model__solver' : ['liblinear', 'saga'],
                              'model__max_iter' : [100000]
                          },
				
                  'LR_l2' : {'model__penalty' : ['l2'],
                              'model__C' : [0.001, 0.01, 0.1, 1, 10, 100],
                              'model__random_state' : [42],
                              'model__solver' : ['newton-cg', 'lbfgs', 'sag', 'saga'],
                              'model__max_iter' : [100000]
                          },

                  'LR_ElNet' : {'model__penalty' : ['elasticnet'],
                                'model__l1_ratio' : [0.3, 0.5, 0.7],
                                'model__C' : [0.001, 0.01, 0.1, 1, 10, 100],
                                'model__random_state' : [42],
                                'model__solver' : ['saga'],
                                'model__max_iter' : [100000]
                              },

                  'Pass_Agg_clif' : {'model__C' : [0.001, 0.01, 0.1, 1, 10, 100],
                                    #   'model__fit_intercept' : ['True', 'False'],
                                      'model__random_state' : [42],
                                      'model__loss' : ['hinge', 'squared_hinge'],
                                      'model__class_weight' : ['balanced', None]
                                  },
                  
                  'Ridge_clif' : {'model__alpha' : [500.0, 50.0, 5.0, 0.5, 0.05, 0.005],
                                  'model__fit_intercept' : ['True', 'False'],
                                  'model__normalize' : ['True', 'False'],
                                  'model__class_weight' : ['balanced', None],
                                  'model__solver' : ['svd', 'cholesky', 'lsqr', 'sparse_cg']
                              },
                  
                  'KN_classif' : {'model__n_neighbors' : [1,3,5,7,9],
                                  'model__p' : [1,2,5]                     
                              },
                  
                  'SVC' : {'model__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                           'model__gamma' : ['scale', 'auto'],                     
                      },
                  
                  'DT_clif' : {'model__criterion': ['gini','entropy'],
                                'model__max_features': ['sqrt','log2',None],
                                'model__min_samples_leaf': [1,2,5,10],
                                'model__min_samples_split' : [2,5,10,15,100],
                                'model__max_depth': [5,8,15,25,30,None]
                          },
                  
                  'RF_clif' : {'model__n_estimators' : [120,300,500,800,1200],
                               'model__max_features': ['sqrt','log2',None],
                                'model__min_samples_leaf': [1,2,5,10],
                                'model__min_samples_split' : [2,5,10,15,100],
                                'model__max_depth': [5,8,15,25,30,None]                      
                          }
              }

 # Loading the data.

In [3]:
# reading the feature and target spaces for our project
x_data = pd.read_csv("/content/drive/My Drive/data_for_HPO&MS/Online_shopper's_intention/feature_space.csv")
y_data = pd.read_csv("/content/drive/My Drive/data_for_HPO&MS/Online_shopper's_intention/target_space.csv")
# dropping an unnecessary column from our feature and target space
x_data.drop('Unnamed: 0', axis=1, inplace=True)
y_data.drop('Unnamed: 0', axis=1, inplace=True)

# Splitting the data for the purpose of hyper-parameter optimisation and model selection.

In [4]:
# splitting our dataset into train, validation and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, random_state = 42)
x_optimization, x_validation, y_optimization, y_validation = train_test_split(x_train, y_train, test_size = 0.3, random_state = 42)

print("Finished splitting the data.")

Finished splitting the data.


# Hyper-parameter optimisation.

In [5]:
# initiating an empty list for storing the optimized models
hyper_parameter_optimized_models = []


'''
resampling our optimization datasets, in order to prevent overfitting of our models on the majority class of the target feature in our
for the purpose above stated we will be using SMOTENC, which requires us to give the column indices of the categrical features
'''
catg_features_idx = []
for feature in x_data.columns :
    if len(x_data[feature].value_counts().index) == 2 :
        catg_features_idx.append(list(x_data.columns).index(feature))
# adding two more categorical features that have moe than two classes
catg_features_idx.append(list(x_data.columns).index('SpecialDay'))
catg_features_idx.append(list(x_data.columns).index('Month'))
# arranging the list in ascending order
catg_features_idx.sort()

# making the resampling and standardising objects
over_sampler = SMOTENC(categorical_features = catg_features_idx, random_state=42)
scaler = StandardScaler()

# initiating the random search
for grid, model in zip(parameter_grid.values(), model_objects) :
  # the only change that i have done is remove the comma "," from the end of the very next line i.e classif_model = ......
  classif_model = Pipeline([('resampler', over_sampler), ('scaler', scaler), ('model', model)])
  # the nex thing tht we can do is remove the over_sampler an scaler objects and define them in te pipeline itself
  optimizer = RandomizedSearchCV(estimator = classif_model,
								param_distributions = grid,
								random_state = 42,
								cv = 3,
								error_score = -1,
								verbose = 10,
								n_jobs = -1,
								)
  optimizer.fit(x_optimization, y_optimization.values.ravel())
	# appending the best estimator to a list
  hyper_parameter_optimized_models.append(optimizer.best_estimator_)

print('Hyper parameter tunning is finished.')

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.3min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.3min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   51.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.8min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.3min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.2min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  8.3min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.0min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.9min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 11.4min finished


Hyper parameter tunning is finished.


# Model selection.

In [6]:
# initiating an empty list to stre the validation scores of the optimized models
optimized_model_validation_scores = []

for optimized_model in hyper_parameter_optimized_models :
  optimized_model_pipeline = Pipeline([('resampler', over_sampler), ('scaler', scaler), ('optimized_model', optimized_model)])
  model_validation_scores = cross_val_score(optimized_model_pipeline, x_validation, y_validation.values.ravel(), cv=3, n_jobs = -1)
  optimized_model_validation_scores.append(np.mean(model_validation_scores))

# making a dictionary to store the results of the hyper-parameter optimization and the model selection process.
results_dict = {'optimized_model':hyper_parameter_optimized_models,
                'validation_score':optimized_model_validation_scores
                }

optimized_model_results = pd.DataFrame(results_dict)
# saving the results of the hyper-parameter optimization and model_selection in a csv file
optimized_model_results.to_csv("/content/drive/My Drive/data_for_HPO&MS/Online_shopper's_intention/model_optimizaion_report.csv")
print('Model selection is finished')


Model selection is finished


# Best performing hyper-parameter optimised model.

In [7]:
print('Initiating the process of our final phase to judge the average out-of-sample performance of our best found optimized model.')
# selecting the best model by its index for the final predictions
best_model_idx = optimized_model_results['validation_score'].idxmax(axis=0)
best_model = optimized_model_results.iloc[best_model_idx,0]

print('The best model to our finding is ', best_model)

Initiating the process of our final phase to judge the average out-of-sample performance of our best found optimized model.
The best model to our finding is  Pipeline(memory=None,
         steps=[('resampler',
                 SMOTENC(categorical_features=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                               11, 12, 13, 14, 15, 16, 17, 18,
                                               19, 20, 21, 22, 23, 24, 25, 26,
                                               27, 28, 29, ...],
                         k_neighbors=5, n_jobs=1, random_state=42,
                         sampling_strategy='auto')),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=25, max_features=None,
                

# Defining the best model from the above findings, that will be futher used for the final prediction making.

In [8]:
# selecting the classifier algorithm from the pipeline of the best model found.
final_model = best_model[2]
final_model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Final Prediction.

In [9]:
# we are utilizing the whole training dataset for training the fianl model before making predictions on the test set.
# resampling our training datasets, in order to prevent overfitting of our models on the majority class of the target feature in our training set
x_train_resampled, y_train_resampled = over_sampler.fit_resample(x_train, y_train)
# dropping the sythetic feature after resampling is done
y_train_resampled = pd.DataFrame(y_train_resampled, columns = y_train.columns)
x_train_resampled = pd.DataFrame(x_train_resampled, columns = x_train.columns)

# scaling our features in the training dataset
scaler = StandardScaler().fit(x_train_resampled)
x_train_scaled = scaler.transform(x_train_resampled)
x_test_scaled = scaler.transform(x_test)

# re-fitting out best found optimized model to the whole training set
final_model.fit(x_train_scaled, y_train_resampled.values.ravel())
out_of_sample_predictions = final_model.predict(x_test_scaled)

final_score = accuracy_score(y_test, out_of_sample_predictions)

print('The final average out-of-sample performance score of our best optimized model is', final_score)

  y = column_or_1d(y, warn=True)


The final average out-of-sample performance score of our best optimized model is 0.8821303054879697


# Saving the model.

In [10]:
# saving our best found optimized model for this data, as a pickle file
joblib.dump(best_model, "/content/drive/My Drive/data_for_HPO&MS/Online_shopper's_intention/best_model.pkl")
joblib.dump(final_model, "/content/drive/My Drive/data_for_HPO&MS/Online_shopper's_intention/final_model.pkl")

["/content/drive/My Drive/data_for_HPO&MS/Online_shopper's_intention/final_model.pkl"]