In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
plt.style.use('ggplot')
from sklearn.utils import resample
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.io as pio
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
base_estimator = DecisionTreeClassifier()

warnings.filterwarnings('ignore')

### Loading Data

In [2]:
pd.set_option('display.max_columns', 60)
data = pd.read_csv("online_shoppers_intention.csv")

In [3]:
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
#One hot encoding of categorical variables
#Create list with features to be dummified cols.
                                       

nonum_feats_names = ['Month','OperatingSystems','Browser','Region','VisitorType','Weekend']


dataModel = pd.concat([data[['Administrative', 'Administrative_Duration', 'Informational','Informational_Duration', 
                              'ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues',
                               'TrafficType','SpecialDay']],
                       pd.get_dummies(data[nonum_feats_names].astype('category')),data['Revenue']],axis=1)
                      
dataModel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12205 entries, 0 to 12204
Data columns (total 57 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Administrative                 12205 non-null  int64  
 1   Administrative_Duration        12205 non-null  float64
 2   Informational                  12205 non-null  int64  
 3   Informational_Duration         12205 non-null  float64
 4   ProductRelated                 12205 non-null  int64  
 5   ProductRelated_Duration        12205 non-null  float64
 6   BounceRates                    12205 non-null  float64
 7   ExitRates                      12205 non-null  float64
 8   PageValues                     12205 non-null  float64
 9   TrafficType                    12205 non-null  int64  
 10  SpecialDay                     12205 non-null  float64
 11  Month_Aug                      12205 non-null  bool   
 12  Month_Dec                      12205 non-null 

In [5]:
# Display value count of target variable.
data['Revenue'].value_counts()

Revenue
False    10297
True      1908
Name: count, dtype: int64

In [6]:
#Label Encoding of revenue

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Revenue'] = le.fit_transform(data['Revenue'])
data['Revenue'].value_counts()

data['Revenue'].head()

# getting dependent and independent variables

x = dataModel
# removing the target column revenue from x
x = x.drop(['Revenue'], axis = 1)

y = data['Revenue']

# checking the shapes
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)

Shape of x: (12205, 56)
Shape of y: (12205,)


In [7]:
data_majority=dataModel[dataModel.Revenue==0] ## all rows where Revenue==0
data_minority=dataModel[dataModel.Revenue==1] ## all rows where Revenue==1

data_minority_upsampled=resample(data_minority,replace=True,n_samples=10297)
data_upsampled=pd.concat([data_minority_upsampled,data_majority])

data_upsampled.info()
print(data_upsampled['Revenue'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Index: 20594 entries, 12045 to 12204
Data columns (total 57 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Administrative                 20594 non-null  int64  
 1   Administrative_Duration        20594 non-null  float64
 2   Informational                  20594 non-null  int64  
 3   Informational_Duration         20594 non-null  float64
 4   ProductRelated                 20594 non-null  int64  
 5   ProductRelated_Duration        20594 non-null  float64
 6   BounceRates                    20594 non-null  float64
 7   ExitRates                      20594 non-null  float64
 8   PageValues                     20594 non-null  float64
 9   TrafficType                    20594 non-null  int64  
 10  SpecialDay                     20594 non-null  float64
 11  Month_Aug                      20594 non-null  bool   
 12  Month_Dec                      20594 non-null  

In [8]:
X_upsampled = data_upsampled.drop('Revenue', axis=1) ## This is the dependent variable

le1 = LabelEncoder()
data_upsampled['Revenue'] = le1.fit_transform(data_upsampled['Revenue'])
data_upsampled['Revenue'].value_counts()

y_upsampled = data_upsampled['Revenue']

# checking the shapes
print("Shape of x upsampled:", X_upsampled.shape)
print("Shape of y upsampled:", y_upsampled.shape)

Shape of x upsampled: (20594, 56)
Shape of y upsampled: (20594,)


In [9]:
# splitting the data

from sklearn.model_selection import train_test_split

x_baseTrain, x_baseTest, y_baseTrain, y_baseTest = train_test_split(x, y, test_size = 0.3, random_state = 101)
# X_val, X_test, y_val, y_test = train_test_split(x_baseTest, y_baseTest, test_size=0.5, random_state=101)

# checking the shapes

print("Shape of x_train :", x_baseTrain.shape)
print("Shape of y_train :", y_baseTrain.shape)
print("Shape of x_test :", x_baseTest.shape)
print("Shape of y_test :", y_baseTest.shape)

Shape of x_train : (8543, 56)
Shape of y_train : (8543,)
Shape of x_test : (3662, 56)
Shape of y_test : (3662,)


In [10]:
model = GaussianNB()
model.fit(x_baseTrain, y_baseTrain)

NB_y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, NB_y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, NB_y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, NB_y_basePred)
print(cr)

print('Gaussian Naive Bayes Performance:')
print('---------------------------------')
print('Accuracy        : ', metrics.accuracy_score(y_baseTest,NB_y_basePred))
print('F1 Score        : ', metrics.f1_score(y_baseTest,NB_y_basePred))
print('Precision       : ', metrics.precision_score(y_baseTest,NB_y_basePred))
print('Recall          : ', metrics.recall_score(y_baseTest,NB_y_basePred))
print('Confusion Matrix:\n ', confusion_matrix(y_baseTest,NB_y_basePred))

Training Accuracy : 0.7784150766709587
Testing Accuracy : 0.7695248498088476
ROC AUC Score : 0.7176179189958352
[[2441  633]
 [ 211  377]]
              precision    recall  f1-score   support

           0       0.92      0.79      0.85      3074
           1       0.37      0.64      0.47       588

    accuracy                           0.77      3662
   macro avg       0.65      0.72      0.66      3662
weighted avg       0.83      0.77      0.79      3662

Gaussian Naive Bayes Performance:
---------------------------------
Accuracy        :  0.7695248498088476
F1 Score        :  0.4718397997496871
Precision       :  0.37326732673267327
Recall          :  0.641156462585034
Confusion Matrix:
  [[2441  633]
 [ 211  377]]


In [11]:
knn_model = KNeighborsClassifier(n_neighbors=5,weights='uniform',leaf_size=30,p=2)
knn_model.fit(x_baseTrain, y_baseTrain)

KNN_y_basePred = knn_model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", knn_model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", knn_model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, KNN_y_basePred))


# confusion matrix
cm = confusion_matrix(y_baseTest, KNN_y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, KNN_y_basePred)
print(cr)

print('K-Nearest Neighbor Initial Performance:')
print('---------------------------------------')
print('Accuracy        : ', metrics.accuracy_score(y_baseTest,KNN_y_basePred))
print('F1 Score        : ', metrics.f1_score(y_baseTest,KNN_y_basePred))
print('Precision       : ', metrics.precision_score(y_baseTest,KNN_y_basePred))
print('Recall          : ', metrics.recall_score(y_baseTest,KNN_y_basePred))
print('Confusion Matrix:\n ', confusion_matrix(y_baseTest,KNN_y_basePred))


Training Accuracy : 0.8918412735572984
Testing Accuracy : 0.8506280720917532
ROC AUC Score : 0.6208246473605707
[[2949  125]
 [ 422  166]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.92      3074
           1       0.57      0.28      0.38       588

    accuracy                           0.85      3662
   macro avg       0.72      0.62      0.65      3662
weighted avg       0.83      0.85      0.83      3662

K-Nearest Neighbor Initial Performance:
---------------------------------------
Accuracy        :  0.8506280720917532
F1 Score        :  0.37770193401592717
Precision       :  0.570446735395189
Recall          :  0.282312925170068
Confusion Matrix:
  [[2949  125]
 [ 422  166]]


In [12]:
rf_model = RandomForestClassifier(n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='auto',
    bootstrap=True)  
rf_model.fit(x_baseTrain, y_baseTrain)

RF_y_basePred = rf_model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", rf_model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", rf_model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, RF_y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, RF_y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, RF_y_basePred)
print(cr)

print('Random Forest initial Performance:')
print('----------------------------------')
print('Accuracy        : ', metrics.accuracy_score(y_baseTest,RF_y_basePred))
print('F1 Score        : ', metrics.f1_score(y_baseTest,RF_y_basePred))
print('Precision       : ', metrics.precision_score(y_baseTest,RF_y_basePred))
print('Recall          : ', metrics.recall_score(y_baseTest,RF_y_basePred))
print('Confusion Matrix:\n ', confusion_matrix(y_baseTest,RF_y_basePred))


Training Accuracy : 1.0
Testing Accuracy : 0.9049699617695248
ROC AUC Score : 0.7597841674080172
[[2993   81]
 [ 267  321]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      3074
           1       0.80      0.55      0.65       588

    accuracy                           0.90      3662
   macro avg       0.86      0.76      0.80      3662
weighted avg       0.90      0.90      0.90      3662

Random Forest initial Performance:
----------------------------------
Accuracy        :  0.9049699617695248
F1 Score        :  0.6484848484848484
Precision       :  0.7985074626865671
Recall          :  0.5459183673469388
Confusion Matrix:
  [[2993   81]
 [ 267  321]]


In [13]:
lr_model = LogisticRegression(C=1.0,solver='lbfgs')
lr_model.fit(x_baseTrain, y_baseTrain)

LR_y_basePred = lr_model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", lr_model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", lr_model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, LR_y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, LR_y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, LR_y_basePred)
print(cr)

print('Logistic Regression initial Performance:')
print('----------------------------------------')
print('Accuracy        : ', metrics.accuracy_score(y_baseTest,LR_y_basePred))
print('F1 Score        : ', metrics.f1_score(y_baseTest,LR_y_basePred))
print('Precision       : ', metrics.precision_score(y_baseTest,LR_y_basePred))
print('Recall          : ', metrics.recall_score(y_baseTest,LR_y_basePred))

Training Accuracy : 0.8802528385812947
Testing Accuracy : 0.8823047515019116
ROC AUC Score : 0.675452223830326
[[3013   61]
 [ 370  218]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      3074
           1       0.78      0.37      0.50       588

    accuracy                           0.88      3662
   macro avg       0.84      0.68      0.72      3662
weighted avg       0.87      0.88      0.86      3662

Logistic Regression initial Performance:
----------------------------------------
Accuracy        :  0.8823047515019116
F1 Score        :  0.5028835063437139
Precision       :  0.7813620071684588
Recall          :  0.3707482993197279


In [14]:
ada_model = AdaBoostClassifier(base_estimator=base_estimator,
                         n_estimators=50,
                         learning_rate=1.0,
                         algorithm='SAMME.R',
                         random_state=None)
ada_model.fit(x_baseTrain, y_baseTrain)

ADA_y_basePred = model.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", ada_model.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", ada_model.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, ADA_y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, ADA_y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, ADA_y_basePred)
print(cr)

print('AdaBoost initial Performance:')
print('-----------------------------')
print('Accuracy        : ', metrics.accuracy_score(y_baseTest,ADA_y_basePred))
print('F1 Score        : ', metrics.f1_score(y_baseTest,ADA_y_basePred))
print('Precision       : ', metrics.precision_score(y_baseTest,ADA_y_basePred))
print('Recall          : ', metrics.recall_score(y_baseTest,ADA_y_basePred))
print('Confusion Matrix:\n ', confusion_matrix(y_baseTest,ADA_y_basePred))

Training Accuracy : 1.0
Testing Accuracy : 0.8582741671217914
ROC AUC Score : 0.7176179189958352
[[2441  633]
 [ 211  377]]
              precision    recall  f1-score   support

           0       0.92      0.79      0.85      3074
           1       0.37      0.64      0.47       588

    accuracy                           0.77      3662
   macro avg       0.65      0.72      0.66      3662
weighted avg       0.83      0.77      0.79      3662

AdaBoost initial Performance:
-----------------------------
Accuracy        :  0.7695248498088476
F1 Score        :  0.4718397997496871
Precision       :  0.37326732673267327
Recall          :  0.641156462585034
Confusion Matrix:
  [[2441  633]
 [ 211  377]]


In [15]:
gbm_modle = GradientBoostingClassifier(loss='deviance',
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    criterion='friedman_mse',
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=3,
    max_features=None) #default parameters
gbm_modle.fit(x_baseTrain, y_baseTrain)
Gbm_y_basePred = gbm_modle.predict(x_baseTest)

# evaluating the model
print("Training Accuracy :", gbm_modle.score(x_baseTrain, y_baseTrain))
print("Testing Accuracy :", gbm_modle.score(x_baseTest, y_baseTest))
print("ROC AUC Score :", roc_auc_score(y_baseTest, Gbm_y_basePred))

# confusion matrix
cm = confusion_matrix(y_baseTest, Gbm_y_basePred)
print(cm)

# classification report
cr = classification_report(y_baseTest, Gbm_y_basePred)
print(cr)

print('Gradient Boosting initial Performance:')
print('-----------------------------')
print('Accuracy        : ', metrics.accuracy_score(y_baseTest,Gbm_y_basePred))
print('F1 Score        : ', metrics.f1_score(y_baseTest,Gbm_y_basePred))
print('Precision       : ', metrics.precision_score(y_baseTest,Gbm_y_basePred))
print('Recall          : ', metrics.recall_score(y_baseTest,Gbm_y_basePred))
print('Confusion Matrix:\n ', confusion_matrix(y_baseTest,Gbm_y_basePred))



Training Accuracy : 0.9173592414842561
Testing Accuracy : 0.903604587657018
ROC AUC Score : 0.7857906337551286
[[2949  125]
 [ 228  360]]
              precision    recall  f1-score   support

           0       0.93      0.96      0.94      3074
           1       0.74      0.61      0.67       588

    accuracy                           0.90      3662
   macro avg       0.84      0.79      0.81      3662
weighted avg       0.90      0.90      0.90      3662

Gradient Boosting initial Performance:
-----------------------------
Accuracy        :  0.903604587657018
F1 Score        :  0.6710158434296365
Precision       :  0.7422680412371134
Recall          :  0.6122448979591837
Confusion Matrix:
  [[2949  125]
 [ 228  360]]


### Result

| Classifier | Accuracy | F1-Score | Precision | Recall |
|:-----------|:---------|:---------|:----------|:-------|
|Naive Bayes |0.77     |0.472     | 0.373    |0.641  |
|KNN|0.851|0.380|0.570|0.282|
|Random Forest|0.905|0.65|0.797|0.549|
|Logistic Regression|0.88|0.50|0.78|0.37|
|ADA Boost| 0.77|0.472|0.373|0.64|
|Gardient Boost| 0.90|0.671|0.742|0.612|


Focus on what precission or recall explain why

Recall, also known as the true positive rate (TPR), is the percentage of data samples that a machine learning model correctly identifies as belonging to a class of interest—the “positive class”—out of the total samples for that class.

Precision is a metric that measures the accuracy of positive predictions. It is the number of true positive predictions divided by the number of true positive predictions plus false positive predictions.

In [16]:
import pickle 

In [17]:
filename = 'Naive_Bayes_Balance.sav'
pickle.dump(NB_y_basePred, open(filename,'wb'))

In [18]:
loaded_model = pickle.load(open('Naive_Bayes_Balance.sav', 'rb'))

In [19]:
filename = 'KNN_Balance.sav'
pickle.dump(KNN_y_basePred, open(filename,'wb'))

In [20]:
loaded_model = pickle.load(open('KNN_Balance.sav', 'rb'))

In [21]:
filename = 'Random_Forest_Balance.sav'
pickle.dump(RF_y_basePred, open(filename,'wb'))

In [22]:
loaded_model = pickle.load(open('Random_Forest_Balance.sav', 'rb'))

In [23]:
filename = 'Logistic_Regression_Balance.sav'
pickle.dump(LR_y_basePred, open(filename,'wb'))

In [24]:
loaded_model = pickle.load(open('Logistic_Regression_Balance.sav', 'rb'))

In [25]:
filename = 'ADA_Boosting_Balance.sav'
pickle.dump(ADA_y_basePred, open(filename,'wb'))

In [26]:
loaded_model = pickle.load(open('ADA_Boosting_Balance.sav', 'rb'))

In [27]:
filename = 'Gardient_Boost_Balance.sav'
pickle.dump(Gbm_y_basePred, open(filename,'wb'))

In [28]:
loaded_model = pickle.load(open('Gardient_Boost_Balance.sav', 'rb'))