In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

# Basics
import numpy as np
import pandas as pd

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# Train Test Split
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold

# Imbalanced Dataset
from imblearn.over_sampling import SMOTE

# Standard Scaler
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,roc_auc_score

Using TensorFlow backend.


In [2]:
data=pd.read_csv('online_shoppers_intention.csv')
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
le = LabelEncoder()
data['Revenue'] = le.fit_transform(data['Revenue'])
data['Revenue'].value_counts()

0    10422
1     1908
Name: Revenue, dtype: int64

In [4]:
le = LabelEncoder()
data['Month'] = le.fit_transform(data['Month'])
data['Month'].value_counts()

6    3364
7    2998
5    1907
1    1727
8     549
9     448
0     433
3     432
4     288
2     184
Name: Month, dtype: int64

In [5]:
le = LabelEncoder()
data['VisitorType'] = le.fit_transform(data['VisitorType'])
data['VisitorType'].value_counts()

2    10551
0     1694
1       85
Name: VisitorType, dtype: int64

In [6]:
le = LabelEncoder()
data['Weekend'] = le.fit_transform(data['Weekend'])
data['Weekend'].value_counts()

0    9462
1    2868
Name: Weekend, dtype: int64

In [7]:
y=data['Revenue']
X=data.drop(['Revenue','Region','Administrative','Informational','ProductRelated','BounceRates'],axis=1)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
sc=StandardScaler()
# X_train=sc.fit_transform(X_train)
# X_test=sc.transform(X_test)
X_std=sc.fit_transform(X)

## Cross-Validation

## Logistic Regression

In [10]:
kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X_std), 1):
    X_train = X_std[train_index]
    y_train = y[train_index] 
    X_test = X_std[test_index]
    y_test = y[test_index]  
    sm = SMOTE(random_state=42)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train,y_train)
    model = LogisticRegression(random_state=0)
    model.fit(X_train_oversampled, y_train_oversampled)  
    y_pred = model.predict(X_test)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(X_test, y_test)*100}')
    print(f'Precision: {precision_score(y_test,y_pred)*100}')
    print(f'Recall: {recall_score(y_test,y_pred)*100}')
    print(f'AUC: {roc_auc_score(y_test,y_pred)*100}')
    print(f'f1-score: {f1_score(y_test, y_pred)*100}')
    print('\n')

For fold 1:
Accuracy: 93.43065693430657
Precision: 61.49068322981367
Recall: 83.89830508474576
AUC: 89.16888348407693
f1-score: 70.96774193548387


For fold 2:
Accuracy: 90.95701540957015
Precision: 54.48275862068965
Recall: 90.45801526717557
AUC: 90.7371746027348
f1-score: 68.00573888091822


For fold 3:
Accuracy: 80.12976480129764
Precision: 43.72163388804841
Recall: 71.007371007371
AUC: 76.47017409037807
f1-score: 54.11985018726592


For fold 4:
Accuracy: 84.30656934306569
Precision: 60.70175438596491
Recall: 67.97642436149313
AUC: 78.26516670297447
f1-score: 64.13345690454125


For fold 5:
Accuracy: 85.32035685320358
Precision: 62.547528517110266
Recall: 66.59919028340082
AUC: 78.30466613561522
f1-score: 64.50980392156862




In [11]:
auc=[.7830,.7827,.7647,.9074,.8917]
lr_auc_mean=np.mean(auc)
lr_auc_var=np.var(auc,ddof=1)
print('Mean :',lr_auc_mean)
print('Variance :',lr_auc_var)

Mean : 0.8259000000000001
Variance : 0.004605995


## Decision Tree

In [12]:
kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X_std), 1):
    X_train = X_std[train_index]
    y_train = y[train_index] 
    X_test = X_std[test_index]
    y_test = y[test_index]  
    sm = SMOTE(random_state=42)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train,y_train)
    model = DecisionTreeClassifier(random_state=0,criterion='gini',max_depth=5,max_features='auto',splitter='best')
    model.fit(X_train_oversampled, y_train_oversampled)  
    y_pred = model.predict(X_test)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(X_test, y_test)*100}')
    print(f'Precision: {precision_score(y_test,y_pred)*100}')
    print(f'Recall: {recall_score(y_test,y_pred)*100}')
    print(f'AUC: {roc_auc_score(y_test,y_pred)*100}')
    print(f'f1-score: {f1_score(y_test, y_pred)*100}')
    print('\n')

For fold 1:
Accuracy: 93.7956204379562
Precision: 60.66838046272493
Recall: 100.0
AUC: 96.5695067264574
f1-score: 75.52


For fold 2:
Accuracy: 89.213300892133
Precision: 49.621212121212125
Recall: 100.0
AUC: 93.9655172413793
f1-score: 66.32911392405065


For fold 3:
Accuracy: 83.45498783454988
Precision: 49.924585218702866
Recall: 81.32678132678133
AUC: 82.60122456334209
f1-score: 61.86915887850467


For fold 4:
Accuracy: 85.07704785077048
Precision: 61.88870151770658
Recall: 72.10216110019645
AUC: 80.27693645198887
f1-score: 66.60617059891108


For fold 5:
Accuracy: 80.0081103000811
Precision: 50.070921985815595
Recall: 71.45748987854252
AUC: 76.8037956492104
f1-score: 58.88240200166804




In [13]:
auc=[.7680,.8028,.8260,.9397,.9657]
dt_auc_mean=np.mean(auc)
dt_auc_var=np.var(auc,ddof=1)
print('Mean :',dt_auc_mean)
print('Variance :',dt_auc_var)

Mean : 0.86044
Variance : 0.007603863000000001


## Random Forest

In [14]:
kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X_std), 1):
    X_train = X_std[train_index]
    y_train = y[train_index] 
    X_test = X_std[test_index]
    y_test = y[test_index]  
    sm = SMOTE(random_state=42)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train,y_train)
    model = RandomForestClassifier(criterion='entropy',max_depth=10,max_features='sqrt',n_estimators=75)
    model.fit(X_train_oversampled, y_train_oversampled)  
    y_pred = model.predict(X_test)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(X_test, y_test)*100}')
    print(f'Precision: {precision_score(y_test,y_pred)*100}')
    print(f'Recall: {recall_score(y_test,y_pred)*100}')
    print(f'AUC: {roc_auc_score(y_test,y_pred)*100}')
    print(f'f1-score: {f1_score(y_test, y_pred)*100}')
    print('\n')

For fold 1:
Accuracy: 94.52554744525547
Precision: 64.38746438746439
Recall: 95.76271186440678
AUC: 95.07866534924375
f1-score: 77.00170357751279


For fold 2:
Accuracy: 89.57826439578265
Precision: 50.487329434697855
Recall: 98.85496183206108
AUC: 93.66523046231002
f1-score: 66.83870967741936


For fold 3:
Accuracy: 84.79318734793188
Precision: 52.58899676375405
Recall: 79.85257985257985
AUC: 82.81118550666875
f1-score: 63.414634146341456


For fold 4:
Accuracy: 84.18491484184915
Precision: 59.61227786752828
Recall: 72.4950884086444
AUC: 79.86021666216584
f1-score: 65.42553191489363


For fold 5:
Accuracy: 84.50932684509327
Precision: 59.859154929577464
Recall: 68.82591093117408
AUC: 78.63202240270672
f1-score: 64.030131826742




In [15]:
auc=[.7873,.8043,.8230,.9388,.9587]
rf_auc_mean=np.mean(auc)
rf_auc_var=np.var(auc,ddof=1)
print('Mean :',rf_auc_mean)
print('Variance :',rf_auc_var)

Mean : 0.86242
Variance : 0.006419656999999999


## K-NN

In [16]:
kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X_std), 1):
    X_train = X_std[train_index]
    y_train = y[train_index] 
    X_test = X_std[test_index]
    y_test = y[test_index]  
    sm = SMOTE(random_state=42)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train,y_train)
    model = KNeighborsClassifier(n_neighbors=16,weights='uniform')
    model.fit(X_train_oversampled, y_train_oversampled)  
    y_pred = model.predict(X_test)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(X_test, y_test)*100}')
    print(f'Precision: {precision_score(y_test,y_pred)*100}')
    print(f'Recall: {recall_score(y_test,y_pred)*100}')
    print(f'AUC: {roc_auc_score(y_test,y_pred)*100}')
    print(f'f1-score: {f1_score(y_test, y_pred)*100}')
    print('\n')

For fold 1:
Accuracy: 88.88888888888889
Precision: 45.20202020202021
Recall: 75.84745762711864
AUC: 83.05825796154139
f1-score: 56.64556962025317


For fold 2:
Accuracy: 86.05028386050284
Precision: 42.49084249084249
Recall: 88.54961832061069
AUC: 87.15139718208394
f1-score: 57.425742574257434


For fold 3:
Accuracy: 77.77777777777779
Precision: 40.02828854314003
Recall: 69.53316953316954
AUC: 74.47032444604082
f1-score: 50.80789946140035


For fold 4:
Accuracy: 75.50689375506894
Precision: 43.994943109987354
Recall: 68.36935166994105
AUC: 72.86633143026945
f1-score: 53.53846153846153


For fold 5:
Accuracy: 75.18248175182481
Precision: 42.75184275184275
Recall: 70.44534412955466
AUC: 73.40725624327631
f1-score: 53.21100917431192




In [17]:
auc=[.7341,.7287,.7447,.8715,.8306]
knn_auc_mean=np.mean(auc)
knn_auc_var=np.var(auc,ddof=1)
print('Mean :',knn_auc_mean)
print('Variance :',knn_auc_var)

Mean : 0.7819200000000001
Variance : 0.004224692000000002


## Naive Bayes

In [18]:
kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X_std), 1):
    X_train = X_std[train_index]
    y_train = y[train_index] 
    X_test = X_std[test_index]
    y_test = y[test_index]  
    sm = SMOTE(random_state=42)
    X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train,y_train)
    model = GaussianNB()
    model.fit(X_train_oversampled, y_train_oversampled)  
    y_pred = model.predict(X_test)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(X_test, y_test)*100}')
    print(f'Precision: {precision_score(y_test,y_pred)*100}')
    print(f'Recall: {recall_score(y_test,y_pred)*100}')
    print(f'AUC: {roc_auc_score(y_test,y_pred)*100}')
    print(f'f1-score: {f1_score(y_test, y_pred)*100}')
    print('\n')

For fold 1:
Accuracy: 78.75101378751013
Precision: 29.130434782608695
Recall: 85.16949152542372
AUC: 81.62062020217374
f1-score: 43.41252699784017


For fold 2:
Accuracy: 79.56204379562044
Precision: 31.49847094801223
Recall: 78.62595419847328
AUC: 79.14963771629655
f1-score: 44.97816593886463


For fold 3:
Accuracy: 71.12733171127331
Precision: 32.57142857142858
Recall: 70.02457002457002
AUC: 70.68494164171678
f1-score: 44.46177847113885


For fold 4:
Accuracy: 71.49229521492295
Precision: 40.081799591002046
Recall: 77.01375245579568
AUC: 73.53498046908334
f1-score: 52.7236045729657


For fold 5:
Accuracy: 69.78913219789132
Precision: 38.23805060918463
Recall: 82.5910931174089
AUC: 74.5866216094144
f1-score: 52.27418321588726




In [19]:
auc=[.7459,.7353,.7068,.7915,.8162]
nb_auc_mean=np.mean(auc)
nb_auc_var=np.var(auc,ddof=1)
print('Mean :',nb_auc_mean)
print('Variance :',nb_auc_var)

Mean : 0.75914
Variance : 0.0019465330000000018
