In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [6]:
plt.style.use('ggplot')

In [7]:
data = pd.read_csv("online_shoppers_intention.csv")

In [8]:
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [9]:
X = data.drop(["Revenue"], axis=1)
X.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True


In [10]:
Y = data["Revenue"]
Y.head()

0    False
1    False
2    False
3    False
4    False
Name: Revenue, dtype: bool

In [11]:
label_enc = LabelEncoder()
Y = label_enc.fit_transform(Y)
Y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [12]:
X_encoded = pd.get_dummies(X, columns=["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"], prefix=["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"], drop_first = True )
X_encoded.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_True
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1


In [13]:
X_encoded.shape

(12330, 68)

In [14]:
scaler = StandardScaler()
scale_df = X_encoded.iloc[:,:10]
scale_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0


In [20]:
vif = pd.DataFrame()
vif["vif"] = [variance_inflation_factor(scale_df.values,i) for i in range(scale_df.shape[1])]
vif["Features"] = scale_df.columns

#let's check the values
vif

Unnamed: 0,vif,Features
0,2.651491,Administrative
1,2.041915,Administrative_Duration
2,2.113723,Informational
3,1.777154,Informational_Duration
4,6.339345,ProductRelated
5,6.012012,ProductRelated_Duration
6,5.557495,BounceRates
7,5.978858,ExitRates
8,1.077806,PageValues
9,1.105706,SpecialDay


In [15]:
rem_df = X_encoded.iloc[:,10:]
rem_df.head()

Unnamed: 0,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,OperatingSystems_2,...,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_True
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [16]:
col_name = scale_df.columns

In [17]:
scale_df = scaler.fit_transform(scale_df)
scale_df = pd.DataFrame(scale_df, columns = [col_name])
scale_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.99461,-0.317178,-0.308821
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.29643,-0.045196,0.142551,-0.317178,-0.308821


In [18]:
X_merged = pd.concat([scale_df, rem_df], axis = 1)
X_merged.head()

Unnamed: 0,"(Administrative,)","(Administrative_Duration,)","(Informational,)","(Informational_Duration,)","(ProductRelated,)","(ProductRelated_Duration,)","(BounceRates,)","(ExitRates,)","(PageValues,)","(SpecialDay,)",...,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_True
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.99461,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.29643,-0.045196,0.142551,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,1


In [19]:
X_merged.shape

(12330, 68)

## LOGISTIC REGRESSION

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_merged, Y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(9864, 68) (9864,)
(2466, 68) (2466,)


In [73]:
lr = LogisticRegression(max_iter = 500)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:
y_pred = lr.predict(X_test)

In [75]:
lr.score(X_test, y_test)

0.8880778588807786

In [105]:
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[2037,   36],
       [ 240,  153]], dtype=int64)

In [106]:
true_positive = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_negative = conf_mat[1][1]

In [107]:
Accuracy = (true_positive + true_negative) / (true_positive +false_positive + false_negative + true_negative)
Accuracy

0.8880778588807786

In [108]:
Precision = true_positive/(true_positive+false_positive)
Precision

0.9826338639652678

In [109]:
Recall = true_positive/(true_positive+false_negative)
Recall

0.8945981554677207

In [110]:
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
F1_Score

0.9365517241379312

In [113]:
auc = roc_auc_score(y_test, y_pred)
auc

0.6859734205322522

## NAIVE BAYES

In [114]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [117]:
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [119]:
y_pred = model.predict(X_test)

In [120]:
print(accuracy_score(y_test, y_pred))

0.291970802919708


In [122]:
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[ 332, 1741],
       [   5,  388]], dtype=int64)

In [123]:
auc = roc_auc_score(y_test, y_pred)
auc

0.5737158596716048

In [None]:
## From Logistic Regression and Naive Bayes, the AUC score is better for logistic, hence that is a better model.

## SMOTE sampling

In [20]:
from imblearn.over_sampling import SMOTE

In [21]:
oversample = SMOTE()

In [22]:
X_bal, y_bal = oversample.fit_resample(X_merged, Y)

In [23]:
print(X_bal.shape, y_bal.shape)

(20844, 68) (20844,)


In [24]:
np.unique(y_bal, return_counts = True)

(array([0, 1], dtype=int64), array([10422, 10422], dtype=int64))

In [25]:
X_bal.head()

Unnamed: 0,"(Administrative,)","(Administrative_Duration,)","(Informational,)","(Informational_Duration,)","(ProductRelated,)","(ProductRelated_Duration,)","(BounceRates,)","(ExitRates,)","(PageValues,)","(SpecialDay,)",...,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_20,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_True
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.99461,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,0
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.29643,-0.045196,0.142551,-0.317178,-0.308821,...,0,0,0,0,0,0,0,0,1,1


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(16675, 68) (16675,)
(4169, 68) (4169,)


## Logistic Regression with SMOTE

In [39]:
lr = LogisticRegression(max_iter = 500)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
y_pred = lr.predict(X_test)

In [41]:
lr.score(X_test, y_test)

0.8697529383545215

In [42]:
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[1846,  245],
       [ 298, 1780]], dtype=int64)

In [43]:
true_positive = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_negative = conf_mat[1][1]

In [44]:
Accuracy = (true_positive + true_negative) / (true_positive +false_positive + false_negative + true_negative)
Accuracy

0.8697529383545215

In [45]:
Precision = true_positive/(true_positive+false_positive)
Precision

0.882831181252989

In [46]:
Recall = true_positive/(true_positive+false_negative)
Recall

0.8610074626865671

In [47]:
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
F1_Score

0.8717827626918535

In [None]:
# We can see that SMOTE sampling did balance the data, however the accuracy declined.

## Naive Bayes with SMOTE 

In [27]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [28]:
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
y_pred = model.predict(X_test)

In [30]:
print(accuracy_score(y_test, y_pred))

0.6361237706884145
