In [301]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [302]:
#run env_setup to download dataset
file_name = 'online_shoppers_intention.csv'
data = pd.read_csv(file_name)

In [303]:
##after check, this dataset has no empty value##
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [304]:
##check the type of values in each column##
data.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [305]:
##drop those useless columns in the dataset##
data=data.drop(['Administrative', 'Administrative_Duration','Informational','Informational_Duration','PageValues'], axis=1)

In [306]:
data.dtypes

ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [307]:
##there are two "object" columns and we will encode them##
data.head()

Unnamed: 0,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,1,0.0,0.2,0.2,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,2,64.0,0.0,0.1,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,1,0.0,0.2,0.2,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,2,2.666667,0.05,0.14,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,10,627.5,0.02,0.05,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [308]:
onehot=OneHotEncoder()
transformed_month=onehot.fit_transform(data[["Month"]])
data[onehot.categories_[0]] = transformed_month.toarray()

In [309]:
transformed_vistype=onehot.fit_transform(data[["VisitorType"]])
data[onehot.categories_[0]] = transformed_vistype.toarray()

In [310]:
data=data.drop(["Month","VisitorType"],axis=1)
data.head()

Unnamed: 0,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,SpecialDay,OperatingSystems,Browser,Region,TrafficType,Weekend,...,Jul,June,Mar,May,Nov,Oct,Sep,New_Visitor,Other,Returning_Visitor
0,1,0.0,0.2,0.2,0.0,1,1,1,1,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,64.0,0.0,0.1,0.0,2,2,1,2,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,0.0,0.2,0.2,0.0,4,1,9,3,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,2.666667,0.05,0.14,0.0,3,2,2,4,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,10,627.5,0.02,0.05,0.0,3,3,1,4,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [311]:
##then we define y and X##
y=data['Revenue']
X=data.drop(['Revenue'],axis=1)
##Scale X##
X=StandardScaler().fit_transform(X)

In [312]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=True)

In [313]:
##then we build an Logistic Regression to predict the column 'Revenue'##
log_reg = LogisticRegression(penalty='l1', solver='saga', max_iter=1000)
log_reg.fit(X=X_train, y=y_train)

In [317]:
##we predict the 'Revenue' using X_test##
y_pred = log_reg.predict(X_test)

array([False, False, False, ..., False, False, False])

In [315]:
##get the testing precision##
test_prec=accuracy_score(y_test,y_pred)
print("the testing precision is",test_prec)

the testing precision is 0.856853203568532


In [316]:
##get the classification report##
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.86      1.00      0.92      2113
        True       0.50      0.02      0.03       353

    accuracy                           0.86      2466
   macro avg       0.68      0.51      0.48      2466
weighted avg       0.81      0.86      0.80      2466

