In [99]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [100]:
file_name = 'online_shoppers_intention.csv'
data = pd.read_csv(file_name)

In [101]:
##after check, this dataset has no empty value##
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [102]:
##check the type of values in each column##
data.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [103]:
##drop those useless columns in the dataset##
data=data.drop(['Administrative', 'Administrative_Duration','Informational','Informational_Duration','PageValues'], axis=1)

In [104]:
data.dtypes

ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [105]:
##there are two "object" columns and we will encode them##
data.head()

Unnamed: 0,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,1,0.0,0.2,0.2,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,2,64.0,0.0,0.1,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,1,0.0,0.2,0.2,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,2,2.666667,0.05,0.14,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,10,627.5,0.02,0.05,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [106]:
##the column 'ProductRelated_Duration' needs to be normalized##
data_z=data.copy()
column='ProductRelated_Duration'
data_z[column] = MinMaxScaler().fit_transform(np.array(data_z[column]).reshape(-1,1))
data=data_z
data

Unnamed: 0,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,1,0.000000,0.200000,0.200000,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,2,0.001000,0.000000,0.100000,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,1,0.000000,0.200000,0.200000,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,2,0.000042,0.050000,0.140000,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,10,0.009809,0.020000,0.050000,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,53,0.027883,0.007143,0.029031,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,5,0.007280,0.000000,0.021333,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,6,0.002880,0.083333,0.086667,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,15,0.005408,0.000000,0.021053,0.0,Nov,2,2,3,11,Returning_Visitor,False,False


In [107]:
onehot=OneHotEncoder()
transformed_month=onehot.fit_transform(data[["Month"]])
data[onehot.categories_[0]] = transformed_month.toarray()

In [108]:
transformed_vistype=onehot.fit_transform(data[["VisitorType"]])
data[onehot.categories_[0]] = transformed_vistype.toarray()

In [109]:
data=data.drop(["Month","VisitorType"],axis=1)
data.head()

Unnamed: 0,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,SpecialDay,OperatingSystems,Browser,Region,TrafficType,Weekend,...,Jul,June,Mar,May,Nov,Oct,Sep,New_Visitor,Other,Returning_Visitor
0,1,0.0,0.2,0.2,0.0,1,1,1,1,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,0.001,0.0,0.1,0.0,2,2,1,2,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,0.0,0.2,0.2,0.0,4,1,9,3,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,4.2e-05,0.05,0.14,0.0,3,2,2,4,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,10,0.009809,0.02,0.05,0.0,3,3,1,4,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [110]:
##then we define y and X##
y=data['Revenue']
X=data.drop(['Revenue'],axis=1)

In [111]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=True)

In [112]:
##then we build an SVM to predict the column 'Revenue'##
svc1=svm.SVC(kernel ='rbf').fit(X_train, y_train)

In [113]:
##we predict the 'Revenue' using X_test##
y_pred=svc1.predict(X_test)

In [114]:
##get the testing precision##
test_prec=accuracy_score(y_test,y_pred)
print("the testing precision is",test_prec)

the testing precision is 0.8673965936739659


In [115]:
##get the classification report##
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.87      1.00      0.93      2139
        True       0.00      0.00      0.00       327

    accuracy                           0.87      2466
   macro avg       0.43      0.50      0.46      2466
weighted avg       0.75      0.87      0.81      2466



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
