# Importing required librarie Pipeline

In [1]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# Loading online_shoppers_intention.csv dataset

In [2]:
df = pd.read_csv("C:\\Users\\ramir\Downloads\\online_shoppers_intention.csv")

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


# Feature Engineering

In [4]:
df['Weekend'] = df['Weekend'].replace((True, False), (1, 0))
df['Revenue'] = df['Revenue'].replace((True, False), (1, 0))

In [5]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0


In [6]:
condition = df['VisitorType']=='Returning_Visitor'
df['Returning_Visitor'] = np.where(condition, 1, 0)

df = df.drop(columns=['VisitorType'])

In [7]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,1,0,1


In [8]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,1,0,1


# Applying One Hot Encoding on Month column

In [9]:
ordinal_encoder = OrdinalEncoder()
df['Month'] = ordinal_encoder.fit_transform(df[['Month']])

In [10]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2.0,1,1,1,1,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2.0,2,2,1,2,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2.0,4,1,9,3,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2.0,3,2,2,4,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2.0,3,3,1,4,1,0,1


# Checking correlation on Revenue column

In [None]:
result = df[df.columns[1:]].corr()['Revenue']
result1 = result.sort_values(ascending=False)

In [12]:
result

Administrative_Duration    0.093587
Informational              0.095200
Informational_Duration     0.070345
ProductRelated             0.158538
ProductRelated_Duration    0.152373
BounceRates               -0.150673
ExitRates                 -0.207071
PageValues                 0.492569
SpecialDay                -0.082305
Month                      0.080150
OperatingSystems          -0.014668
Browser                    0.023984
Region                    -0.011595
TrafficType               -0.005113
Weekend                    0.029295
Revenue                    1.000000
Returning_Visitor         -0.103843
Name: Revenue, dtype: float64

# Prepairing Features as X and target as y

In [13]:
X = df.drop(['Revenue'], axis=1)
y = df['Revenue']

# Prepairing Train and Test Dataset

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

# Model Pipeline

In [15]:
def model_pipeline(X, model):  
    n_c = X.select_dtypes(exclude=['object']).columns.values.tolist()
    c_c = X.select_dtypes(include=['object']).columns.values.tolist()

    numeric_columns = list(n_c)
    categorical_columns = list(c_c)

    numeric_pipeline = SimpleImputer(strategy = 'constant')

    categorical_pipeline = OneHotEncoder(handle_unknown = 'ignore')

    a = ('numeric', numeric_pipeline, numeric_columns)
    b = ('categorical', categorical_pipeline, categorical_columns)

    preprocessor = ColumnTransformer(

    transformers = [a, b], 
    remainder = 'passthrough'

    )

    c = ('preprocessor', preprocessor)
    d = ('smote', SMOTE(random_state = 1))
    e = ('scaler', MinMaxScaler())
    f = ('feature_selection', SelectKBest(score_func = chi2, k = 6))
    g = ('model', model)

    bundled_pipeline = imbpipeline(steps = [c, d, e, f, g])

    return bundled_pipeline

# Model Selection

In [16]:
def select_model(X, y, pipeline=None):

    classifiers = {}

    c_d1 = {"DummyClassifier": DummyClassifier(strategy='most_frequent')}
    classifiers.update(c_d1)

    xgb = XGBClassifier(
        verbosity = 0, 
        use_label_encoder = False,
        eval_metric = 'logloss',
        objective = 'binary:logistic',
    )

    c_d2 = {"XGBClassifier": xgb}
    classifiers.update(c_d2)

    c_d3 = {"LGBMClassifier": LGBMClassifier()}
    classifiers.update(c_d3)

    c_d4 = {"RandomForestClassifier": RandomForestClassifier()}
    classifiers.update(c_d4)

    c_d5 = {"DecisionTreeClassifier": DecisionTreeClassifier()}
    classifiers.update(c_d5)

    c_d6 = {"ExtraTreeClassifier": ExtraTreeClassifier()}
    classifiers.update(c_d6)

    c_d7 = {"ExtraTreesClassifier": ExtraTreeClassifier()}
    classifiers.update(c_d7)  

    c_d8 = {"AdaBoostClassifier": AdaBoostClassifier()}
    classifiers.update(c_d8)

    c_d9 = {"KNeighborsClassifier": KNeighborsClassifier()}
    classifiers.update(c_d9)

    c_d10 = {"RidgeClassifier": RidgeClassifier()}
    classifiers.update(c_d10)

    c_d11 = {"SGDClassifier": SGDClassifier()}
    classifiers.update(c_d11)

    c_d12 = {"BaggingClassifier": BaggingClassifier()}
    classifiers.update(c_d12)

    c_d13 = {"BernoulliNB": BernoulliNB()}
    classifiers.update(c_d13)

    c_d14 = {"SVC": SVC()}
    classifiers.update(c_d14)

    c_d15 = {"MLPClassifier": MLPClassifier()}
    classifiers.update(c_d15)

    mlpc = {
        "MLPClassifier (paper)": MLPClassifier(hidden_layer_sizes=(27, 50),
        max_iter = 300,
        activation = 'relu',
        solver = 'adam',
        random_state = 1)
    }
    
    c_d16 = mlpc
    classifiers.update(c_d16)    

    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns = cols)

    for key in classifiers:
        
        start_time = time.time()
        
        print()
        print("model_pipeline run successfully on", key)

        pipeline = model_pipeline(X_train, classifiers[key])
        
        cv = cross_val_score(pipeline, X, y, cv=10, scoring='roc_auc')

        row = {'model': key,
               'run_time': format(round((time.time() - start_time)/60,2)),
               'roc_auc': cv.mean(),
        }

        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)
        
    df_models = df_models.sort_values(by='roc_auc', ascending=False)
    return df_models


# Access Model select_model function

In [17]:
models = select_model(X_train, y_train)




model_pipeline run successfully on DummyClassifier

model_pipeline run successfully on XGBClassifier





model_pipeline run successfully on LGBMClassifier

model_pipeline run successfully on RandomForestClassifier

model_pipeline run successfully on DecisionTreeClassifier

model_pipeline run successfully on ExtraTreeClassifier

model_pipeline run successfully on ExtraTreesClassifier

model_pipeline run successfully on AdaBoostClassifier

model_pipeline run successfully on KNeighborsClassifier

model_pipeline run successfully on RidgeClassifier

model_pipeline run successfully on SGDClassifier

model_pipeline run successfully on BaggingClassifier

model_pipeline run successfully on BernoulliNB

model_pipeline run successfully on SVC

model_pipeline run successfully on MLPClassifier

model_pipeline run successfully on MLPClassifier (paper)


In [18]:
models

Unnamed: 0,model,run_time,roc_auc
14,MLPClassifier,1.31,0.903394
15,MLPClassifier (paper),2.0,0.900118
2,LGBMClassifier,0.04,0.897217
1,XGBClassifier,0.09,0.891174
7,AdaBoostClassifier,0.07,0.888264
10,SGDClassifier,0.01,0.887675
13,SVC,0.83,0.885963
3,RandomForestClassifier,0.21,0.885747
11,BaggingClassifier,0.05,0.863954
12,BernoulliNB,0.01,0.857851


# Accessing best model and training

In [19]:
# selected_model = We can write code here to get best model name from DataFrame

In [20]:
selected_model = MLPClassifier()
bundled_pipeline = model_pipeline(X_train, selected_model)
bundled_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  SimpleImputer(strategy='constant'),
                                                  ['Administrative',
                                                   'Administrative_Duration',
                                                   'Informational',
                                                   'Informational_Duration',
                                                   'ProductRelated',
                                                   'ProductRelated_Duration',
                                                   'BounceRates', 'ExitRates',
                                                   'PageValues', 'SpecialDay',
                                                   'Month', 'OperatingSystems',
                                                   'Browser', 'Region',
           

# Accessing best model and training

In [35]:
y_pred = bundled_pipeline.predict(X_test)

In [36]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# ROC and AOC score

In [37]:
roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)

In [38]:
roc_auc

0.8353072322709618

In [39]:
accuracy

0.8764530954311976

In [40]:
f1_score

0.6779422128259338

# Classification report

In [42]:
classif_report = classification_report(y_test, y_pred)

In [43]:
print(classif_report)

              precision    recall  f1-score   support

           0       0.95      0.90      0.92      3077
           1       0.60      0.77      0.68       622

    accuracy                           0.88      3699
   macro avg       0.78      0.84      0.80      3699
weighted avg       0.89      0.88      0.88      3699

