In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

In [3]:
dataset = pd.read_csv("online_shoppers_intention.csv")

In [4]:
dataset.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [5]:
dataset.shape

(12330, 18)

In [6]:
dataset.isnull().sum().sum()

np.int64(0)

In [7]:
dataset.duplicated().sum()

np.int64(125)

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [9]:
dataset.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [10]:
dataset['Revenue'] = dataset['Revenue'].replace((True ,False),(1,0))

In [11]:
dataset['Revenue'].unique()

array([0, 1])

In [12]:
dataset['Revenue'].value_counts()

Revenue
0    10422
1     1908
Name: count, dtype: int64

In [13]:
dataset['VisitorType'].unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [14]:
con  = dataset['VisitorType'] == 'Returning_Visitor'

dataset['Returning_Visitor'] = np.where(con ,1,0)

In [15]:
dataset.drop(columns = 'VisitorType', inplace= True)

In [16]:
dataset['Returning_Visitor'].value_counts()

Returning_Visitor
1    10551
0     1779
Name: count, dtype: int64

In [17]:
dataset['Weekend'] = dataset['Weekend'].replace((True,False),(1,0))

In [18]:
dataset['Month'].unique()

array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
       'Dec'], dtype=object)

In [19]:
dataset['Month'].value_counts(ascending=False)

Month
May     3364
Nov     2998
Mar     1907
Dec     1727
Oct      549
Sep      448
Aug      433
Jul      432
June     288
Feb      184
Name: count, dtype: int64

In [20]:
ordinal_encoder = OrdinalEncoder()

In [21]:
dataset['Month'] = ordinal_encoder.fit_transform(dataset[['Month']])

In [22]:
dataset['Month'].unique()

array([2., 5., 6., 8., 4., 3., 0., 7., 9., 1.])

In [23]:
result = dataset[dataset.columns[1:]].corr()['Revenue']

In [24]:
result1 = result.sort_values(ascending = False)
result1

Revenue                    1.000000
PageValues                 0.492569
ProductRelated             0.158538
ProductRelated_Duration    0.152373
Informational              0.095200
Administrative_Duration    0.093587
Month                      0.080150
Informational_Duration     0.070345
Weekend                    0.029295
Browser                    0.023984
TrafficType               -0.005113
Region                    -0.011595
OperatingSystems          -0.014668
SpecialDay                -0.082305
Returning_Visitor         -0.103843
BounceRates               -0.150673
ExitRates                 -0.207071
Name: Revenue, dtype: float64

In [25]:
X = dataset.drop('Revenue', axis = 1)
y = dataset.Revenue

In [26]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33, random_state = 42)

In [27]:
# Machine Learning Pipeline

In [28]:
def model_pipeline(X, model):
    n_c = X.select_dtypes(exclude=['object']).columns.tolist()
    c_c = X.select_dtypes(include=['object']).columns.tolist()

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant')),
        ('scaler', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('numeric', numeric_pipeline, n_c),
        ('categorical', categorical_pipeline, c_c)
    ], remainder='passthrough')

    final_steps = [
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=1)),
        ('feature_selection', SelectKBest(score_func = chi2, k = 6)),
        ('model', model)
    ]

    return IMBPipeline(steps = final_steps)


In [29]:
# Model Selection

In [30]:
def select_model(X, y, pipeline=None):
    classifiers = {}

    classifiers['RandomForestClassifier'] = RandomForestClassifier()
    classifiers['DecisionTreeClassifier'] = DecisionTreeClassifier()
    classifiers['KNeighborsClassifier'] = KNeighborsClassifier()
    classifiers['RidgeClassifier'] = RidgeClassifier()
    classifiers['BernoulliNB'] = BernoulliNB()
    classifiers['SVC'] = SVC()

    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns=cols)

    for key in classifiers:
        start_time = time.time()  
        print(f'model_pipeline run successfully: {key}')

        cv = cross_val_score(classifiers[key], X, y, cv=10, scoring='roc_auc')

        run_time = round((time.time() - start_time) / 60, 2)

        row = {
            'model': key,
            'run_time': run_time,
            'roc_auc': cv.mean()
        }

        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

    df_models = df_models.sort_values(by='roc_auc', ascending=False)

    return df_models

In [31]:
select_model(X_train,y_train)

model_pipeline run successfully: RandomForestClassifier
model_pipeline run successfully: DecisionTreeClassifier
model_pipeline run successfully: KNeighborsClassifier
model_pipeline run successfully: RidgeClassifier
model_pipeline run successfully: BernoulliNB
model_pipeline run successfully: SVC


Unnamed: 0,model,run_time,roc_auc
0,RandomForestClassifier,0.53,0.920781
3,RidgeClassifier,0.01,0.888456
4,BernoulliNB,0.0,0.862591
5,SVC,0.55,0.818776
2,KNeighborsClassifier,0.02,0.766095
1,DecisionTreeClassifier,0.03,0.741681


In [32]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

In [33]:
y_pred = model.predict(X_test)

In [34]:
roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)  

# Print results
print(f"ROC AUC: {roc_auc}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

ROC AUC: 0.7547377272886102
Accuracy: 0.8970262963873188
F1 Score: 0.6235399820305481


In [35]:
classif_report = classification_report(y_test, y_pred)

print(classif_report)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      3436
           1       0.72      0.55      0.62       633

    accuracy                           0.90      4069
   macro avg       0.82      0.75      0.78      4069
weighted avg       0.89      0.90      0.89      4069

