# Online Shoppers Purchasing Intention Prediction
Authors: Julian Daduica, Stephanie Ta, and Wai Ming Wong

In [None]:
from pathlib import Path

from ucimlrepo import fetch_ucirepo # raw data is from this package
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV

## Summary

## Introduction

## Methods

### Data

### Analysis

## Results and Discussion

In [3]:
# Dataset importing script from UCI ML Repository.
# Fetches the dataset with id=468 (needs network access), saves a raw copy,
# then splits it into training and testing sets.
online_shoppers_purchasing_intention_dataset = fetch_ucirepo(id=468)

# data (as pandas dataframes): join features and target into one dataframe
X = online_shoppers_purchasing_intention_dataset.data.features
y = online_shoppers_purchasing_intention_dataset.data.targets
df = pd.concat([X, y], axis=1)

# to_csv does not create missing parent directories, so make sure the output
# folders exist before writing; otherwise a fresh checkout fails here.
raw_dir = Path("../data/raw")
processed_dir = Path("../data/processed")
raw_dir.mkdir(parents=True, exist_ok=True)
processed_dir.mkdir(parents=True, exist_ok=True)

# save the untouched data as csv
df.to_csv(raw_dir / "raw_df.csv")

# variable information
print(online_shoppers_purchasing_intention_dataset.variables)

# split the training set and testing set (70/30, fixed seed for
# reproducibility) and save them as csv files
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)
train_df.to_csv(processed_dir / "train_df.csv")
test_df.to_csv(processed_dir / "test_df.csv")

# split X, y in the training set and testing set; "Revenue" is the target
X_train = train_df.drop(columns=["Revenue"])
X_test = test_df.drop(columns=["Revenue"])
y_train = train_df["Revenue"]
y_test = test_df["Revenue"]


                       name     role         type demographic description  \
0            Administrative  Feature      Integer        None        None   
1   Administrative_Duration  Feature      Integer        None        None   
2             Informational  Feature      Integer        None        None   
3    Informational_Duration  Feature      Integer        None        None   
4            ProductRelated  Feature      Integer        None        None   
5   ProductRelated_Duration  Feature   Continuous        None        None   
6               BounceRates  Feature   Continuous        None        None   
7                 ExitRates  Feature   Continuous        None        None   
8                PageValues  Feature      Integer        None        None   
9                SpecialDay  Feature      Integer        None        None   
10                    Month  Feature  Categorical        None        None   
11         OperatingSystems  Feature      Integer        None        None   

In [None]:
# NOTE(review): dead code — the same X/y split already runs at the end of the
# data-loading cell above. Remove this cell if it is no longer needed.
#X_train = train_df.drop(columns=["Revenue"])
#X_test = test_df.drop(columns=["Revenue"])
#y_train = train_df["Revenue"]
#y_test = test_df["Revenue"]

In [7]:
# Baseline model: its cross-validation score is the reference that the
# final model is compared to.
dummy_classifier = DummyClassifier().fit(X_train, y_train)

# 5-fold cross-validation on the training set, keeping the train scores too.
dummy_cv_results = cross_validate(
    dummy_classifier,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
dummy_cv_scores = pd.DataFrame(dummy_cv_results)

# Average the per-fold validation scores (the "test_score" column).
mean_dummy_validation_accuracy = dummy_cv_scores["test_score"].mean()
mean_dummy_validation_accuracy

np.float64(0.8494960081213042)

In [19]:
# Count how many training rows carry each month label, to see which labels
# actually occur before choosing an encoding for the column.
month_counts = X_train["Month"].value_counts()
month_counts

Month
May     2356
Nov     2087
Mar     1328
Dec     1230
Oct      381
Sep      313
Aug      306
Jul      296
June     197
Feb      137
Name: count, dtype: int64

In [None]:
# Feature columns grouped by how they are labelled for preprocessing.
# Each column name appears in exactly one list.

# Page counts/durations and the rate/value measurements.
numeric_cols = [
    "Administrative",
    "Administrative_Duration",
    "Informational",
    "Informational_Duration",
    "ProductRelated",
    "ProductRelated_Duration",
    "BounceRates",
    "ExitRates",
    "PageValues",
    "SpecialDay",
]

# Columns treated as unordered categories (including integer-coded ones).
categorical_cols = [
    "Weekend",
    "OperatingSystems",
    "Browser",
    "Region",
    "TrafficType",
    "VisitorType",
]

# Columns with a natural order.
ordinal_cols = ["Month"]

In [None]:

# Month labels in calendar order; the list position is the value the encoder
# assigns. 'June' is spelled out (unlike the other three-letter labels) to
# match the label used in the data.
month_levels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Bind the encoder to a name so that other cells can use it; a bare
# expression would only display the object and then drop it.
month_encoder = OrdinalEncoder(categories=[month_levels])
month_encoder

In [8]:
# preview the first rows of the training features
# NOTE(review): the preprocessor itself is not built in this cell.
X_train.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
2476,5,626.0,1,66.0,77,2660.15,0.0,0.008228,41.92519,0.0,May,1,1,3,2,Returning_Visitor,False
582,1,43.0,0,0.0,37,1505.166667,0.0,0.005263,0.0,0.0,Mar,2,2,4,8,New_Visitor,False
10029,0,0.0,0,0.0,9,258.291667,0.0,0.022222,0.0,0.0,Nov,2,5,3,2,Returning_Visitor,False
1154,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Mar,1,1,1,9,Returning_Visitor,True
1579,11,348.654762,1,11.0,64,1229.289286,0.0,0.002494,54.951269,0.0,Mar,2,2,1,2,New_Visitor,False


In [9]:
# show each column's dtype and non-null count to check for missing values
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8631 entries, 2476 to 3582
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           8631 non-null   int64  
 1   Administrative_Duration  8631 non-null   float64
 2   Informational            8631 non-null   int64  
 3   Informational_Duration   8631 non-null   float64
 4   ProductRelated           8631 non-null   int64  
 5   ProductRelated_Duration  8631 non-null   float64
 6   BounceRates              8631 non-null   float64
 7   ExitRates                8631 non-null   float64
 8   PageValues               8631 non-null   float64
 9   SpecialDay               8631 non-null   float64
 10  Month                    8631 non-null   object 
 11  OperatingSystems         8631 non-null   int64  
 12  Browser                  8631 non-null   int64  
 13  Region                   8631 non-null   int64  
 14  TrafficType              8

In [11]:
# summary statistics for every column; include="all" adds the non-numeric
# columns (unique/top/freq) alongside the numeric ones
X_train.describe(include="all")

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
count,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631,8631.0,8631.0,8631.0,8631.0,8631,8631
unique,,,,,,,,,,,10,,,,,3,2
top,,,,,,,,,,,May,,,,,Returning_Visitor,False
freq,,,,,,,,,,,2356,,,,,7366,6610
mean,2.318851,80.035963,0.496582,33.735985,31.506546,1179.548652,0.022252,0.04318,5.765987,0.06333,,2.129765,2.353261,3.150852,4.071371,,
std,3.326228,173.132521,1.244019,138.9954,44.119701,1895.590842,0.048634,0.048648,18.215382,0.202414,,0.925164,1.727358,2.408261,4.011918,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,1.0,,
25%,0.0,0.0,0.0,0.0,7.0,182.208333,0.0,0.014286,0.0,0.0,,2.0,2.0,1.0,2.0,,
50%,1.0,7.0,0.0,0.0,18.0,593.70198,0.003077,0.025466,0.0,0.0,,2.0,2.0,3.0,2.0,,
75%,4.0,93.115833,0.0,0.0,37.0,1439.177083,0.017124,0.05,0.0,0.0,,3.0,2.0,4.0,4.0,,


In [None]:
# create the logistic regression model and bind it to a name so it can be
# used for fitting and tuning (no tuning is performed in this cell)
# max_iter=2000 raises the solver's iteration cap; random_state fixes the seed
logistic_regression = LogisticRegression(max_iter=2000, random_state=123)
logistic_regression

## References