# Online Shoppers Purchasing Intention Prediction
Authors: Julian Daduica, Stephanie Ta, and Wai Ming Wong

In [29]:
from pathlib import Path

import altair as alt
import pandas as pd
from scipy.stats import loguniform
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from ucimlrepo import fetch_ucirepo # raw data is from this package

## Summary

## Introduction

## Methods

### Data

### Analysis

## Results and Discussion

In [3]:
# Dataset importing script from UCI ML Repository
# fetch dataset (UCI repository id 468)
online_shoppers_purchasing_intention_dataset = fetch_ucirepo(id=468)

# make sure the output folders exist before writing: DataFrame.to_csv does not
# create missing parent directories, so a checkout without them would fail here
for out_dir in ("../data/raw", "../data/processed"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# data (as pandas dataframes): join features and target column-wise and save as csv
X = online_shoppers_purchasing_intention_dataset.data.features
y = online_shoppers_purchasing_intention_dataset.data.targets
df = pd.concat([X, y], axis=1)
df.to_csv("../data/raw/raw_df.csv")

# variable information
print(online_shoppers_purchasing_intention_dataset.variables)

# split the training set (70%) and testing set (30%) and save them as csv files;
# random_state is fixed so the split is reproducible
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)
train_df.to_csv("../data/processed/train_df.csv")
test_df.to_csv("../data/processed/test_df.csv")

# split X, y in the training set and testing set ("Revenue" is the target column)
X_train = train_df.drop(columns=["Revenue"])
X_test = test_df.drop(columns=["Revenue"])
y_train = train_df["Revenue"]
y_test = test_df["Revenue"]


                       name     role         type demographic description  \
0            Administrative  Feature      Integer        None        None   
1   Administrative_Duration  Feature      Integer        None        None   
2             Informational  Feature      Integer        None        None   
3    Informational_Duration  Feature      Integer        None        None   
4            ProductRelated  Feature      Integer        None        None   
5   ProductRelated_Duration  Feature   Continuous        None        None   
6               BounceRates  Feature   Continuous        None        None   
7                 ExitRates  Feature   Continuous        None        None   
8                PageValues  Feature      Integer        None        None   
9                SpecialDay  Feature      Integer        None        None   
10                    Month  Feature  Categorical        None        None   
11         OperatingSystems  Feature      Integer        None        None   

In [None]:
# NOTE(review): dead cell. These four commented-out assignments duplicate the
# X/y split already performed at the end of the data-loading cell; consider
# deleting this cell.
#X_train = train_df.drop(columns=["Revenue"])
#X_test = test_df.drop(columns=["Revenue"])
#y_train = train_df["Revenue"]
#y_test = test_df["Revenue"]

In [7]:
# Baseline to compare the final model against: a DummyClassifier with default
# settings, scored with 5-fold cross-validation on the training set.
dummy_classifier = DummyClassifier()
dummy_classifier.fit(X_train, y_train)  # leaves dummy_classifier fitted on the full training set

dummy_cv_results = cross_validate(
    dummy_classifier,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
dummy_cv_scores = pd.DataFrame(dummy_cv_results)

# average of the per-fold validation scores (the 'test_score' column)
mean_dummy_validation_accuracy = dummy_cv_scores["test_score"].mean()
mean_dummy_validation_accuracy

np.float64(0.8494960081213042)

In [22]:
# Feature names grouped by the kind of preprocessing each group receives.

# numeric features
numeric_cols = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
]

# categorical features
categorical_cols = [
    'Weekend',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'VisitorType',
]

# ordinal features
ordinal_cols = ['Month']

In [31]:
# Build the preprocessor. Note: imputation is not needed since there are no
# null values in the data set.

# Calendar order used as the category order when ordinally encoding Month.
# NOTE(review): 'June' is the only entry that is not a three-letter
# abbreviation; presumably this mirrors the spelling in the raw data, so
# confirm before "fixing" it.
month_levels = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]

# one transformer per feature group
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    drop='if_binary',
    sparse_output=False,
    handle_unknown='ignore',
)
ordinal_transformer = OrdinalEncoder(categories=[month_levels])

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_cols),
    (categorical_transformer, categorical_cols),
    (ordinal_transformer, ordinal_cols),
)

In [32]:
# Chain the preprocessor and a logistic regression classifier into one pipeline.
# max_iter is set to 2000 and random_state is fixed at 123.
log_reg_model = LogisticRegression(max_iter=2000, random_state=123)
log_reg_pipe = make_pipeline(preprocessor, log_reg_model)

In [34]:
# Tune hyperparameter C of the logistic regression model with a randomized search.
# C is drawn from a log-uniform distribution over [1e-3, 1e3]; the key targets
# the pipeline's "logisticregression" step.
param_grid = {"logisticregression__C": loguniform(1e-3, 1e3)}

random_search = RandomizedSearchCV(
    estimator=log_reg_pipe,
    param_distributions=param_grid,
    n_iter=100,            # number of sampled candidates
    random_state=123,      # fixed seed for the sampling
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

random_search.fit(X_train, y_train)

# report the best sampled C and its mean cross-validation score
print("Best hyperparameter value: ", random_search.best_params_)
print("Best score: %0.3f" % (random_search.best_score_))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameter value:  {'logisticregression__C': np.float64(0.916453820211066)}
Best score: 0.887


## References