# Online Shoppers Purchasing Intention Prediction
Authors: Julian Daduica, Stephanie Ta, and Wai Ming Wong

In [1]:
from ucimlrepo import fetch_ucirepo # raw data is from this package
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

## Summary

## Introduction

## Methods

### Data

### Analysis

## Results and Discussion

In [2]:
#Dataset importing script from UCI ML Repository
# fetch dataset 
online_shoppers_purchasing_intention_dataset = fetch_ucirepo(id=468) 

# data (as pandas dataframes) and save it as csv
X = online_shoppers_purchasing_intention_dataset.data.features 
y = online_shoppers_purchasing_intention_dataset.data.targets
df = pd.concat([X, y], axis=1)
df.to_csv("../data/raw/raw_df.csv")

# variable information 
print(online_shoppers_purchasing_intention_dataset.variables) 

# split the training set and testing set and save them as csv files
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)
train_df.to_csv("../data/processed/train_df.csv")
test_df.to_csv("../data/processed/test_df.csv")

# split X, y in the training set and testing set
X_train = train_df.drop(columns=["Revenue"])
X_test = test_df.drop(columns=["Revenue"])
y_train = train_df["Revenue"]
y_test = test_df["Revenue"]


                       name     role         type demographic description  \
0            Administrative  Feature      Integer        None        None   
1   Administrative_Duration  Feature      Integer        None        None   
2             Informational  Feature      Integer        None        None   
3    Informational_Duration  Feature      Integer        None        None   
4            ProductRelated  Feature      Integer        None        None   
5   ProductRelated_Duration  Feature   Continuous        None        None   
6               BounceRates  Feature   Continuous        None        None   
7                 ExitRates  Feature   Continuous        None        None   
8                PageValues  Feature      Integer        None        None   
9                SpecialDay  Feature      Integer        None        None   
10                    Month  Feature  Categorical        None        None   
11         OperatingSystems  Feature      Integer        None        None   

In [3]:
# create baseline model to compare final model to
dummy_classifier = DummyClassifier()
dummy_classifier.fit(X_train, y_train)
dummy_cv_scores = pd.DataFrame(
    cross_validate(dummy_classifier, X_train, y_train, cv = 5, return_train_score = True))
mean_dummy_validation_accuracy = dummy_cv_scores['test_score'].mean()
mean_dummy_validation_accuracy

np.float64(0.8494960081213042)

In [4]:
# lists of each type of feature
numeric_cols = ['Administrative', 'Administrative_Duration',
                'Informational', 'Informational_Duration',
                'ProductRelated', 'ProductRelated_Duration',
                'BounceRates', 'ExitRates',
                'PageValues', 'SpecialDay']
categorical_cols = ['Weekend', 'OperatingSystems',
                    'Browser', 'Region',
                    'TrafficType', 'VisitorType']
ordinal_cols = ['Month']

In [5]:
# make preproccessor, note imputation is not needed since there are no null values in the data set
month_levels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_cols),
    (OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore'), categorical_cols),
    (OrdinalEncoder(categories=[month_levels]), ordinal_cols)
)

In [6]:
# make pipeline including preprocessor and logistic regression model
log_reg_pipe = make_pipeline(
    preprocessor, LogisticRegression(max_iter=2000, random_state=123)
)

In [7]:
# tune hyperparameter C of the logistic regression model
param_grid = {
    "logisticregression__C": loguniform(1e-3, 1e3),
}

random_search = RandomizedSearchCV(
    log_reg_pipe,
    param_grid,
    n_iter=100,
    verbose=1,
    n_jobs=-1,
    random_state=123,
    return_train_score=True,
)

random_search.fit(X_train, y_train)

print("Best hyperparameter value: ", random_search.best_params_)
print("Best score: %0.3f" % (random_search.best_score_))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameter value:  {'logisticregression__C': np.float64(0.916453820211066)}
Best score: 0.887


Cross Validation Results:
- the best C for logistic regression: 0.916
- validation accuracy score of best logistic regression model: 0.887
- validation accuracy score of dummy classifier: 0.849
- validation accuracy score of best logistic regression model is slightly (3.8%) better than the validation accuracy score of the dummy classifier


In [8]:
# score dummy and best logistic regression model on test set
dummy_test_score = dummy_classifier.score(X_test, y_test)
log_reg_test_score = random_search.score(X_test, y_test)


print("Dummy classifier test score: %0.3f" % dummy_test_score)
print("Best logistic regression model test score: %0.3f" % log_reg_test_score)

Dummy classifier test score: 0.835
Best logistic regression model test score: 0.875




Testing results:
- test accuracy score of best logistic regression model: 0.875
- test accuracy score of dummy classifier: 0.835
- test accuracy score of best logisitic regression model is just a bit (4%) better than the test accuracy score of the dmmy classfier

In [9]:
# find weights of each feature
best_estimator = random_search.best_estimator_
feature_names = best_estimator['columntransformer'].get_feature_names_out() # get feature names
weights = best_estimator["logisticregression"].coef_ # get feature coefficients

feat_weights = pd.DataFrame(weights, columns = feature_names)
feat_weights = feat_weights.T.reset_index()
feat_weights = feat_weights.rename(columns={'index': 'feature', 0: "weight"})
feat_weights['feature'] = feat_weights['feature'].str.split('__', expand = True)[1]
feat_weights.sort_values(by='weight', ascending=False)


Unnamed: 0,feature,weight
8,PageValues,1.432432
30,Browser_12,0.621479
56,TrafficType_16,0.610121
17,OperatingSystems_7,0.491074
51,TrafficType_11,0.480625
...,...,...
16,OperatingSystems_6,-0.501563
53,TrafficType_13,-0.617016
55,TrafficType_15,-0.630044
21,Browser_3,-0.777964


In [10]:
# average the weights of each feature

feat_weights['overall_feature'] = feat_weights['feature'].str.split('_', expand = True)[0]

# absolute value the weights so that positive and negative ones don't cancel eachother out
feat_weights['absolute_value_weight'] = abs(feat_weights['weight'])

pd.DataFrame(feat_weights.groupby('overall_feature'
    ).mean(numeric_only=True
    ).sort_values('absolute_value_weight', ascending = False
    )['absolute_value_weight'])

Unnamed: 0_level_0,absolute_value_weight
overall_feature,Unnamed: 1_level_1
PageValues,1.432432
ExitRates,0.838121
TrafficType,0.285063
Browser,0.232961
OperatingSystems,0.212912
VisitorType,0.17787
Weekend,0.158143
ProductRelated,0.145723
Region,0.082729
Month,0.076894


For determining the target, the `ExitRates` and `PagesValues` features seem to be the most important.

## References