In [2]:
import pandas as pd
# Read the raw shoppers dataset; the trailing expression displays (rows, columns).
dfOnline = pd.read_csv(filepath_or_buffer="../data/online_shoppers_intention.csv")
dfOnline.shape

(12330, 18)

Apply MinMaxScaler or StandardScaler to the continuous features

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Columns handled by each scaler, named once so the selection and the
# resulting frame's header cannot drift apart.
duration_cols = ["Administrative_Duration", "Informational_Duration", "ProductRelated_Duration"]
count_cols = ["Administrative", "Informational", "ProductRelated", "PageValues"]

# Min-max scale the page-duration features.
scaler1 = MinMaxScaler()
MinMax = pd.DataFrame(data=dfOnline[duration_cols])
dfMinMax = pd.DataFrame(data=scaler1.fit_transform(MinMax), columns=duration_cols)

# Standardize the page-count features and PageValues.
scaler2 = StandardScaler()
Standard = pd.DataFrame(data=dfOnline[count_cols])
dfStandard = pd.DataFrame(data=scaler2.fit_transform(Standard), columns=count_cols)

Apply OneHotEncoder or OrdinalEncoder on categorical features

In [14]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features into a dense indicator frame whose
# columns are named "<feature>_<category>".
categorical_cols = ["SpecialDay", "Month", "OperatingSystems", "Browser",
                    "Region", "TrafficType", "VisitorType", "Weekend"]
dfOneHot = pd.DataFrame(dfOnline[categorical_cols])

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; fall back to the old spelling only when the new one is rejected.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
allOneHot = enc.fit_transform(dfOneHot)

# `get_feature_names` was superseded by `get_feature_names_out` (added in 1.0,
# old method removed in 1.2); prefer the new method when it exists.
if hasattr(enc, "get_feature_names_out"):
    colNames = list(enc.get_feature_names_out(categorical_cols))
else:
    colNames = list(enc.get_feature_names(categorical_cols))

dfOneHot_new = pd.DataFrame(data=allOneHot, columns=colNames)



Apply the LabelEncoder on the target variable

In [15]:
from sklearn.preprocessing import LabelEncoder
# Encode the target column as integer labels and wrap it in a one-column frame.
tar = LabelEncoder()
TaVar = list(dfOnline["Revenue"])
dfTaVar = pd.DataFrame(data=tar.fit_transform(TaVar), columns=["Revenue"])


Unchanged features

In [16]:
# These two rate features are carried over without any transformation.
dfUnchange = dfOnline.loc[:, ["BounceRates", "ExitRates"]]

Combine all the preprocessed data together

In [17]:
# Place every preprocessed piece side by side (column-wise); the target column
# comes last.
frames = [dfOneHot_new, dfMinMax, dfUnchange, dfStandard, dfTaVar]
dfCombined = pd.concat(frames, axis=1)

# Persist the combined frame for later steps.
dfCombined.to_csv('../data/prepocessedData.csv')

# Preview goes last: a notebook cell only renders its final expression, so a
# `.head()` placed mid-cell would be silently discarded.
dfCombined.head()


Check the missing values

In [18]:
# Fraction of missing entries per column (NaN count divided by row count).
missing_fraction = dfCombined.isnull().sum(axis=0) / len(dfCombined)
print(missing_fraction)


SpecialDay_0.0    0.000000
SpecialDay_0.2    0.000000
SpecialDay_0.4    0.000000
SpecialDay_0.6    0.000000
SpecialDay_0.8    0.000000
                    ...   
Administrative    0.001135
Informational     0.001135
ProductRelated    0.001135
PageValues        0.000000
Revenue           0.000000
Length: 81, dtype: float64


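From the fractions above, I notice that only a small share of observations (about 0.1% in the affected columns) contain missing values. I then ran Little's MCAR test to check whether the values are missing completely at random.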

In [20]:
import numpy as np
import pandas as pd
import math as ma
import scipy.stats as st

def checks_input_mcar_tests(data):
    """Validate the input handed to the MCAR test.

    Parameters
    ----------
    data:
        Candidate input for the MCAR test.

    Returns
    -------
    bool
        True when `data` is a pandas DataFrame whose columns are all float or
        integer typed and which contains at least one NaN. Otherwise an error
        message is printed and False is returned.
    """

    if not isinstance(data, pd.DataFrame):
        print("Error: Data should be a Pandas DataFrame")
        return False

    # The `np.float` / `np.int` aliases no longer exist in current NumPy, so
    # the dtype checks go through pandas' helpers instead. Every column is
    # inspected: a single non-numeric column is enough to reject the frame,
    # which is what the error message promises.
    is_float = pd.api.types.is_float_dtype
    is_integer = pd.api.types.is_integer_dtype
    if not all(is_float(dtype) or is_integer(dtype) for dtype in data.dtypes):
        print("Error: Dataset cannot contain other value types than floats and/or integers")
        return False

    # The test is about missingness, so complete data is not a valid input.
    if not data.isnull().values.any():
        print("Error: No NaN's in given data")
        return False

    return True



def mcar_test(data):
    """Implementation of Little's MCAR test.

    Parameters
    ----------
    data: Pandas DataFrame
        An incomplete dataset with samples as index and variables as columns.

    Returns
    -------
    p_value: Float
        Outcome of a chi-square test of the null hypothesis 'the missingness
        mechanism of the incomplete dataset is MCAR'.

    Raises
    ------
    Exception
        If `checks_input_mcar_tests(data)` rejects the input.
    """

    if not checks_input_mcar_tests(data):
        raise Exception("Input not correct")

    columns = data.columns.values
    n_var = data.shape[1]

    # mean and covariance estimates
    # ideally, this is done with a maximum likelihood estimator
    gmean = data.mean()
    gcov = data.cov()

    # Label each row with the id of its missing-data pattern. Patterns are
    # compared as whole boolean rows: encoding them as a float sum of 2**k
    # stops being unique once there are more than 53 columns, which would
    # merge distinct patterns into one group.
    missing = data.isnull().to_numpy()
    _, pattern_ids = np.unique(missing, axis=0, return_inverse=True)
    pattern_ids = np.ravel(pattern_ids)  # some NumPy versions return a 2-D inverse
    n_pat = int(pattern_ids.max()) + 1 if pattern_ids.size else 0

    # Accumulate the statistic d2 and the observed-variable count pj over all
    # patterns. The pattern ids live in a separate array, so the input frame
    # is neither copied nor given a helper column.
    pj = 0
    d2 = 0.0
    for i in range(n_pat):
        rows = data.loc[pattern_ids == i]
        observed = columns[~rows.isnull().any().to_numpy()]
        pj += len(observed)

        # deviation of this pattern's means from the overall means
        deviation = (rows[observed].mean() - gmean[observed]).to_numpy()
        sub_cov = gcov.loc[observed, observed].to_numpy()

        # deviation' * inv(sub_cov) * deviation, solved directly instead of
        # forming the explicit inverse
        d2 += len(rows) * np.dot(deviation, np.linalg.solve(sub_cov, deviation))

    df = pj - n_var

    # Survival function == 1 - cdf, without the cancellation error for
    # statistics far in the tail.
    p_value = st.chi2.sf(d2, df)

    return p_value


# Run the MCAR test on the combined frame and report the resulting p-value.
mcar_p_value = mcar_test(dfCombined)
print(mcar_p_value)

0.006894233374809544


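From the MCAR test, the p-value is around 0.007, which is much smaller than 0.05, so the null hypothesis that the values are missing completely at random is rejected. Because only 14 of the 12,330 observations are affected, I nevertheless decided to drop those observations.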

In [23]:
# Keep only the rows without any missing value, report the remaining shape,
# and save the result next to the other preprocessed data.
dfAfterDrop = dfCombined.dropna(axis=0, how="any")
print(dfAfterDrop.shape)
dfAfterDrop.to_csv(path_or_buf='../data/prepocessedData_Afterdrop.csv')

(12316, 81)


### EDA
##### The balance of the dataset

In [25]:
# Share of each target class among the rows that remain after dropping NaNs.
label = "Revenue"
class_counts = dfAfterDrop[label].value_counts()
print(class_counts / len(dfAfterDrop[label]))

0    0.84508
1    0.15492
Name: Revenue, dtype: float64
