# The questions to be answered by this analysis are:

### 1 What are the key metrics that contribute the most towards predicting a shopper's behavior?

### 2 Which variables are most important in explaining sessions that generated revenue?

### 3 What is the profile of the 'Right Customer' based on the metrics and variables included in the dataset?

### 4 Are the conversion rates of new visitors higher than those of returning customers?

## The data was reviewed and found to be ready for use. No additional cleaning is needed.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import fuzzywuzzy

##  Math & Statistics
import math
from collections import Counter
import scipy.stats as ss
from scipy.stats import chi2, \
                        chi2_contingency

#Metrics
from sklearn.metrics import accuracy_score, \
                            precision_score, \
                            recall_score, \
                            confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the online-shoppers dataset; the path is relative to the notebook's working directory.
file_path = "Resources/online_shoppers_intention.csv"
shoppers_df = pd.read_csv(file_path)
# NOTE(review): the preview printed below contains an 'Unnamed: 0' column, which looks like
# a row index that was saved into the CSV. If so, it will be passed on as a feature unless
# it is dropped (or read with index_col=0) -- TODO confirm before modelling.
# Preview the first rows to sanity-check the column names and dtypes.
shoppers_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


## Functions Required for answering question 1
`What are the key metrics that contribute the most towards predicting a shopper's behavior?`

In [3]:
# Function that executes the chi-square test of independence
def test_dependency(alpha, cont):
    """Print whether a contingency table's variables look dependent.

    Runs ``chi2_contingency`` on ``cont`` and compares the resulting
    statistic with the chi-square critical value at confidence level
    ``1 - alpha`` for the table's degrees of freedom.

    Parameters
    ----------
    alpha : float
        Significance level of the test (e.g. 0.05).
    cont : array-like
        Contingency table of observed frequencies.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    statistic, _p_value, dof, _expected = chi2_contingency(cont)

    # Value the statistic has to reach for H0 (independence) to be rejected.
    threshold = chi2.ppf(1 - alpha, dof)

    if abs(statistic) < threshold:
        verdict = 'independent (fail to reject H0)'
    else:
        verdict = 'dependent (reject H0)'
    print(verdict)


def conditional_entropy(x,y):
    """Return the conditional entropy H(x | y) in nats.

    ``x`` and ``y`` are paired observations of two categorical variables.
    The result is ``sum over (x, y) of p(x, y) * log(p(y) / p(x, y))``, with
    the probabilities estimated from the observed frequencies.
    Empty input yields ``0``.
    """
    marginal_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_obs = sum(marginal_counts.values())

    result = 0
    for pair, pair_count in joint_counts.items():
        p_joint = pair_count / n_obs
        # pair is (x_value, y_value); the marginal is looked up by the y part.
        p_marginal = marginal_counts[pair[1]] / n_obs
        result += p_joint * math.log(p_marginal / p_joint)
    return result

# Uncertainty coefficient or Theil's U
def theil_u(x,y):
    """Return Theil's U (the uncertainty coefficient) of ``x`` given ``y``.

    Computed as ``(H(x) - H(x | y)) / H(x)``: the fraction of the entropy of
    ``x`` that is removed by knowing ``y``. The measure is asymmetric, so
    ``theil_u(x, y)`` and ``theil_u(y, x)`` generally differ.

    When ``H(x)`` is zero (``x`` takes a single value) the function
    returns ``1``.
    """
    # The conditional entropy is evaluated first, exactly as before.
    cond_entropy = conditional_entropy(x, y)

    frequencies = Counter(x)
    n_obs = sum(frequencies.values())
    probabilities = [count / n_obs for count in frequencies.values()]
    marginal_entropy = ss.entropy(probabilities)

    if marginal_entropy == 0:
        return 1
    return (marginal_entropy - cond_entropy) / marginal_entropy

## Use of Machine Learning to preprocess the dataset

In [4]:
import fuzzywuzzy
from fuzzywuzzy import process

class CustomPreProcessing():
    """Transformer that turns the shoppers dataset's categorical columns into numbers.

    The class exposes ``fit``/``transform`` so that it can be used as a
    pipeline step. ``transform`` works on a copy of its input and, in order:

    1. harmonises near-duplicate spellings in ``Month`` by fuzzy matching
       against the canonical abbreviations in ``MONTHS``;
    2. maps ``Month`` to its calendar number (1-12);
    3. maps ``VisitorType`` to the position of the value in ``VISITOR_TYPES``;
    4. maps ``Weekend`` from ``True``/``False`` to ``1``/``0``.

    Values that have no entry in a mapping become ``NaN`` (``Series.map``).
    """

    # Canonical month abbreviations in calendar order: position + 1 is the
    # month number, so the order of this tuple is significant.
    MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    # Visitor categories; the position in the tuple is the numeric code.
    VISITOR_TYPES = ('Returning_Visitor', 'New_Visitor', 'Other')

    def __init__(self, min_ratio = 80, verbose = False):
        """Store the configuration.

        Parameters
        ----------
        min_ratio : int
            Minimum fuzzy-match score a value must reach to be replaced by
            the canonical string it was compared with.
        verbose : bool
            When true, the name of each step is printed as it runs.
        """
        if verbose:
            print("__init__")
        self.min_ratio = min_ratio
        # Placeholder frame until transform() receives real data.
        self.data = pd.DataFrame({'No data' : []})
        self.verbose = verbose

    def replace_matches_in_column(self, column, string_to_match, verbose=False):
        """Overwrite values of ``column`` that closely match ``string_to_match``.

        Every distinct value of ``self.data[column]`` is scored against
        ``string_to_match``; values scoring at least ``self.min_ratio`` are
        replaced, in place, by ``string_to_match``.

        Raises
        ------
        KeyError
            If ``column`` is not present in ``self.data``.
        """
        # Distinct values currently present in the column.
        strings = self.data[column].unique()

        # The ten best-scoring candidates for the requested string.
        matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                            limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

        # Keep only the candidates whose score reaches the configured minimum.
        close_matches = [match[0] for match in matches if match[1] >= self.min_ratio]
        if verbose:
            print(f'Searching for:{string_to_match}, close match: {close_matches}')

        # Rows holding one of the close matches get the canonical string.
        rows_with_matches = self.data[column].isin(close_matches)
        self.data.loc[rows_with_matches, column] = string_to_match

    def rem_inconsistent_date(self, X, values):
        """Normalise the spelling of ``Month`` towards each string in ``values``.

        ``X`` is accepted for backward compatibility but is not used; the
        method always operates on ``self.data``.
        """
        if self.verbose:
            print("rem_inconsistentdata")

        for val in values:
            self.replace_matches_in_column('Month', val)

    def map_months(self, months):
        """Replace ``Month`` strings by their 1-based position in ``months``."""
        if self.verbose:
            print('map_months')
        month_numbers = {name: position + 1 for position, name in enumerate(months)}
        self.data['Month'] = self.data['Month'].map(month_numbers)

    def map_visitors(self, visitor_type):
        """Replace ``VisitorType`` strings by their 0-based position in ``visitor_type``."""
        if self.verbose:
            print('map_visitor_type')
        visitor_codes = {name: position for position, name in enumerate(visitor_type)}
        self.data['VisitorType'] = self.data['VisitorType'].map(visitor_codes)

    def map_weekend(self):
        """Replace the boolean ``Weekend`` column by ``1``/``0``."""
        if self.verbose:
            print('map_weekeend')
        self.data['Weekend'] = self.data['Weekend'].map({True:1, False:0})

    def fit(self, X, y = None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        if self.verbose:
            print('fit')
        return self

    def transform(self, X, y = None):
        """Return a preprocessed copy of ``X``; ``X`` itself is left untouched.

        A missing column (``KeyError``) is reported with ``print`` and the
        remaining steps are skipped, so the frame is returned as far as it
        got. Any other exception now propagates to the caller; previously a
        ``return`` inside ``finally`` silently discarded it.
        """
        if self.verbose:
            print("transform")

        self.data = X.copy()

        try:
            # Harmonise spelling variants before mapping to numbers.
            self.rem_inconsistent_date(self.data, self.MONTHS)

            # Month: str -> 1..12
            self.map_months(self.MONTHS)

            # VisitorType: str -> 0..2
            self.map_visitors(self.VISITOR_TYPES)

            # Weekend: bool -> 0/1
            self.map_weekend()
        except KeyError as ke:
            # Best effort: report the missing column and return what we have.
            print(ke)

        return self.data

    def __repr__(self):
        return 'CustomPreProcessing'

    def getdata(self):
        """Return the frame produced by the most recent ``transform`` call."""
        return self.data

## Creation of the baseline for the model

In [6]:
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# Metrics requested from cross_validate for every fold.
scoring = ('roc_auc','accuracy','precision','recall')

# Baseline model: the custom preprocessing step followed by an XGBoost classifier.
baseline = Pipeline([
                ('Preprocessor',CustomPreProcessing(min_ratio=80, verbose=False)),
                ('model',XGBClassifier(eval_metric='logloss'))])
# NOTE(review): X_train_full and y_train are not defined in any of the visible cells, and
# the recorded output of this cell is a NameError. A train/test split of shoppers_df
# (features vs. the 'Revenue' target) presumably has to run before this line -- TODO confirm
# and restore the missing cell.
scores = cross_validate(baseline, X_train_full, y_train, cv=5, scoring=scoring)

NameError: name 'X_train_full' is not defined