In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed session data.
#
# NOTE: the file name comes from the Preprocessing notebook. If loading fails
# here, check that notebook for the path it writes to.

dfpath = os.path.join(os.getcwd(), "data", "filtered_long.csv")
print(f"Loading file from: {dfpath}")

# header=0: the first row of the CSV supplies the column names.
df = pd.read_csv(dfpath, header=0)
df.head()

Loading file from: C:\Programming\CustomerAnalysis\GroupAssignment\data\filtered_long.csv


Unnamed: 0.1,Unnamed: 0,session,purchase,len,add
0,0,"(1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, ...",0,18,2
1,1,"(1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, ...",0,139,18
2,2,"(1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ...",0,41,16
3,3,"(1, 2, 3, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1)",0,16,2
4,4,"(1, 1, 1, 1, 2, 3, 4, 1, 2, 1, 1, 1, 4, 4, 4, ...",1,29,5


In [3]:
# Convert each session from its CSV string form, e.g. "(1, 2, 3)" or "[1,2,3]",
# into a tuple of ints.
def _parse_session(raw):
    """Return ``raw`` as a tuple of ints.

    Values that are already tuples are returned unchanged, so this cell can be
    re-run without failing. Empty tokens (from "()" or from a trailing comma as
    in "(1,)") are skipped instead of raising ``ValueError`` in ``int("")``.
    """
    if isinstance(raw, tuple):
        return raw
    tokens = raw.strip("()[]").split(",")
    return tuple(int(tok) for tok in tokens if tok.strip())


df.session = df.session.map(_parse_session)
type(df.session[0])

tuple

In [4]:
# Split set into training and validation set
from sklearn.model_selection import train_test_split

# Hold out the last 20% of the rows for evaluation. There is no real reason for
# this ratio (the PS used 70/30).
#
# NOTE: with shuffle=False the split is purely positional, so random_state has
# no effect on the result; it only matters if shuffling is switched on.
training_set, eval_set = train_test_split(
    df,
    test_size=0.2,
    shuffle=False,
    random_state=123,
)
eval_set.head()

Unnamed: 0.1,Unnamed: 0,session,purchase,len,add
129080,129080,"(1, 1, 2, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,17,6
129081,129081,"(1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 3, 1)",0,16,14
129082,129082,"(1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, ...",0,38,19
129083,129083,"(1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1)",1,13,5
129084,129084,"(1, 1, 2, 3, 1, 1, 2, 3, 1, 1, 2, 1, 1, 1, 1, ...",1,20,3


In [5]:
from sklearn import metrics
from collections import defaultdict

# Ideally the oracle would be a class that stores its own settings (e.g. clicks); plain functions are used here for simplicity.

def fit_oracle(data, clicks):
    """Build a lookup table of purchase rates per truncated session.

    Only rows with ``len - add > clicks`` are used. Each remaining session is
    cut to its first ``add + clicks + 1`` entries, and identical prefixes are
    grouped together.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``session``, ``purchase``, ``len`` and ``add``.
    clicks : int
        Number of entries kept beyond position ``add``; also the minimum
        (exclusive) value of ``len - add`` for a row to be used.

    Returns
    -------
    dict
        Maps ``str(tuple(prefix))`` to the fraction of rows with that prefix
        whose ``purchase`` value is truthy.
    """
    long_enough = data[(data['len'] - data['add']) > clicks]

    # prefix key -> [rows with a purchase, rows without one]
    tallies = {}
    rows = zip(long_enough['session'], long_enough['add'], long_enough['purchase'])
    for session, add, label in rows:
        key = str(tuple(session[:add + clicks + 1]))
        counts = tallies.setdefault(key, [0, 0])
        counts[0 if label else 1] += 1

    return {
        key: bought / (bought + skipped)
        for key, (bought, skipped) in tallies.items()
    }

def predict_oracle(oracle, data, clicks):
    """Score the oracle's purchase predictions with the F1 measure.

    Rows are filtered and truncated the same way as in ``fit_oracle``: only
    rows with ``len - add > clicks`` are kept and each session is cut to its
    first ``add + clicks + 1`` entries. A session is predicted as a purchase
    (1) when its oracle rate is strictly above the purchase rate of the
    filtered data, otherwise as 0.

    Parameters
    ----------
    oracle : dict
        Mapping from ``str(tuple(prefix))`` to a purchase rate, as returned by
        ``fit_oracle``.
    data : pandas.DataFrame
        Must provide the columns ``session``, ``purchase``, ``len`` and ``add``.
    clicks : int
        Same meaning as in ``fit_oracle``; use the value the oracle was fitted
        with, otherwise the prefixes will not match the oracle's keys.

    Returns
    -------
    float
        F1 score of the predictions against ``data.purchase``.

    Raises
    ------
    ValueError
        If no row of ``data`` satisfies ``len - add > clicks``.
    """
    data = data[(data['len'] - data['add']) > clicks]
    if data.empty:
        raise ValueError(
            "no sessions with len - add > {} to evaluate".format(clicks)
        )
    true = data['purchase']

    # The best threshold for assigning a label is not 50% but the share of
    # purchases among the evaluated rows. Taking the mean of a boolean mask
    # avoids float(<Series>), which fails when there are no purchases.
    ratio = float((true == 1).mean())

    # Keys are built exactly like in fit_oracle so lookups hit the same strings.
    keys = [
        str(tuple(session[:add + clicks + 1]))
        for session, add in zip(data['session'], data['add'])
    ]

    # Prefixes the oracle has never seen get rate 0.0 (i.e. "no purchase")
    # instead of None, which would raise TypeError in the comparison.
    predict = [1 if oracle.get(key, 0.0) > ratio else 0 for key in keys]

    # sklearn's signature is f1_score(y_true, y_pred).
    f1 = metrics.f1_score(true, predict)

    return f1

In [60]:
### The oracle is fitted on the evaluation set, not on the training set: it is a
### metric for the evaluation set. This also solves the 'missing pattern' issue
### and matches the practical session.
n_clicks = 5  # must be identical for fitting and predicting
oracle = fit_oracle(eval_set, n_clicks)
p5 = predict_oracle(oracle, eval_set, n_clicks)
print(p5)

0.24479604219172968
0.7088156723063223
