In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load dataset
dfpath = os.path.join(os.getcwd(), "data", "training_data.csv")
print("Loading file from: {}".format(dfpath))

# Only store first 999999 rows for testing, comparable to data subset
df = pd.read_csv(dfpath, header=0, nrows=999999)
df.head()

Loading file from: C:\Programming\CustomerAnalysis\GroupAssignment\data\training_data.csv


Unnamed: 0,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
0,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885210881,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
1,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,61ef3869355b78e11011f39fc7ac8f8dfb209b3442a9d5...,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
2,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
3,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
4,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...


In [3]:
# Sessionization
# Taken from practical session file: Assumed this is about as efficient as it gets
df['product_action'] = df['product_action'].fillna('view')
df = df.groupby('session_id_hash')['product_action'].agg(tuple).reset_index()
del df['session_id_hash']
df.shape

(138074, 1)

In [4]:
# Drop all sessions not containing any 'add' events
df.drop(df[df.product_action.map(lambda x: 'add' not in x)].index, inplace=True)
df.shape

(5626, 1)

In [5]:
# Drop all sessions with a length shorter than 5 or greater than 155
# *Marginally* faster than practical session method (~.05s @ 1m rows). Readability?
df.drop(df[~df.product_action.map(len).between(5, 155)].index, inplace=True)
df = df.reset_index()
del df['index']
df.shape

(5229, 1)

In [6]:
# Slice the head of the sessions until an add event
"""
Original plan was to save the index of the first add event, so the full session
can be used to predict an outcome

Turns out it sucks to have to work with an additional index when building the model
Keeping this just in case it turns out there's a use for it after all

df['add_index'] = df['product_action'].map(lambda x: x.index('add'))
"""

df['product_action'] = df['product_action'].map(lambda x: x[x.index('add'):-1])
df.head()

Unnamed: 0,product_action
0,"(add, view, detail, view, view, view, detail, ..."
1,"(add, view, view, view, view, view, view, view..."
2,"(add, view, remove, view, view, view)"
3,"(add, remove, view, view, view, detail, view, ..."
4,"(add, view, view, detail, view, add, view, det..."


In [7]:
# Class labeling
# From practical session. Improvement possible?
df['purchase'] = np.where(df.product_action.map(set(['purchase']).issubset), 1, 0)

In [8]:
# Cutting down purchase sessions
# Based on Carlijn Jurriaan's solution on Canvas discussion board
df['product_action'] = df['product_action'].map(lambda x: x[0:x.index('purchase')] if 'purchase' in x else x)
df = df[df['product_action'].str.len() > 0]
df.head()

Unnamed: 0,product_action,purchase
0,"(add, view, detail, view, view, view, detail, ...",0
1,"(add, view, view, view, view, view, view, view...",0
2,"(add, view, remove, view, view, view)",0
3,"(add, remove, view, view, view, detail, view, ...",0
4,"(add, view, view, detail, view, add, view, det...",0


In [9]:
# Symbolization
# Based on practical session 1. Counter doesn't seem that useful but unsure about alternative
counts = Counter([item for session in df['product_action'] for item in session])
symbol_alpha = {action : idx for idx, action in enumerate(counts, 1)}
print(counts, '\nSymbol alphabet: ', symbol_alpha)

# Overwriting the product action column. Maybe numpy arrays would be faster?
df['session'] = df['product_action'].map(lambda session: tuple([symbol_alpha[action] for action in session]))
df = df[['session', 'purchase']]
df.head()

Counter({'view': 51496, 'detail': 14669, 'add': 7630, 'remove': 6116}) 
Symbol alphabet:  {'add': 1, 'view': 2, 'detail': 3, 'remove': 4}


Unnamed: 0,session,purchase
0,"(1, 2, 3, 2, 2, 2, 3, 2, 3, 2, 1, 2, 2, 2, 2, ...",0
1,"(1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2, 3, ...",0
2,"(1, 2, 4, 2, 2, 2)",0
3,"(1, 4, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 4, 2, 3, ...",0
4,"(1, 2, 2, 3, 2, 1, 2, 3, 2, 1, 2, 2)",0


In [15]:
# Split set into training and evaluation for Oracle (Is this even necessary?)
# The lecture / practical session started to fall apart around this point
from sklearn.model_selection import train_test_split

training_set, eval_set = train_test_split(df, test_size=0.3, random_state=52)
eval_set.head()

Unnamed: 0,session,purchase
506,"(1, 1, 2, 2, 2)",0
3475,"(1, 2, 2, 2, 3, 2, 3, 2)",0
378,"(1, 2, 2, 2, 2)",0
921,"(1, 2, 2, 4, 2, 2)",1
397,"(1, 2, 2)",1


In [17]:
# Return an oracle file for N clicks after add event (e.g. event 0)
def make_oracle(max_clicks):
    """
    The presentation of this part was uhhhh really fucking bad
    
    I have no idea what is meant by the '5 clicks post add-to-cart' in the assignment
    From description: " if a session ends 12 clicks after the first add-to-cart event, 
    it should not feature in the model evaluation at 15 clicks post add-to-cart."
    
    Does that mean it should only evaluate sessions longer than N clicks?
    Honestly I'm also still not sure how you're supposed to calculate F1 from this. 
    """
    
    # Find the count of each unique session, both for purchase and general sessions
    n_buy = eval_set.session[(eval_set['purchase'] == 1)].value_counts()
    n_all = eval_set.session.value_counts()
    
    oracle = {}

    for key in n_all.keys():
        precision = n_buy.get(key, default=0) / n_all.get(key, default=0)
        oracle[key] = precision  
    return oracle

oracle5 = make_oracle(10)
print(oracle5)

{(1, 2, 2): 0.43137254901960786, (1, 2): 0.29292929292929293, (1,): 0.0, (1, 2, 2, 2): 0.2962962962962963, (1, 2, 2, 2, 2): 0.2692307692307692, (1, 2, 2, 2, 2, 2): 0.5238095238095238, (1, 2, 3): 0.0, (1, 2, 2, 2, 2, 2, 2): 0.4, (1, 2, 2, 2, 2, 2, 2, 2, 2, 2): 0.5833333333333334, (1, 3): 0.0, (1, 2, 2, 2, 2, 2, 2, 2, 2): 0.45454545454545453, (1, 2, 2, 3, 2): 0.0, (1, 2, 2, 3): 0.0, (1, 2, 2, 2, 2, 2, 2, 2): 0.3333333333333333, (1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2): 0.25, (1, 1, 2, 2): 0.0, (1, 4): 0.0, (1, 1, 2, 2, 2): 0.42857142857142855, (1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2): 0.2857142857142857, (1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2): 0.3333333333333333, (1, 2, 3, 2): 0.0, (1, 1): 0.0, (1, 3, 2, 3): 0.0, (1, 2, 3, 2, 2, 2): 0.3333333333333333, (1, 2, 2, 3, 2, 2, 3): 0.0, (1, 2, 2, 2, 3, 2, 2): 0.0, (1, 1, 2, 2, 2, 2, 2): 0.6, (1, 2, 2, 2, 2, 2, 3, 2): 0.0, (1, 3, 2, 2): 0.2, (1, 2, 2, 2, 2, 2, 2, 3): 0.0, (1, 2, 3, 2, 2): 0.0, (1, 2, 2, 4): 0.0, (1, 2, 3, 2, 3): 0.0, (1, 2, 2, 3, 2, 2, 2)