In [None]:
import os
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Load dataset
# The CSV is expected in a "data" folder under the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
dfpath = os.path.join(data_dir, "training_data.csv")
print("Loading file from: {}".format(dfpath))

# Read only the first 999999 rows (first line of the file is the header);
# kept small for testing, comparable to the data subset.
df = pd.read_csv(dfpath, nrows=999999, header=0)
df.head()

In [None]:
# Sessionization
# Approach taken from the practical session file.
# Rows without a recorded product action are filled in as 'view'.
df['product_action'] = df['product_action'].fillna('view')

# Collapse every session into a single tuple of its actions. The session hash
# is only needed as the grouping key, so it is discarded right away.
actions_per_session = df.groupby('session_id_hash')['product_action'].agg(tuple)
df = actions_per_session.reset_index(drop=True).to_frame()
df.shape

In [None]:
# Drop all sessions not containing any 'add' events
lacks_add = df['product_action'].map(lambda actions: 'add' not in actions)
rows_to_drop = df[lacks_add].index
df = df.drop(index=rows_to_drop)
df.shape

In [None]:
# Drop all sessions with a length shorter than 5 or greater than 155
# (both bounds are inclusive: lengths 5 and 155 are kept).
session_lengths = df['product_action'].map(len)
within_bounds = session_lengths.between(5, 155)

# Keep the matching rows and renumber them from 0 without retaining the old index.
df = df[within_bounds].reset_index(drop=True)
df.shape

In [None]:
# Slice the head of the sessions until after the first add event
# Everything up to and including the first 'add' is removed; the rest of the
# session is kept through its final event. The slice must be open-ended: an
# end bound of -1 would also discard the last event of every session, which
# loses a trailing 'purchase' before the sessions are labeled.
# A session whose first 'add' is its last event becomes an empty tuple here.
df['product_action'] = df['product_action'].map(lambda x: x[x.index('add') + 1:])
df.head()

In [None]:
# Class labeling
# From practical session.
# A session is labeled 1 when 'purchase' occurs among its actions, 0 otherwise.
has_purchase = df['product_action'].map(lambda actions: 'purchase' in actions)
df['purchase'] = np.where(has_purchase, 1, 0)

In [None]:
# Cutting down purchase sessions
# Based on Carlijn Jurriaan's solution on Canvas discussion board
# Keep only the actions that precede the first 'purchase'; sessions without a
# purchase are left untouched.
df['product_action'] = df['product_action'].map(lambda x: x[0:x.index('purchase')] if 'purchase' in x else x)

# Remove sessions that ended up empty. This filter is needed: a session whose
# first action is 'purchase' is truncated to an empty tuple by the line above.
# .copy() makes df an independent frame rather than a filtered slice, so that
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
df = df[df['product_action'].str.len() > 0].copy()
df.head()

In [None]:
# Symbolization
# Based on practical session 1.
# Tally every action across all sessions. The Counter also fixes the order of
# the alphabet: keys are stored in order of first appearance.
counts = Counter()
for actions in df['product_action']:
    counts.update(actions)

# Number the distinct actions 1, 2, 3, ... in that order.
symbol_alpha = dict(zip(counts, range(1, len(counts) + 1)))
print(counts, '\nSymbol alphabet: ', symbol_alpha)

# Encode each session as a tuple of symbol ids, then keep only the encoded
# session and its class label.
df['session'] = df['product_action'].map(lambda actions: tuple(symbol_alpha[name] for name in actions))
df = df[['session', 'purchase']]
df.head()

In [None]:
# Split set into training and validation set
from sklearn.model_selection import train_test_split

# 80% training / 20% validation. The ratio is an arbitrary choice (the
# practical session used 70/30); the fixed random_state keeps the split
# reproducible between runs.
training_set, eval_set = train_test_split(
    df,
    random_state=52,
    test_size=0.2,
)
eval_set.head()

In [None]:
# Return an oracle file for N clicks after add event (e.g. event 0)
def make_oracle(max_clicks, data=None):
    """
    Map every unique session to the fraction of its occurrences that ended in a purchase.

    Only sessions containing at least `max_clicks` actions are considered.
    Sessions are expected to already start right after the first add-to-cart
    event, so their length is the number of post-add actions.

    Parameters
    ----------
    max_clicks : int
        Minimum session length (inclusive) for a session to be included.
    data : pandas.DataFrame, optional
        Frame with a 'session' column (hashable sequences, e.g. tuples) and a
        'purchase' column (1 = purchase, 0 = no purchase). Defaults to the
        notebook-level `eval_set`.

    Returns
    -------
    dict
        {session: n_buy / (n_buy + n_nobuy)}. Empty when no session is long enough.
    """
    if data is None:
        data = eval_set

    # Restrict to sessions with enough actions, then count how often each
    # distinct session occurs with and without a purchase.
    eligible = data[data['session'].map(len) >= max_clicks]
    n_buy = eligible.loc[eligible['purchase'] == 1, 'session'].value_counts().to_dict()
    n_nobuy = eligible.loc[eligible['purchase'] == 0, 'session'].value_counts().to_dict()

    # Every session seen in either class, each listed once.
    n_all = list(dict.fromkeys([*n_buy, *n_nobuy]))

    # The stored value is the empirical purchase rate of a session pattern,
    # i.e. the precision obtained if every occurrence of that exact pattern
    # were predicted to be a purchase. Each key has at least one occurrence,
    # so the denominator is never zero.
    # NOTE(review): keys are full sessions, not the first `max_clicks` actions,
    # so a larger `max_clicks` only removes entries and never changes a value.
    # Confirm whether truncating to the first N clicks was intended.
    oracle = {}
    for key in n_all:
        bought = n_buy.get(key, 0)
        not_bought = n_nobuy.get(key, 0)
        oracle[key] = bought / (bought + not_bought)
    return oracle

# As per the assignment, oracles are made for 5, 10 and 15 post add-to-cart clicks
oracle5, oracle10, oracle15 = (make_oracle(n_clicks) for n_clicks in (5, 10, 15))

In [None]:
# Subset all nonzero results (i.e. that can potentially lead to a purchase)
o5_nonzero = [(key, value) for key, value in oracle5.items() if value > 0.0]

# Show the filtered subset built above, not the complete oracle; printing
# oracle5 here would leave the subset unused and dump every zero-rate session.
print(o5_nonzero)