In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Load dataset
# The training CSV is read from the "data" folder of the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
dfpath = os.path.join(data_dir, "training_data.csv")
print("Loading file from: {}".format(dfpath))

# The first line of the file supplies the column names.
df = pd.read_csv(dfpath, header=0)
df.head()

Loading file from: D:\GroupAssignment\data\training_data.csv


Unnamed: 0,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
0,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885210881,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
1,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,61ef3869355b78e11011f39fc7ac8f8dfb209b3442a9d5...,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
2,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
3,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
4,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...


In [4]:
# Sessionization. Code from practical session.
# The same process could be repeated for timestamps in case a model needs
# that information.
# Rows without a product action are labelled 'view'.
df['product_action'] = df['product_action'].fillna('view')

# Collapse every session into a single tuple of its actions and discard the
# session id, leaving a one-column frame with a fresh 0..n-1 index.
# NOTE(review): the order of actions inside each tuple follows the row order
# of the loaded file - confirm the file is chronologically ordered per session.
df = (
    df.groupby('session_id_hash')['product_action']
    .agg(tuple)
    .reset_index(drop=True)
    .to_frame()
)
df.shape

(4187127, 1)

In [5]:
# Drop all sessions not containing any 'add' events.
# One membership test per session is sufficient to decide whether it is kept.
df = df[df['product_action'].map(lambda actions: 'add' in actions)]
df.shape

(174054, 1)

In [6]:
# Map the index of the first add event to a separate column for future reference
def _first_add_position(actions):
    """Return the zero-based position of the first 'add' in a session's actions."""
    return actions.index('add')


df['add'] = df['product_action'].map(_first_add_position)
df.head()

Unnamed: 0,product_action,add
0,"(view, detail, add, view, view, view, view, vi...",2
37,"(view, view, view, detail, view, view, detail,...",18
64,"(view, view, view, detail, view, view, view, v...",16
84,"(view, detail, add, view, view, detail, add, v...",2
119,"(view, view, view, view, detail, add, remove, ...",5


In [7]:
# Class labeling. Code from practical session.
# A session is labelled 1 when it contains at least one 'purchase' action, else 0.
has_purchase = df['product_action'].map(lambda actions: 'purchase' in actions)
df['purchase'] = np.where(has_purchase, 1, 0)

In [8]:
# Cutting down purchase sessions
# Based on Carlijn Jurriaan's solution on Canvas discussion board
def _actions_before_purchase(actions):
    """Return the actions preceding the first 'purchase'; unchanged if there is none."""
    if 'purchase' not in actions:
        return actions
    return actions[:actions.index('purchase')]


# NOTE(review): if a purchase occurs before the first 'add', the truncated
# session no longer contains 'add' and the 'add' index column computed earlier
# points past its end - confirm whether such sessions exist.
df['product_action'] = df['product_action'].map(_actions_before_purchase)

In [9]:
# Drop all sessions with a length shorter than 5 or greater than 155
# (both bounds are inclusive).
MIN_SESSION_LEN, MAX_SESSION_LEN = 5, 155

# Compute the session length once and reuse it for the filter. `assign`
# returns a new frame, so the filtered slice from the earlier cells is not
# mutated in place.
df = df.assign(len=df['product_action'].map(len))
df = df[df['len'].between(MIN_SESSION_LEN, MAX_SESSION_LEN)].reset_index(drop=True)
df.shape

(161350, 4)

In [12]:
# Symbolization, based on practical session.
# Count every action across all sessions; a generator is used so that no
# temporary list of all actions has to be built first.
counts = Counter(action for session in df['product_action'] for action in session)

# Actions are numbered from 1 in order of decreasing frequency.
symbol_alpha = {action: rank for rank, (action, _) in enumerate(counts.most_common(), 1)}
print(counts, '\nSymbol alphabet: ', symbol_alpha)

# Encode each session as a list of symbols in a new 'session' column, then keep
# only the columns listed below (the raw 'product_action' tuples are dropped).
df['session'] = df['product_action'].map(lambda session: [symbol_alpha[action] for action in session])
df = df[['session', 'purchase', 'len', 'add']]
df.head()

Counter({'view': 3014880, 'detail': 1010529, 'add': 246971, 'remove': 217562}) 
Symbol alphabet:  {'view': 1, 'detail': 2, 'add': 3, 'remove': 4}


Unnamed: 0,session,purchase,len,add
0,"[1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, ...",0,18,2
1,"[1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, ...",0,139,18
2,"[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ...",0,41,16
3,"[1, 2, 3, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0,16,2
4,"[1, 1, 1, 1, 2, 3, 4, 1, 2, 1, 1, 1, 4, 4, 4, ...",1,29,5


In [13]:
# Write the filtered, symbolized sessions to the "data" folder of the current
# working directory.
# NOTE(review): the row index is written as an unnamed first column, and the
# 'session' lists are presumably stored as their string form - confirm this is
# what downstream readers expect.
outpath = os.path.join(os.getcwd(), "data", "filtered_long.csv")
df.to_csv(outpath)