In [341]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import json
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ast import literal_eval
from functools import partial
import pickle
sns.set()

DATA = Path('../../data')
RAW  = DATA/'raw'
PROCESSED = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
%%time
product           = pd.read_csv(RAW/'productid_category.csv', low_memory=False)
train_tracking    = pd.read_csv(RAW/'train_tracking.csv', low_memory=False)
test_tracking     = pd.read_csv(RAW/'test_tracking.csv', low_memory=False)
train_session     = pd.read_csv(RAW/'train_session.csv', low_memory=False)
test_session      = pd.read_csv(RAW/'random_submission.csv', low_memory=False)

CPU times: user 1min 8s, sys: 9.29 s, total: 1min 17s
Wall time: 1min 17s


In [20]:
train_features = train_session.copy()
test_features = test_session.copy()

# Mapped actions

In [386]:
def set_event_types(tracking):
    def extract_page(x):
        pages_types = ['_LR', '_PA', '_LP', '_CAROUSEL', '_SHOW_CASE']
        pages = ['CAROUSEL', 'PA', 'SHOW_CASE']
        pages_map = [['PURCHASE_PRODUCT_UNKNOW_ORIGIN', 'UNKNOWN'], ['LIST_PRODUCT', 'LP'], ['SEARCH', 'LR']]
        
        for pages_type in pages_types:
            if x.endswith(pages_type):
                return x[-len(pages_type)+1:]
        for page in pages:
            if x == page:
                return x
        for page_map in pages_map:
            if x == page_map[0]:
                return page_map[1]
        return '::' + x
    
    def extract_event(x):
        page, _type, type_s = x
        concatenated = _type
        
        if page == 'UNKNOWN':
            return page
        
        actions = ['PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
        for action in actions:
            if action in concatenated:
                return page + '_' + action
        return page

    def extract_action(x):
        actions = ['PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
        event_type = x
        
        for action in actions:
            if action in event_type:
                return action
        return 'None'
        
    tracking['page_n_type'] = list(zip(tracking.type.apply(extract_page), tracking.type, tracking.type_simplified))
    tracking['event_type'] = tracking.page_n_type.apply(extract_event)
    tracking['action_type'] = tracking.event_type.apply(extract_action)
    return tracking

In [387]:
def map_actions(tracking):
    tracking = tracking.copy()
    
    columns = list(tracking.columns.values)
    tracking = set_event_types(tracking)
    
    event_list = ['CAROUSEL', 'PA', 'LR', 'LR_ADD_TO_BASKET', 'LR_PRODUCT',
           'SHOW_CASE', 'UNKNOWN', 'PA_PRODUCT', 'CAROUSEL_PRODUCT',
           'CAROUSEL_ADD_TO_BASKET', 'LP_PRODUCT', 'SHOW_CASE_PRODUCT',
           'LP_ADD_TO_BASKET', 'PA_ADD_TO_BASKET', 'SHOW_CASE_ADD_TO_BASKET']
    event_dict = dict(zip(sorted(event_list), range(len(event_list)+1)))
    actions = ['None', 'PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
    action_dict = dict(zip(sorted(actions), range(len(actions))))
    tracking['event_id'] = tracking.event_type.apply(lambda x: event_dict[x])
    tracking['action_id'] = tracking.action_type.apply(lambda x: action_dict[x])
    
    return tracking[columns + ['event_id', 'action_id']]

In [424]:
tracking = train_tracking.copy()

**Algorithm:**

In [66]:
columns = list(tracking.columns.values)
tracking = set_event_types(tracking)

In [67]:
event_list = ['CAROUSEL', 'PA', 'LR', 'LR_ADD_TO_BASKET', 'LR_PRODUCT',
       'SHOW_CASE', 'UNKNOWN', 'PA_PRODUCT', 'CAROUSEL_PRODUCT',
       'CAROUSEL_ADD_TO_BASKET', 'LP_PRODUCT', 'SHOW_CASE_PRODUCT',
       'LP_ADD_TO_BASKET', 'PA_ADD_TO_BASKET', 'SHOW_CASE_ADD_TO_BASKET']
event_dict = dict(zip(sorted(event_list), range(len(event_list))))
tracking['event_id'] = tracking.event_type.apply(lambda x: event_dict[x])

In [68]:
tracking = tracking[columns + ['event_id']]

**Test:**

In [75]:
str(len(tracking.head().event_id.values == map_actions(train_tracking).head().event_id.values)/len(train_tracking)*100) + '%'

'0.00036897782074319513%'

# Timestamp series in session

In [345]:
def duration_to_seconds2(tracking):
    if not 'timestamp' in tracking.columns:
        tracking['timestamp'] = pd.to_timedelta(tracking.duration).dt.total_seconds()
    return tracking

In [346]:
def duration_timestamp(features, tracking):
    return features

In [347]:
def events_timeseries(features, tracking):
    tracking = map_actions(duration_to_seconds2(tracking))
    
    group = tracking.sort_values(['timestamp']).groupby('sid')
    eventseries = group.event_id.agg(list)

In [698]:
features = test_features
tracking = test_tracking

**Dimensiones:** Sesiones, ventana (0, 30), features <br>
**Output:** mapa de sesiones, cubo de features

In [699]:
tracking = map_actions(duration_to_seconds2(tracking))

In [700]:
tracking.event_id.unique()

array([11,  4,  5,  6,  0,  2,  1,  8, 13,  7, 10,  9,  3, 12, 14])

In [701]:
def to_timeseries_features(tstamps):
    series = pd.Series(list(tstamps))
    
    ts = list(tstamps)
    
    dts = series.diff().values
    dts[0] = 0
    max_dts = max(max(dts),1)
    dts_perc = dts/max_dts
    
    return [ts, dts_perc]

def to_eventseries(evs):
    return list(evs)

group = tracking.sort_values(['timestamp']).groupby('sid')
timeseries = group.timestamp.agg(partial(to_timeseries_features))
eventseries = group.event_id.agg(to_eventseries)

In [415]:
sessions = tracking.sid.unique()
sessions_map = dict(zip(sorted(sessions), range(len(sessions))))
len(sessions_map), len(sessions)

(88750, 88750)

In [416]:
WINDOW_SIZE = 15

# Building 3D variable
frames = np.zeros((len(sessions), WINDOW_SIZE, 2))

In [417]:
for session in timeseries.index:
    session_id = sessions_map[session]
    n_actions = len(timeseries[session])
    featblock = np.array(timeseries[session]).T
    if featblock.shape[0] == WINDOW_SIZE:
        pass
    elif featblock.shape[0] < WINDOW_SIZE:
        zeros = np.zeros((WINDOW_SIZE - featblock.shape[0], 2))
        featblock = np.concatenate((zeros, featblock))
    else:
        featblock = featblock[-WINDOW_SIZE:]
    frames[session_id] = featblock

In [418]:
shape = frames[:,:,0].shape
frames[:,:,0].mean()

661.3300318884853

In [419]:
scaler.fit(frames[:,:,0].reshape((-1, 1)))

ts_norm = scaler.transform(frames[:,:,0].reshape((-1, 1))).reshape(shape)

zero_pos = scaler.transform(np.zeros(shape).reshape((-1, 1)))[0][0]
zero_pos

-0.1960016255800045

In [420]:
frames[:,:,0] = ts_norm-zero_pos

In [421]:
np.save(PROCESSED/'f_duration_test.npy', frames)
with open(PROCESSED/'f_sessionsmap_test.pkl', 'wb') as handle:
    # json.dump(sessions_map, handle)
    pickle.dump(sessions_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [323]:
WINDOW_SIZE = 15

# Building 3D variable
frames_events = np.zeros((len(sessions), WINDOW_SIZE, 1))
print(frames_events.shape)

for session in eventseries.index:
    session_id = sessions_map[session]
    n_actions = len(eventseries[session])
    featblock = np.array([eventseries[session]]).T
    if featblock.shape[0] == WINDOW_SIZE:
        pass
    elif featblock.shape[0] < WINDOW_SIZE:
        zeros = np.zeros((WINDOW_SIZE - featblock.shape[0], 1))
        featblock = np.concatenate((zeros, featblock))
    else:
        featblock = featblock[-WINDOW_SIZE:]
    frames_events[session_id] = featblock

(133123, 30, 1)


# Number of actions series

### ADD_TO_BASKET

In [720]:
features = test_features
tracking = test_tracking
WINDOW_SIZE = 30

In [721]:
with open(PROCESSED/'f_sessionsmap_test.pkl', 'rb') as f:
    sessions_map = pickle.load(f)

In [722]:
# sessions = tracking.sid.unique()
# sessions_map = dict(zip(sorted(sessions), range(len(sessions))))
# len(sessions_map), len(sessions)

In [723]:
dict(zip(sorted(tracking.type.unique()), range(tracking.type.nunique())))

actions = ['None', 'PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
action_dict = dict(zip(sorted(actions), range(len(actions))))

tracking = map_actions(duration_to_seconds2(tracking))

In [724]:
ADD_TO_BASKET = action_dict['ADD_TO_BASKET']
PRODUCT = action_dict['PRODUCT']
PURCHASE_PRODUCT = action_dict['PURCHASE_PRODUCT']
def to_action_series(x):
    actions = np.array(list(x))

    if len(actions) > WINDOW_SIZE:
        actions = actions[-WINDOW_SIZE:]
    
    default_len = np.array([
        np.cumsum(actions == ADD_TO_BASKET),
        np.cumsum(actions == PRODUCT),
        np.cumsum(actions == PURCHASE_PRODUCT)
    ])
    
    if default_len.shape[1] <= WINDOW_SIZE:
        result = np.zeros((3, WINDOW_SIZE))
        result[:,-default_len.shape[1]:] = default_len
    else:
        result = default_len[:,-WINDOW_SIZE:]
    return result

group = tracking.sort_values(['timestamp']).groupby('sid')
action_series = group.action_id.apply(to_action_series)

In [729]:
df = action_series.to_frame().copy()
df['sid'] = pd.Series(df.index.values).apply(lambda x: sessions_map[x]).values
df

Unnamed: 0_level_0,action_id,sid
sid,Unnamed: 1_level_1,Unnamed: 2_level_1
++7b5Z/kXmn3IO6ra3bcqTmCeJoRDUU+iByI4jJnY/+rYC+6lkbiXpD2DMkQ1DOW,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0
++AKRj07v12R8fa3CXGus9gu6eOJd2tzfgxQWH5A5QgLYOZGNniq3Hwe7igNggQe,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1
++BuBFP5uXPeSKQCCACBynrUYEevAEox8C1QeW4/VCHolFsLZhf1SxTNKUX+9UlN,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
++EtPIejEaNzvYR1OOYbWWe7LpS9Ghhc6Hij836pqY/R6MJHgTqUrtOJaTrgN0Cl,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3
++HMEdn/ppTGsb7z8iCzT32uAcq/BOgCSxEnCWxpdHarVlT4Loy97YgxPYDB/rx9,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",4
++KNV8+T4rL+oKzYGZWCb9u2qKgsivsFuR/TbntG3oSr1DKCz54laYrYAyG7xYIZ,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",5
++Kt9aHd0h6iUOPPo+f/EPK9Ptyq2X6cRuxjmoCDlFIrMv4gwY8HBLVYYpljVFu3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",6
++MxektF9AEMu0jZelSfqc6scV2o9LrgQXQea1s5VKYMfI0bged21aiHP3N4vb0i,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7
++OdGfMAGEhZn+X5YkeGN7Gayd+fneUqJVFPRy//8Fn9+GpKvQRUvzblvg0+l95u,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8
++OvMl6DHKyeaHWyFW0DA5FYFo7RgxFXvbqzBZ7qN8tI39MW4U+00SA8+ZFU+FyN,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",9


In [694]:
action_series2 = df.set_index('sid').sort_index()['action_id'].values

action_seq = np.stack(action_series2).reshape((-1, WINDOW_SIZE, 3))
shape = action_seq.shape

In [695]:
pd.Series(action_seq[:,:,0].reshape(-1, 1).T[0]).describe()
# action_seq[:,:,0].T.shape

count    2.662500e+06
mean     1.909592e-01
std      1.064270e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.800000e+01
dtype: float64

In [696]:
action_seq = action_seq/WINDOW_SIZE

In [697]:
np.save(PROCESSED/'f_actions_test.npy', action_seq)
# with open(PROCESSED/'f_sessionsmap_test.pkl', 'wb') as handle:
#     pickle.dump(sessions_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Number of products added to basket

In [740]:
features = train_features
tracking = train_tracking
WINDOW_SIZE = 30

In [741]:
with open(PROCESSED/'f_sessionsmap_train.pkl', 'rb') as f:
    sessions_map = pickle.load(f)

In [745]:
tracking = map_actions(duration_to_seconds2(tracking))

In [746]:
def to_q_series(x):
    qs = np.array(list(x))

    if len(qs) > WINDOW_SIZE:
        qs = qs[-WINDOW_SIZE:]
    
    default_len = np.array([
        np.cumsum(qs)
    ])
    
    if default_len.shape[1] <= WINDOW_SIZE:
        result = np.zeros((1, WINDOW_SIZE))
        result[:,-default_len.shape[1]:] = default_len
    else:
        result = default_len[:,-WINDOW_SIZE:]
    return result

# tracking['new_id'] = tracking.sid.apply(lambda x: sessions_map[x])
# tracking = tracking.reset_index('new_id', drop=True)
tracking['filled_q'] = tracking.quantity.fillna(0)
group = tracking.sort_values(['timestamp']).groupby('sid')
q_series = group.filled_q.apply(to_q_series)

In [747]:
df = q_series.to_frame().copy()
df['sid'] = df.index
df['sid'] = df.sid.apply(lambda x: sessions_map[x])
q_seq = df.set_index('sid').values
q_seq.shape

(133123, 1)

In [748]:
q_series2 = df.set_index('sid').sort_index()['filled_q'].values

In [749]:
q_seq = np.stack(q_series2).reshape((-1, WINDOW_SIZE, 1))
shape = q_seq.shape
shape

(133123, 30, 1)

In [750]:
scaler = StandardScaler()
scaler.fit(q_seq.reshape(-1, 1))
# action_seq[:,:,0].T.shape
new_q = scaler.transform(q_seq.reshape(-1, 1)).reshape(shape)

In [751]:
new_q[:,-1,0]

array([-0.16290829, -0.16290829, -0.16290829, ..., -0.16290829,
        7.22445655, -0.16290829])

In [752]:
np.save(PROCESSED/'f_quantity_train.npy', new_q)
# with open(PROCESSED/'f_sessionsmap_test.pkl', 'wb') as handle:
#     pickle.dump(sessions_map, handle, protocol=pickle.HIGHEST_PROTOCOL)