In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import json
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ast import literal_eval
from functools import partial
import pickle
sns.set()

DATA = Path('../../data')
RAW  = DATA/'raw'
PROCESSED = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [178]:
%%time
product           = pd.read_csv(RAW/'productid_category.csv', low_memory=False)
train_tracking    = pd.read_csv(RAW/'train_tracking.csv', low_memory=False)
test_tracking     = pd.read_csv(RAW/'test_tracking.csv', low_memory=False)
train_session     = pd.read_csv(RAW/'train_session.csv', low_memory=False)
test_session      = pd.read_csv(RAW/'random_submission.csv', low_memory=False)

CPU times: user 1min 12s, sys: 9.98 s, total: 1min 22s
Wall time: 1min 22s


In [179]:
train_features = train_session.copy()
test_features = test_session.copy()

In [194]:
tracking = train_tracking.copy()
tracking = pd.merge(tracking, train_features, how='left', on='sid')

In [202]:
tracking = train_tracking.copy()
tracking = pd.merge(tracking, train_features, how='left', on='sid')
f = tracking[pd.notnull(tracking.sname)]
sellers = list(f.sname.unique())
def foo(x):
    return sum(x)/len(x)

f.groupby('sname').target.agg(foo).values

array([0.        , 0.        , 0.        , ..., 0.25714286, 0.        ,
       0.        ])

In [196]:
sellers = list(f.sname.unique())

In [212]:
def foo(x):
    return sum(x)/len(x)

seller_prob = f.groupby('sname').target.agg(foo).to_frame()

In [None]:
test_

In [203]:
tracking = train_tracking.copy()
tracking = pd.merge(tracking, train_features, how='left', on='sid')
f = tracking[pd.notnull(tracking.sname)]
sellers = list(f.sname.unique())
f.groupby('sname').target.agg(lambda x: sum(x)/len(x)).values

array([0.        , 0.        , 0.        , ..., 0.25714286, 0.        ,
       0.        ])

In [225]:
good_sellers = list(seller_prob[(seller_prob > 0.99)].index)

In [232]:
seller_dict = dict(zip(seller_prob.index, seller_prob.target))

In [260]:
df = pd.read_csv(DATA/'submissions/robert_5_submit_blend.csv', index_col='sid')

In [209]:
merged = pd.merge(df, test_tracking, on='sid', how='left')

In [224]:
len(good_sellers)

7662

In [269]:
def check(snames):
    probs = []
    for seller in snames:
        if seller in seller_dict:
            if seller_dict[seller] > 0.9999:
                return 0
            else:
                probs.append(seller_dict[seller])
    if len(probs)==0:
        return 1
    return 1 - sum(probs)/len(probs)

result = test_tracking.groupby('sid').sname.apply(check)

In [270]:
sum(result < 0.0001)

221

In [271]:
sum(result < 0.0001)/len(result)

0.0024901408450704226

In [246]:
the_sells = list(result[result < 0.01].index)

In [261]:
for sell in the_sells:
    df.at[sell, 'target'] = 1

In [262]:
df.to_csv(DATA/'submissions/franco_0_submit.csv')

In [266]:
df[df.target<1].max()

target    0.839868
dtype: float64

# Mapped actions

In [180]:
def set_event_types(tracking):
    def extract_page(x):
        pages_types = ['_LR', '_PA', '_LP', '_CAROUSEL', '_SHOW_CASE']
        pages = ['CAROUSEL', 'PA', 'SHOW_CASE']
        pages_map = [['PURCHASE_PRODUCT_UNKNOW_ORIGIN', 'UNKNOWN'], ['LIST_PRODUCT', 'LP'], ['SEARCH', 'LR']]
        
        for pages_type in pages_types:
            if x.endswith(pages_type):
                return x[-len(pages_type)+1:]
        for page in pages:
            if x == page:
                return x
        for page_map in pages_map:
            if x == page_map[0]:
                return page_map[1]
        return '::' + x
    
    def extract_event(x):
        page, _type, type_s = x
        concatenated = _type
        
        if page == 'UNKNOWN':
            return page
        
        actions = ['PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
        for action in actions:
            if action in concatenated:
                return page + '_' + action
        return page

    def extract_action(x):
        actions = ['PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
        event_type = x
        
        for action in actions:
            if action in event_type:
                return action
        return 'None'
        
    tracking['page_n_type'] = list(zip(tracking.type.apply(extract_page), tracking.type, tracking.type_simplified))
    tracking['event_type'] = tracking.page_n_type.apply(extract_event)
    tracking['action_type'] = tracking.event_type.apply(extract_action)
    return tracking

In [181]:
def map_actions(tracking):
    tracking = tracking.copy()
    
    columns = list(tracking.columns.values)
    tracking = set_event_types(tracking)
    
    event_list = ['CAROUSEL', 'PA', 'LR', 'LR_ADD_TO_BASKET', 'LR_PRODUCT',
           'SHOW_CASE', 'UNKNOWN', 'PA_PRODUCT', 'CAROUSEL_PRODUCT',
           'CAROUSEL_ADD_TO_BASKET', 'LP_PRODUCT', 'SHOW_CASE_PRODUCT',
           'LP_ADD_TO_BASKET', 'PA_ADD_TO_BASKET', 'SHOW_CASE_ADD_TO_BASKET']
    event_dict = dict(zip(sorted(event_list), range(len(event_list)+1)))
    
    actions = ['None', 'PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
    action_dict = dict(zip(sorted(actions), range(len(actions))))
    
    tracking['event_id'] = tracking.event_type.apply(lambda x: event_dict[x])
    tracking['action_id'] = tracking.action_type.apply(lambda x: action_dict[x])
    
    return tracking[columns + ['event_id', 'action_id']]

In [182]:
tracking = train_tracking.copy()

**Algorithm:**

In [183]:
columns = list(tracking.columns.values)
tracking = set_event_types(tracking)

In [184]:
event_list = ['CAROUSEL', 'PA', 'LR', 'LR_ADD_TO_BASKET', 'LR_PRODUCT',
       'SHOW_CASE', 'UNKNOWN', 'PA_PRODUCT', 'CAROUSEL_PRODUCT',
       'CAROUSEL_ADD_TO_BASKET', 'LP_PRODUCT', 'SHOW_CASE_PRODUCT',
       'LP_ADD_TO_BASKET', 'PA_ADD_TO_BASKET', 'SHOW_CASE_ADD_TO_BASKET']
event_dict = dict(zip(sorted(event_list), range(len(event_list))))

actions = ['None', 'PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
action_dict = dict(zip(sorted(actions), range(len(actions))))

tracking['event_id'] = tracking.event_type.apply(lambda x: event_dict[x])
tracking['action_id'] = tracking.action_type.apply(lambda x: action_dict[x])

In [185]:
tracking = tracking[columns + ['event_id', 'action_id']]

In [186]:
tracking.head().event_id.values

array([0, 0, 0, 0, 8])

**Test:**

In [95]:
str(len(tracking.event_id.values == map_actions(train_tracking).event_id.values)/len(train_tracking)*100) + '%'

'100.0%'

In [94]:
map_actions(train_tracking).head().event_id.values

array([0, 0, 0, 0, 8])

# Timestamp series in session

In [109]:
def duration_to_seconds2(tracking):
    if not 'timestamp' in tracking.columns:
        tracking['timestamp'] = pd.to_timedelta(tracking.duration).dt.total_seconds()
    return tracking

In [110]:
def duration_timestamp(features, tracking):
    return features

In [111]:
def events_timeseries(features, tracking):
    tracking = map_actions(duration_to_seconds2(tracking))
    
    group = tracking.sort_values(['timestamp']).groupby('sid')
    eventseries = group.event_id.agg(list)

In [112]:
features = test_features.copy()
tracking = test_tracking.copy()

**Dimensiones:** Sesiones, ventana (0, 30), features <br>
**Output:** mapa de sesiones, cubo de features

In [113]:
tracking = map_actions(duration_to_seconds2(tracking))

In [114]:
def to_timeseries_features(tstamps):
    series = pd.Series(list(tstamps))
    
    ts = list(tstamps)
    
    dts = series.diff().values
    dts[0] = 0
    max_dts = max(max(dts),1)
    dts_perc = dts/max_dts
    
    return [ts, dts_perc]

def to_eventseries(evs):
    return list(evs)

group = tracking.sort_values(['timestamp']).groupby('sid')
timeseries = group.timestamp.agg(partial(to_timeseries_features))
#eventseries = group.event_id.agg(to_eventseries)

In [115]:
sessions = set(np.append(train_tracking.sid.unique(), test_tracking.sid.unique()))
sessions_map = dict(zip(sorted(sessions), range(len(sessions))))
len(sessions_map), len(sessions)

(221873, 221873)

In [116]:
WINDOW_SIZE = 30

# Building 3D variable
frames = np.zeros((len(sessions), WINDOW_SIZE, 2))

In [117]:
for session in timeseries.index:
    session_id = sessions_map[session]
    n_actions = len(timeseries[session])
    featblock = np.array(timeseries[session]).T
    if featblock.shape[0] == WINDOW_SIZE:
        pass
    elif featblock.shape[0] < WINDOW_SIZE:
        zeros = np.zeros((WINDOW_SIZE - featblock.shape[0], 2))
        featblock = np.concatenate((zeros, featblock))
    else:
        featblock = featblock[-WINDOW_SIZE:]
    frames[session_id] = featblock

In [118]:
shape = frames[:,:,0].shape
frames[:,:,0].mean()

264.5343972908066

In [119]:
scaler = StandardScaler()
scaler.fit(frames[:,:,0].reshape((-1, 1)))

ts_norm = scaler.transform(frames[:,:,0].reshape((-1, 1))).reshape(shape)

zero_pos = scaler.transform(np.zeros(shape).reshape((-1, 1)))[0][0]
zero_pos

-0.12255844149037984

In [120]:
frames[:,:,0] = ts_norm-zero_pos

In [121]:
np.save(PROCESSED/'f_duration_test.npy', frames)
with open(PROCESSED/'f_sessionsmap_test.pkl', 'wb') as handle:
    # json.dump(sessions_map, handle)
    pickle.dump(sessions_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [323]:
WINDOW_SIZE = 15

# Building 3D variable
frames_events = np.zeros((len(sessions), WINDOW_SIZE, 1))
print(frames_events.shape)

for session in eventseries.index:
    session_id = sessions_map[session]
    n_actions = len(eventseries[session])
    featblock = np.array([eventseries[session]]).T
    if featblock.shape[0] == WINDOW_SIZE:
        pass
    elif featblock.shape[0] < WINDOW_SIZE:
        zeros = np.zeros((WINDOW_SIZE - featblock.shape[0], 1))
        featblock = np.concatenate((zeros, featblock))
    else:
        featblock = featblock[-WINDOW_SIZE:]
    frames_events[session_id] = featblock

(133123, 30, 1)


# Number of actions series

### ADD_TO_BASKET

In [177]:
features = train_features.copy()
tracking = train_tracking.copy()
WINDOW_SIZE = 30

In [165]:
with open(PROCESSED/'f_sessionsmap_train.pkl', 'rb') as f:
    sessions_map = pickle.load(f)
    
sessions = set(np.append(train_tracking.sid.unique(), test_tracking.sid.unique()))
sessions_map = dict(zip(sorted(sessions), range(len(sessions))))
len(sessions_map), len(sessions)

(221873, 221873)

In [166]:
# sessions = tracking.sid.unique()
# sessions_map = dict(zip(sorted(sessions), range(len(sessions))))
# len(sessions_map), len(sessions)

In [167]:
dict(zip(sorted(tracking.type.unique()), range(tracking.type.nunique())))

actions = ['None', 'PRODUCT', 'ADD_TO_BASKET', 'PURCHASE_PRODUCT']
action_dict = dict(zip(sorted(actions), range(len(actions))))

tracking = map_actions(duration_to_seconds2(tracking))

In [168]:
ADD_TO_BASKET = action_dict['ADD_TO_BASKET']
PRODUCT = action_dict['PRODUCT']
PURCHASE_PRODUCT = action_dict['PURCHASE_PRODUCT']
def to_action_series(x):
    actions = np.array(list(x))

    if len(actions) > WINDOW_SIZE:
        actions = actions[-WINDOW_SIZE:]
    
    default_len = np.array([
        np.cumsum(actions == ADD_TO_BASKET),
        np.cumsum(actions == PRODUCT),
        np.cumsum(actions == PURCHASE_PRODUCT)
    ])
    
    if default_len.shape[1] <= WINDOW_SIZE:
        result = np.zeros((3, WINDOW_SIZE))
        result[:,-default_len.shape[1]:] = default_len
    else:
        result = default_len[:,-WINDOW_SIZE:]
    return result

group = tracking.sort_values(['timestamp']).groupby('sid')
action_series = group.action_id.apply(to_action_series)



In [172]:
df = action_series.to_frame().copy()
df['sid'] = pd.Series(df.index.values).apply(lambda x: sessions_map[x]).values
np.save(PROCESSED/'f_actions_sid_train.npy', df.sid.values)

In [173]:
action_series2 = df.set_index('sid').sort_index()['action_id'].values

action_seq = np.stack(action_series2).reshape((-1, WINDOW_SIZE, 3))
shape = action_seq.shape

In [174]:
pd.Series(action_seq[:,:,0].reshape(-1, 1).T[0]).describe()
# action_seq[:,:,0].T.shape

count    2662500.0
mean           0.0
std            0.0
min            0.0
25%            0.0
50%            0.0
75%            0.0
max            0.0
dtype: float64

In [175]:
action_seq = action_seq/WINDOW_SIZE

In [176]:
np.save(PROCESSED/'f_actions_train.npy', action_seq)
# with open(PROCESSED/'f_sessionsmap_test.pkl', 'wb') as handle:
#     pickle.dump(sessions_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Number of products added to basket

In [151]:
features = test_features
tracking = test_tracking
WINDOW_SIZE = 30

In [164]:
with open(PROCESSED/'f_sessionsmap_train.pkl', 'rb') as f:
    sessions_map = pickle.load(f)
        
sessions = set(np.append(train_tracking.sid.unique(), test_tracking.sid.unique()))
sessions_map = dict(zip(sorted(sessions), range(len(sessions))))
len(sessions_map), len(sessions)

(221873, 221873)

In [153]:
tracking = map_actions(duration_to_seconds2(tracking))

In [154]:
def to_q_series(x):
    qs = np.array(list(x))

    if len(qs) > WINDOW_SIZE:
        qs = qs[-WINDOW_SIZE:]
    
    default_len = np.array([
        np.cumsum(qs)
    ])
    
    if default_len.shape[1] <= WINDOW_SIZE:
        result = np.zeros((1, WINDOW_SIZE))
        result[:,-default_len.shape[1]:] = default_len
    else:
        result = default_len[:,-WINDOW_SIZE:]
    return result

# tracking['new_id'] = tracking.sid.apply(lambda x: sessions_map[x])
# tracking = tracking.reset_index('new_id', drop=True)
tracking['filled_q'] = tracking.quantity.fillna(0)
group = tracking.sort_values(['timestamp']).groupby('sid')
q_series = group.filled_q.apply(to_q_series)

In [155]:
df = q_series.to_frame().copy()
df['sid'] = df.index
df['sid'] = df.sid.apply(lambda x: sessions_map[x])
q_seq = df.set_index('sid').values
q_seq.shape

(88750, 1)

In [156]:
q_series2 = df.set_index('sid').sort_index()['filled_q'].values

In [157]:
q_seq = np.stack(q_series2).reshape((-1, WINDOW_SIZE, 1))
shape = q_seq.shape
shape

(88750, 30, 1)

In [158]:
scaler = StandardScaler()
scaler.fit(q_seq.reshape(-1, 1))
# action_seq[:,:,0].T.shape
new_q = scaler.transform(q_seq.reshape(-1, 1)).reshape(shape)

In [159]:
new_q[:,-1,0]

array([ 3.41755967, -0.15970804, -0.15970804, ..., -0.15970804,
       -0.15970804, -0.15970804])

In [160]:
np.save(PROCESSED/'f_quantity_test.npy', new_q)
# with open(PROCESSED/'f_sessionsmap_test.pkl', 'wb') as handle:
#     pickle.dump(sessions_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [162]:
new_q[:,-1,0]

array([ 3.41755967, -0.15970804, -0.15970804, ..., -0.15970804,
       -0.15970804, -0.15970804])