In [151]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
from collections import Counter
from sklearn.decomposition import PCA
from ast import literal_eval
sns.set()

# Data directories, expressed relative to the current working directory.
DATA = Path('../../data')
RAW  = DATA/'raw'                 # the raw CSV inputs are read from here
PROCESSED = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [178]:
%%time
# Load the raw tables. low_memory=False makes pandas parse each file in a single
# pass instead of in chunks, so column dtypes are inferred consistently.
product           = pd.read_csv(RAW/'productid_category.csv', low_memory=False)
train_tracking    = pd.read_csv(RAW/'train_tracking.csv', low_memory=False)
test_tracking     = pd.read_csv(RAW/'test_tracking.csv', low_memory=False)
train_session     = pd.read_csv(RAW/'train_session.csv', low_memory=False)
# NOTE(review): the test session table is read from random_submission.csv —
# presumably the sample submission doubles as the list of test sids; confirm.
test_session      = pd.read_csv(RAW/'random_submission.csv', low_memory=False)

CPU times: user 1min 9s, sys: 6.96 s, total: 1min 16s
Wall time: 1min 16s


In [215]:
# Start the feature table from a copy of the training sessions so that
# train_session itself is left untouched by later merges.
session_features = train_session.copy()

# Duration feature
### TOTAL_DURATION

In [4]:
# Convert duration to a total number of seconds
def duration_to_seconds(train_tracking):
    """Convert the ``duration`` column of ``train_tracking`` to seconds, in place.

    The column is parsed with ``pd.to_timedelta`` and replaced by the
    corresponding number of seconds (float). The frame passed in is modified
    and also returned, so both ``duration_to_seconds(df)`` and
    ``df = duration_to_seconds(df)`` work.

    The conversion is idempotent: a column that is already numeric is left
    alone. Without this guard a second call would hand float seconds to
    ``pd.to_timedelta``, which interprets bare numbers as nanoseconds and
    would silently shrink every value.

    Parameters
    ----------
    train_tracking : pandas.DataFrame
        Tracking table with a ``duration`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``train_tracking`` object.
    """
    if not pd.api.types.is_numeric_dtype(train_tracking['duration']):
        train_tracking['duration'] = pd.to_timedelta(train_tracking['duration']).dt.total_seconds()
    return train_tracking

def duration_feature(session_features, train_tracking):
    """Add a ``TOTAL_DURATION`` column to ``session_features``.

    ``train_tracking`` is first passed through ``duration_to_seconds``; the
    feature is then the maximum ``duration`` found among the tracking rows of
    each ``sid``. The result is left-joined onto ``session_features``, so
    sessions without tracking rows get NaN.

    Returns the merged frame (a new DataFrame).
    """
    tracking_in_seconds = duration_to_seconds(train_tracking)
    longest_per_session = tracking_in_seconds.groupby('sid').duration.max()
    totals = longest_per_session.reset_index()
    totals.columns = ['sid', 'TOTAL_DURATION']
    return session_features.merge(totals, on='sid', how='left')

In [5]:
# Convert train_tracking.duration to seconds (the helper modifies the frame
# in place), then display the converted column.
duration_to_seconds(train_tracking)
train_tracking.duration

0           0.000000
1          11.179638
2          13.132756
3         137.397333
4           0.000000
5           3.571980
6          43.736337
7         193.467328
8         860.748909
9         861.640271
10      39220.444632
11          0.000000
12         22.974930
13         74.070494
14         89.935316
15        124.919148
16        172.222364
17        199.921949
18        220.535582
19        239.207941
20        331.216386
21        426.326788
22        433.759435
23        476.879169
24        500.808141
25        527.359679
26        601.352748
27        625.361456
28        642.594890
29          0.000000
            ...     
9970     2494.048306
9971        0.000000
9972       13.894872
9973       35.829504
9974      149.785943
9975      155.185702
9976        0.000000
9977       27.456191
9978       33.077170
9979       61.828305
9980       86.022902
9981       95.962380
9982      105.844270
9983     1392.396024
9984        0.000000
9985     1242.207719
9986     1260

In [6]:
# Per-session total duration: the largest duration value recorded for each sid.
durations = train_tracking.groupby('sid').duration.max().reset_index()
durations.columns = ['sid', 'TOTAL_DURATION']

In [61]:
durations

Unnamed: 0,sid,TOTAL_DURATION
0,+++elmtsXqN289wWNi6auO1Fm7gyPkXmsKngig88cIqXDD...,214.537492
1,++0tYP9PmT6jX9O1WjUhWd7w3hWV6xSRMBOdA7HMoBukKs...,65.059323
2,++2CIH+Rnf2MBamibl+EPSMDTKmweZzRgeX/VDBussbBR8...,167.882281
3,++3a8LhdXKrKZJeNiBtuHj8znGF/eQADRi0GSnPSlqRajq...,248.493267
4,++3dzXAmTuAQr+0il3jYZzqk8eoPk6TiffxCqNdQAKyBGp...,289.233742
5,++7Cfi2TWzILOEozI16/Z8VtuNdwIdRKZLNPWpGghC3kgd...,114.243229
6,++7hiflL4pfaqZ/dzSroZxy/4zAmspVWwpI/b6SXXUagvc...,424.451279
7,++88YC5S60iYECY2SgrPIuaeudd/xJ1xWzGdB7ZybSYq1j...,202.535417
8,++A1hjrNljzoZPBp65KbDL2wStgk94q4TfYmF58oA9uz2w...,138.518332
9,++F4WKEhQikoCBiqXLlN10zKjq/bfNZ6ne3T7Sw/ehc50v...,1750.781175


In [62]:
# Attach TOTAL_DURATION to the session table (left join: sessions without a
# match in durations get NaN).
# NOTE(review): duration_feature() performs this same merge — applying both to
# the same session_features would add the column twice (pandas suffixes the
# duplicates _x/_y); confirm which of the two cells is meant to be run.
session_features = pd.merge(session_features, durations, on='sid', how='left')

In [7]:
# Build the TOTAL_DURATION feature through the helper and display the result.
session_features = duration_feature(session_features, train_tracking)
session_features

Unnamed: 0,sid,target,TOTAL_DURATION
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,False,
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,False,
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,False,
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,False,
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,False,
5,NsKSt1vreFt/tpUjhr2L4dYUlsaEpHiq2qsBMG6d+t+WtV...,False,
6,7Rgtt5iDUSO+/FGfyIaTrWYDiNjF1KbULPrldFj78dX6JO...,False,
7,L9OHLekz6vMT7t609Y6c3fUIJ2i5MKgdam8UOcCOJnDs+V...,False,
8,PydsL5KNw3VBO0pgF6HhczwRONYLyAhUIgzYMQv3Vaj84H...,False,
9,hLnY1trrWZhSv5dFqEZzBCPfvw6RI/vTzG1BrDpPuYvFjY...,False,


# Main category features

In [154]:
def add_page(train_tracking):
    """Add a ``page`` column derived from the ``type`` column, in place.

    Mapping rules, applied in this order to each ``type`` value:

    1. values ending in ``_LR``, ``_PA``, ``_LP``, ``_CAROUSEL`` or
       ``_SHOW_CASE`` map to that suffix without the leading underscore
       (e.g. ``PURCHASE_PRODUCT_LR`` -> ``LR``);
    2. the bare page names ``CAROUSEL``, ``PA``, ``SEARCH``, ``SHOW_CASE`` and
       ``LIST_PRODUCT`` map to themselves;
    3. the "unknown origin" purchase event maps to ``UNKNOWN``;
    4. anything else is kept but prefixed with ``::`` so unmapped types are
       easy to spot.

    Parameters
    ----------
    train_tracking : pandas.DataFrame
        Tracking table with a string ``type`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``train_tracking`` object, with the ``page`` column set.
    """
    # Built once per call instead of once per row.
    page_suffixes = ('_LR', '_PA', '_LP', '_CAROUSEL', '_SHOW_CASE')
    plain_pages = ('CAROUSEL', 'PA', 'SEARCH', 'SHOW_CASE', 'LIST_PRODUCT')
    # Both spellings are accepted: this function originally matched
    # 'UNKNOW_ORIGIN' while the purchase filters elsewhere in this notebook
    # use 'UNKNOWN_ORIGIN'.
    # NOTE(review): confirm which spelling the data actually contains.
    page_aliases = {
        'PURCHASE_PRODUCT_UNKNOW_ORIGIN': 'UNKNOWN',
        'PURCHASE_PRODUCT_UNKNOWN_ORIGIN': 'UNKNOWN',
    }

    def extract_page(x):
        for suffix in page_suffixes:
            if x.endswith(suffix):
                # The page name is the suffix minus its leading underscore.
                return suffix[1:]
        if x in plain_pages:
            return x
        if x in page_aliases:
            return page_aliases[x]
        return '::' + x

    train_tracking['page'] = train_tracking.type.apply(extract_page)
    return train_tracking

def simplify_categories(product):
    """Add a ``cat1`` column that collapses rare level-1 categories, in place.

    A level-1 category that occurs on more than 10 products is kept as is;
    every other category is replaced by the catch-all value ``10e7``
    (i.e. 100000000.0). Rows whose level-1 category is missing (NaN) also
    receive the catch-all value instead of raising ``KeyError``.

    The computation is vectorized rather than a per-row ``apply``.

    Parameters
    ----------
    product : pandas.DataFrame
        Product table with a ``category_product_id_level1`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``product`` object, with the ``cat1`` column set.
    """
    rare_category = 10e7   # catch-all id for rare / missing categories
    min_products = 10      # a category is kept only when its count is > this

    level1 = product['category_product_id_level1']
    # value_counts() ignores NaN, so rows with a missing category map to a
    # NaN count, fail the comparison below and fall into the catch-all bucket.
    products_per_category = level1.map(level1.value_counts())
    product['cat1'] = level1.where(products_per_category > min_products, rare_category)
    return product

def convert_jsonproducts(train_tracking, column):
    """Parse the string-encoded values of ``column`` into Python objects.

    Every non-null cell is evaluated with ``ast.literal_eval``; null cells are
    passed through unchanged. The parsed values are written to a
    ``product_list`` column on ``train_tracking`` itself, which is returned.
    """
    raw_cells = train_tracking[column]
    parsed_cells = raw_cells.apply(
        lambda cell: cell if pd.isnull(cell) else literal_eval(cell)
    )
    train_tracking['product_list'] = parsed_cells
    return train_tracking

In [179]:
print('Loading pages')
train_tracking = add_page(train_tracking)
print('Loading categories')
product = simplify_categories(product)
# Two product ids are added by hand and assigned to the catch-all category
# (100000000.0, the same value simplify_categories uses for rare categories).
# A single pd.concat replaces the two DataFrame.append calls: append was
# removed in pandas 2.0, and each append copied the whole product table.
extra_products = pd.DataFrame({
    'product_id': [
        'Zw92hEmaaA7dQb2cgpn/Jg==',
        '7UPNQe/PpnKtw1Sz998sUup6NPTGBfCKCRdXNRSP70U=',
    ],
    'cat1': [100000000.0, 100000000.0],
})
# sort=False keeps the product table's column order; columns missing from
# extra_products are filled with NaN.
product = pd.concat([product, extra_products], ignore_index=True, sort=False)
print('Loading catmap')
# Lookup table: product_id -> simplified level-1 category.
catmap = dict(zip(product.product_id, product.cat1))

Loading pages
Loading categories
Loading catmap


In [188]:
def cat_counter(prodlist):
    """Count how many products of ``prodlist`` fall into each category.

    Each element of ``prodlist`` is expected to be a mapping with a ``'sku'``
    key. The sku is looked up in the module-level ``catmap``; skus that are
    not found are reported on stdout and counted under the catch-all
    category ``10e7``.

    Parameters
    ----------
    prodlist : list of dict, or None/NaN
        Parsed product list of one tracking event. A missing list (``None``
        or NaN, which ``convert_jsonproducts`` passes through unchanged)
        yields an empty counter instead of raising ``TypeError``.

    Returns
    -------
    dict
        Mapping ``category -> number of products``.
    """
    # NaN is the only float that differs from itself.
    if prodlist is None or (isinstance(prodlist, float) and prodlist != prodlist):
        return {}

    counter = {}
    for prod in prodlist:
        sku = prod['sku']
        if sku not in catmap:
            print('CANT FIND ' + sku)
            cat = 10e7
        else:
            cat = int(catmap[sku])
        counter[cat] = counter.get(cat, 0) + 1
    return counter

def merge_counters(counters):
    """Combine several ``category -> count`` dicts into one by summing counts.

    The previous implementation used ``{**merged, **counter}``, which
    overwrites the count of a category present in several counters with the
    last value seen instead of adding them up.

    Parameters
    ----------
    counters : iterable of dict
        Counters to combine (e.g. the values of one group in a
        ``groupby(...).agg``).

    Returns
    -------
    dict
        Mapping ``category -> total count`` over all input counters; empty
        when ``counters`` is empty.
    """
    merged = {}
    for counter in counters:
        for cat, count in counter.items():
            merged[cat] = merged.get(cat, 0) + count
    return merged

### ------- MAIN_CATEGORY_CAROUSEL -------

In [8]:
def main_cat_carousel(session_features):
    """Placeholder for the MAIN_CATEGORY_CAROUSEL feature.

    No feature is computed yet: ``session_features`` is returned unchanged.
    """
    return session_features

### MAIN_CATEGORY_PURCHASE

In [189]:
def main_cat_purchase(session_features):
    """Compute per-session category counters for carousel and LP purchases.

    Reads the module-level ``train_tracking``. For each of the two purchase
    event types the product column is parsed, categories are counted per
    event with ``cat_counter`` and the counters of each ``sid`` are combined
    with ``merge_counters``.

    NOTE(review): the two resulting tables are not merged into
    ``session_features`` yet — the input is returned unchanged.
    """
    def session_category_counts(event_type, products_column):
        # Work on a copy so the parsed columns are not written to train_tracking.
        events = train_tracking[train_tracking.type == event_type].copy()
        events = convert_jsonproducts(events, products_column)
        events['prod_counter'] = events.product_list.apply(cat_counter)
        per_session = events.groupby('sid').prod_counter.agg(merge_counters)
        return per_session.reset_index()

    session_carousel = session_category_counts('PURCHASE_PRODUCT_CAROUSEL', 'ocarproducts')
    session_lp = session_category_counts('PURCHASE_PRODUCT_LP', 'products')

    return session_features

In [190]:
# Parse the 'ocarproducts' column of the carousel purchase events into Python
# objects (stored in 'product_list'); the .copy() keeps train_tracking unmodified.
carousel = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_CAROUSEL'].copy(), 'ocarproducts')

In [191]:
product.tail()

Unnamed: 0,product_id,category_product_id_level1,category_product_id_level2,category_product_id_level3,cat1
10635916,nT4sUuBC/5jry7c276xuuup6NPTGBfCKCRdXNRSP70U=,574400.0,574964.0,581475.0,100000000.0
10635917,vGDhDzugmY2Dx+dsJJ2q8ep6NPTGBfCKCRdXNRSP70U=,5024889.0,5025466.0,5031966.0,100000000.0
10635918,U/XDT/PYFx6TRxUVo3A6k+p6NPTGBfCKCRdXNRSP70U=,5103804.0,5104370.0,5110887.0,100000000.0
10635919,Zw92hEmaaA7dQb2cgpn/Jg==,,,,100000000.0
10635920,7UPNQe/PpnKtw1Sz998sUup6NPTGBfCKCRdXNRSP70U=,,,,100000000.0


In [192]:
def find_main_cat(prodlist):
    """Return the most frequent category among the products of ``prodlist``.

    Each element of ``prodlist`` is expected to be a mapping with a ``'sku'``
    key, looked up in the module-level ``catmap``. Products whose sku is not
    in ``catmap`` are skipped (the whole list is printed for inspection).
    Ties are resolved in favour of the category encountered first.

    Parameters
    ----------
    prodlist : list of dict
        Parsed product list of one tracking event.

    Returns
    -------
    int or None
        The dominant category, or ``None`` when no product could be mapped
        to a category (empty list, or only unknown skus). Previously this
        case raised ``IndexError``.
    """
    tally = Counter()
    for prod in prodlist:
        if prod['sku'] not in catmap:
            print(prodlist)
            continue
        tally[int(catmap[prod['sku']])] += 1
    if not tally:
        return None
    return tally.most_common(1)[0][0]

# find_main_cat(carousel.iloc[0].product_list)
# product.columns
# Count categories per carousel purchase event, then combine the counters of
# each session.
# NOTE(review): unlike session_lp / session_lr in the following cells, no
# .reset_index() is applied here, so session_carousel stays indexed by sid —
# confirm this difference is intended.
carousel['prod_counter'] = carousel.product_list.apply(cat_counter)
session_carousel = carousel.groupby('sid').prod_counter.agg(merge_counters)

In [193]:
# Same pipeline for PURCHASE_PRODUCT_LP events, whose products are read from
# the 'products' column: parse, count categories per event, combine per sid.
lp = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_LP'].copy(), 'products')
lp['prod_counter'] = lp.product_list.apply(cat_counter)
session_lp = lp.groupby('sid').prod_counter.agg(merge_counters).reset_index()

In [194]:
# Same pipeline for PURCHASE_PRODUCT_LR events, whose products are read from
# the 'oproducts' column. cat_counter prints a "CANT FIND <sku>" line for every
# sku that is missing from catmap.
lr = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_LR'].copy(), 'oproducts')
lr['prod_counter'] = lr.product_list.apply(cat_counter)
session_lr = lr.groupby('sid').prod_counter.agg(merge_counters).reset_index()

CANT FIND KwvglcLn/9rrtihM2P9oz+p6NPTGBfCKCRdXNRSP70U=
CANT FIND oSozGAvXONoebDvapwqju+p6NPTGBfCKCRdXNRSP70U=
CANT FIND UcgwRCtogmOkaqZgTfmkZep6NPTGBfCKCRdXNRSP70U=
CANT FIND eXO0OZNKDaZijXn3cyiaUup6NPTGBfCKCRdXNRSP70U=
CANT FIND NZJMVBB0oJU6C4KkPvDjDOp6NPTGBfCKCRdXNRSP70U=
CANT FIND VKcZFzH0wnzzM5HGPDpaEOp6NPTGBfCKCRdXNRSP70U=
CANT FIND abWjY90aTHpB2lXwZOY98+p6NPTGBfCKCRdXNRSP70U=
CANT FIND tWzp896BCv6VJnbc4Z341up6NPTGBfCKCRdXNRSP70U=
CANT FIND rhUiqoAzOSE9d1/DV/GWHOp6NPTGBfCKCRdXNRSP70U=
CANT FIND hODkWGUsJPnEdiJ39UfJW+p6NPTGBfCKCRdXNRSP70U=
CANT FIND o9yxJm+DjzDfWozuxFjX1up6NPTGBfCKCRdXNRSP70U=
CANT FIND W2u8gjxqqVQlh4bS8DHYZup6NPTGBfCKCRdXNRSP70U=
CANT FIND SpQ0GyN1bQe1IhIqkKZKoOp6NPTGBfCKCRdXNRSP70U=
CANT FIND 1V0RHJn8kgC/6dtAFSSzROp6NPTGBfCKCRdXNRSP70U=
CANT FIND Dn6CCCQtzsZWS10vGm43/+p6NPTGBfCKCRdXNRSP70U=
CANT FIND 3sdy9b2FZ+xQ7FMZQRgNmep6NPTGBfCKCRdXNRSP70U=
CANT FIND G0myTr9jLHdehuQyiKrm6Op6NPTGBfCKCRdXNRSP70U=
CANT FIND MiVS7ng6Yj0rSxor48mcKOp6NPTGBfCKCRdXNRSP70U=
CANT FIND 

### MAIN_CATEGORY_CART

# Number of purchased products

In [211]:
# Total quantity per session, summed over the rows whose type is one of the
# purchase event types listed below.
# NOTE(review): this filter spells 'PURCHASE_PRODUCT_UNKNOWN_ORIGIN' while
# add_page() matches 'PURCHASE_PRODUCT_UNKNOW_ORIGIN' — check which spelling
# the data actually uses, otherwise those events are silently left out.
quants = train_tracking[(train_tracking.type=='PURCHASE_PRODUCT_UNKNOWN_ORIGIN') | (train_tracking.type=='PURCHASE_PRODUCT_SHOW_CASE')\
                   | (train_tracking.type=='PURCHASE_PRODUCT_PA') | (train_tracking.type=='PURCHASE_PRODUCT_LR')\
                   | (train_tracking.type=='PURCHASE_PRODUCT_LP') | (train_tracking.type=='PURCHASE_PRODUCT_CAROUSEL')].groupby('sid').quantity.sum().reset_index()


In [212]:
# sns.distplot(quants)
# Name the aggregated column and display the per-session purchase totals.
quants.columns = ['sid', 'PURCHASED_PRODUCTS']
quants

Unnamed: 0,sid,PURCHASED_PRODUCTS
0,+0Vh/Trjzcshzuq1XXjfUhr9EJHAll+V/yKuV/hhWQhNpn...,1.0
1,+3HJmV33aSPd7dF3hswIRCVWMudpCQOP4zf6JEsmOshW6G...,1.0
2,+A91LKDkR6SdIIgFkTFHG7A2TMtBIM3tlnD7svGWYfpIQS...,1.0
3,+BYtKgJYRuo7/IhqaOoV2Oxlkf4JYjYyLtX1up/oDZnGIp...,3.0
4,+C1il78kgxdQt6W/usx6LdbCSdPNmI9b4n4Kg1/WwFamjW...,1.0
5,+CUMGswW8k1NfC0xw7kTO8P/zegEWNqHRnzu4IxNQul0/g...,1.0
6,+FeBcR2+niL8pkSjFvgSKebwwIW7CwbJEr6V8mLird1jno...,1.0
7,+Fv7Kv71hyMAvMEqsvm594k/aV6/nnM3lkwjuETVaCfvo0...,1.0
8,+Giwr/L8m+C8Z6js5JrkSpYOskYVRI2qr712mQNauYHN1X...,1.0
9,+IdqwfISPjEbO9bxokyFK1KOgDA3ta0cu6uSy1Dld0w9Gk...,1.0


In [217]:
def purchased_products(session_features, train_tracking):
    """Add a ``PURCHASED_PRODUCTS`` column: total quantity purchased per session.

    Rows of ``train_tracking`` whose ``type`` is one of the purchase event
    types are selected, their ``quantity`` is summed per ``sid`` and the
    result is left-joined onto ``session_features``. Sessions without any
    purchase row get NaN.

    Parameters
    ----------
    session_features : pandas.DataFrame
        Session table with a ``sid`` column.
    train_tracking : pandas.DataFrame
        Tracking table with ``sid``, ``type`` and ``quantity`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``session_features`` plus ``PURCHASED_PRODUCTS``.
    """
    purchase_types = (
        'PURCHASE_PRODUCT_UNKNOWN_ORIGIN',
        # The misspelled variant is the one add_page() matches; accept both so
        # unknown-origin purchases are counted whichever spelling the data uses.
        # NOTE(review): confirm the spelling in the data and drop the other.
        'PURCHASE_PRODUCT_UNKNOW_ORIGIN',
        'PURCHASE_PRODUCT_SHOW_CASE',
        'PURCHASE_PRODUCT_PA',
        'PURCHASE_PRODUCT_LR',
        'PURCHASE_PRODUCT_LP',
        'PURCHASE_PRODUCT_CAROUSEL',
    )
    purchases = train_tracking[train_tracking['type'].isin(purchase_types)]
    quants = purchases.groupby('sid').quantity.sum().reset_index()
    quants.columns = ['sid', 'PURCHASED_PRODUCTS']
    return pd.merge(session_features, quants, on='sid', how='left')

# Preview the feature on the training sessions.
# NOTE(review): the returned frame is only displayed here, it is not assigned
# back to session_features.
purchased_products(session_features, train_tracking)

Unnamed: 0,sid,target,PURCHASED_PRODUCTS
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,False,
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,False,
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,False,
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,False,
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,False,
5,NsKSt1vreFt/tpUjhr2L4dYUlsaEpHiq2qsBMG6d+t+WtV...,False,
6,7Rgtt5iDUSO+/FGfyIaTrWYDiNjF1KbULPrldFj78dX6JO...,False,
7,L9OHLekz6vMT7t609Y6c3fUIJ2i5MKgdam8UOcCOJnDs+V...,False,
8,PydsL5KNw3VBO0pgF6HhczwRONYLyAhUIgzYMQv3Vaj84H...,False,
9,hLnY1trrWZhSv5dFqEZzBCPfvw6RI/vTzG1BrDpPuYvFjY...,False,
