In [217]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
from collections import Counter
from sklearn.decomposition import PCA
from ast import literal_eval
from functools import partial
import re
sns.set()

DATA = Path('../../data')
RAW  = DATA/'raw'
PROCESSED = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [218]:
%%time
product           = pd.read_csv(RAW/'productid_category.csv', low_memory=False)
train_tracking    = pd.read_csv(RAW/'train_tracking.csv', low_memory=False)
test_tracking     = pd.read_csv(RAW/'test_tracking.csv', low_memory=False)
train_session     = pd.read_csv(RAW/'train_session.csv', low_memory=False)
test_session      = pd.read_csv(RAW/'random_submission.csv', low_memory=False)

CPU times: user 1min 8s, sys: 8.69 s, total: 1min 17s
Wall time: 1min 17s


In [219]:
train_features = train_session.copy()
test_features = test_session.copy()

In [220]:
def add_page(train_tracking):
    def extract_page(x):
        pages_types = ['_LR', '_PA', '_LP', '_CAROUSEL', '_SHOW_CASE']
        pages = ['CAROUSEL', 'PA', 'SEARCH', 'SHOW_CASE', 'LIST_PRODUCT']
        pages_map = [['PURCHASE_PRODUCT_UNKNOW_ORIGIN', 'UNKNOWN']]
        for pages_type in pages_types:
            if x.endswith(pages_type):
                return x[-len(pages_type)+1:]
        for page in pages:
            if x == page:
                return x
        for page_map in pages_map:
            if x == page_map[0]:
                return page_map[1]
        return '::' + x
    train_tracking['page'] = train_tracking.type.apply(extract_page)
    return train_tracking

def simplify_categories(product):
    counter1 = product.groupby('category_product_id_level1').size()
    counter1dict = counter1.to_dict()
    mapcat = {}
    for idx in counter1dict:
        if counter1dict[idx] > 10:
            mapcat[idx] = idx
        else:
            mapcat[idx] = 10e7
    product['cat1'] = product.category_product_id_level1.apply(lambda x: mapcat[x])
    return product

def convert_jsonproducts(train_tracking, column):
    def convert_json(x):
        if pd.isnull(x):
            return x
        else:
            return literal_eval(x)
    train_tracking['product_list'] = train_tracking[column].apply(convert_json)
    return train_tracking

def nn_convert_jsonproducts(train_tracking, column):
    train_tracking['product_list'] = train_tracking[column].apply(literal_eval)
    return train_tracking

def fast_convert_jsonproducts(train_tracking, column):
    prog = re.compile("'sku':\ *'([a-zA-Z0-9\+\=\/]+)'")
    train_tracking['product_list'] = train_tracking[column].apply(lambda val: re.findall(prog, val))
    return train_tracking

In [221]:
print('Loading pages')
train_tracking = add_page(train_tracking)
print('Loading categories')
product = simplify_categories(product)
print('Loading catmap')
catmap = dict(zip(product.product_id, product.cat1))

Loading pages
Loading categories
Loading catmap


In [222]:
def cat_counter(prodlist, catmap):
    try:
        counter = {}
        for prod in prodlist:
            if not prod in catmap:
                # print('CANT FIND ' + prod['sku'])
                # print(prodlist)
                cat = 10e7
            else:
                cat = int(catmap[prod])
            if cat in counter:
                counter[cat] = counter[cat] + 1
            else:
                counter[cat] = 1
        return counter
    except:
        print(prodlist)
        print("ERROR")
        return {}
    
def prod_counter(prodlist):
    try:
        counter = {}
        for prod in prodlist:
            if prod in counter:
                counter[prod] = counter[prod] + 1
            else:
                counter[prod] = 1
        return counter
    except:
        print(prodlist)
        print("ERROR")
        return {}

def merge_counters(counters):
    merged = {}
    for counter in counters:
        for key in counter:
            if key in merged:
                merged[key] = merged[key] + counter[key]
            else:
                merged[key] = counter[key]
        # merged = {**merged, **counter}
    return merged

# Main category in purchases view

In [223]:
def main_cat_purchase(session_features):
    carousel = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_CAROUSEL'].copy(), 'ocarproducts')
    carousel['prod_counter'] = carousel.product_list.apply(cat_counter)
    session_carousel = carousel.groupby('sid').prod_counter.agg(merge_counters).reset_index()
    
    lp = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_LP'].copy(), 'products')
    lp['prod_counter'] = lp.product_list.apply(cat_counter)
    session_lp = lp.groupby('sid').prod_counter.agg(merge_counters).reset_index()
    
    lr = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_LR'].copy(), 'oproducts')
    lr['prod_counter'] = lr.product_list.apply(cat_counter)
    session_lr = lr.groupby('sid').prod_counter.agg(merge_counters).reset_index()
    
    session_categories = pd.merge(pd.merge(session_carousel, session_lp, on='sid', how='left'), session_lr, on='sid', how='left')
    session_categories['prod_counters'] = list(zip(session_categories.prod_counter_car, session_categories.prod_counter_lp, session_categories.prod_counter_lr))
    
    def merge_xyz(row):
        counters = []
        for i in range(3):
            if pd.notnull(row[i]):
                counters.append(row[i])
        merged = merge_counters(counters)
        evaluate = Counter(merged)
        return evaluate.most_common(1)[0][0]
    session_categories['top_cat'] = session_categories.prod_counters.apply(merge_xyz)
    top_cat_sessions = session_categories[['sid', 'top_cat']]
    top_cat_sessions.columns = ['sid', 'MAIN_CATEGORY_PURCHASED_VIEW']
    session_features = pd.merge(session_features, top_cat_sessions, on='sid', how='left')
    return session_features

In [224]:
session_data = []

carousel = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_CAROUSEL'].copy(), 'ocarproducts')
carousel['prod_counter0'] = carousel.product_list.apply(cat_counter)
session_carousel = carousel.groupby('sid').prod_counter0.agg(merge_counters).reset_index()
session_data.append(session_carousel)

lp = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_LP'].copy(), 'products')
lp['prod_counter1'] = lp.product_list.apply(cat_counter)
session_lp = lp.groupby('sid').prod_counter1.agg(merge_counters).reset_index()
session_data.append(session_lp)

lr = convert_jsonproducts(train_tracking[train_tracking.type=='PURCHASE_PRODUCT_LR'].copy(), 'oproducts')
lr['prod_counter2'] = lr.product_list.apply(cat_counter)
session_lr = lr.groupby('sid').prod_counter2.agg(merge_counters).reset_index()
session_data.append(session_lr)

bkt = convert_jsonproducts(train_tracking[train_tracking.type=='ADD_TO_BASKET_CAROUSEL'].copy(), 'ocarproducts')
bkt['prod_counter3'] = bkt.product_list.apply(cat_counter)
session_bkt = bkt.groupby('sid').prod_counter3.agg(merge_counters).reset_index()
session_data.append(session_bkt)

len(carousel), len(lr), len(lp), len(bkt)

TypeError: cat_counter() missing 1 required positional argument: 'catmap'

In [94]:
bkt = convert_jsonproducts(train_tracking[train_tracking.type=='ADD_TO_BASKET_LR'].copy(), 'oproducts')

In [96]:
bkt = bkt.drop(bkt[pd.isnull(bkt.oproducts)].index, axis=0)
bkt['prod_counter5'] = bkt.product_list.apply(cat_counter)
session_bkt = bkt.groupby('sid').prod_counter5.agg(merge_counters).reset_index()
session_data.append(session_bkt)

len(bkt)

31366

In [62]:
session_categories = pd.merge(pd.merge(session_carousel, session_lp, on='sid', how='left'), session_lr, on='sid', how='left')

In [63]:
session_categories['prod_counters'] = list(zip(session_categories.prod_counter_car, session_categories.prod_counter_lp, session_categories.prod_counter_lr))
# session_categories.applymap(merge_xyz)
# session_categories

In [64]:
def merge_xyz(row):
    counters = []
    for i in range(3):
        if pd.notnull(row[i]):
            counters.append(row[i])
    merged = merge_counters(counters)
    evaluate = Counter(merged)
    return evaluate.most_common(1)[0][0]

In [65]:
session_categories['top_cat'] = session_categories.prod_counters.apply(merge_xyz)
top_cat_sessions = session_categories[['sid', 'top_cat']]
top_cat_sessions.columns = ['sid', 'MAIN_CATEGORY_PURCHASED_VIEW']
var = pd.merge(session_features, top_cat_sessions, on='sid', how='left')
len(var[pd.notnull(var.MAIN_CATEGORY_PURCHASED_VIEW)])
# top_cat_sessions

275

In [68]:
275/len(session_features)

0.0020657587344035215

# OCAR Products main category

In [118]:
session_data = []

ocarprods = convert_jsonproducts(train_tracking[pd.notnull(train_tracking.ocarproducts)].copy(), 'ocarproducts')

In [124]:
ocarprods['prod_counter'] = ocarprods.product_list.apply(cat_counter)
session_ocar = ocarprods.groupby('sid').prod_counter.agg(merge_counters).reset_index()
len(session_ocar)

25484

In [126]:
25484/len(session_features)

0.19143198395468852

# Main watched product

In [209]:
def watched_category(session):
    prods = fast_convert_jsonproducts(train_tracking[pd.notnull(train_tracking.products)].copy(), 'products')
    prods['prod_counter'] = prods.product_list.apply(partial(cat_counter, catmap=catmap))
    session_prods = prods.groupby('sid').prod_counter.agg(merge_counters).reset_index()
    
    def top_cat(x):
        evaluation = Counter(x)
        return evaluation.most_common(1)[0][0]
    session_prods['top_cat'] = session_prods.prod_counter.apply(top_cat)
    
    session_cat = session_prods[['sid', 'top_cat']].copy()
    session_cat.columns = ['sid', 'WATCHED_CATEGORY']
    session_features = pd.merge(session_features, session_cat, on='sid', how='left')
    
    return session_features

def watched_product(session_features, train_tracking, product):
    prods = fast_convert_jsonproducts(train_tracking[pd.notnull(train_tracking.products)].copy(), 'products')
    prods['prod_counter'] = prods.product_list.apply(prod_counter)
    session_prods = prods.groupby('sid').prod_counter.agg(merge_counters).reset_index()
    
    def top_cat(x):
        evaluation = Counter(x)
        return evaluation.most_common(1)[0][0]
    session_prods['top_cat'] = session_prods.prod_counter.apply(top_cat)
    
    session_cat = session_prods[['sid', 'top_cat']].copy()
    session_cat.columns = ['sid', 'WATCHED_PRODUCT']
    session_features = pd.merge(session_features, session_cat, on='sid', how='left')
    
    return session_features

In [188]:
len(train_tracking[pd.notnull(train_tracking.products)])

test_data = train_tracking[pd.notnull(train_tracking.products)].sample(10000).copy()

def test(f1, f2):
    return sum(test_data.products.apply(lambda val: f1(val) == f2(val)))
prog = re.compile("'sku':\ *'([a-zA-Z0-9\+\=\/]+)'")
test(lambda val: len(re.findall(prog, val)), lambda x: len(literal_eval(x)))/10000

1.0

In [189]:
prods = fast_convert_jsonproducts(train_tracking[pd.notnull(train_tracking.products)].copy(), 'products')

In [191]:
prods['prod_counter'] = prods.product_list.apply(partial(prod_counter))
session_prods = prods.groupby('sid').prod_counter.agg(merge_counters).reset_index()
len(session_prods)

103124

In [192]:
len(session_prods)/len(session_features)

0.7746520135513772

In [193]:
def top_cat(x):
    evaluation = Counter(x)
    return evaluation.most_common(1)[0][0]
session_prods['top_cat'] = session_prods.prod_counter.apply(top_cat)

In [199]:
session_cat = session_prods[['sid', 'top_cat']].copy()
session_cat.columns = ['sid', 'WATCHED_PRODUCT']
session_features = pd.merge(session_features, session_cat, on='sid', how='left')

In [212]:
train_features = watched_product(train_features, train_tracking, product)
test_features = watched_product(test_features, test_tracking, product)

In [213]:
train_features[['sid', 'WATCHED_PRODUCT']].to_feather(PROCESSED/'train_WP.feather')
test_features[['sid', 'WATCHED_PRODUCT']].to_feather(PROCESSED/'test_WP.feather')

In [216]:
len(train_features.WATCHED_PRODUCT.unique())/len(train_features)

0.43825634939116453