In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')

DATA = Path('data')
RAW  = DATA/'raw'
PROCESSED = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [2]:
train_session         = pd.read_csv(f'{RAW}/train_session.csv', low_memory=False)
train_tracking        = pd.read_csv(f'{RAW}/train_tracking.csv', low_memory=False)
test_tracking         = pd.read_csv(f'{RAW}/test_tracking.csv', low_memory=False)
productid_category    = pd.read_csv(f'{RAW}/productid_category.csv', low_memory=False)
random_submission     = pd.read_csv(f'{RAW}/random_submission.csv', low_memory=False)

In [3]:
import re
from ast import literal_eval

In [4]:
def fast_convert_jsonproducts(train_tracking, column):
    prog = re.compile("'sku':\ *'([a-zA-Z0-9\+\=\/]+)'")
    train_tracking['product_list'] = train_tracking[column].apply(lambda val: re.findall(prog, val))
    return train_tracking

In [5]:
test_data = train_tracking[pd.notnull(train_tracking.products)].sample(10000).copy()

def test(f1, f2):
    return sum(test_data.products.apply(lambda val: f1(val) == f2(val)))
prog = re.compile("'offer_id':\ *'([a-zA-Z0-9\+\=\/]+)'")
test(lambda val: len(re.findall(prog, val)), lambda x: len(literal_eval(x)))/10000

0.0

In [6]:
len(train_tracking.offerid.unique())

105335

In [7]:
len(train_tracking)

1355095

In [8]:
sid_by_offer = train_tracking.groupby('sid')['offerid'].size().reset_index()

In [9]:
sid_by_offer.head()

Unnamed: 0,sid,offerid
0,+++elmtsXqN289wWNi6auO1Fm7gyPkXmsKngig88cIqXDD...,13
1,++0tYP9PmT6jX9O1WjUhWd7w3hWV6xSRMBOdA7HMoBukKs...,5
2,++2CIH+Rnf2MBamibl+EPSMDTKmweZzRgeX/VDBussbBR8...,4
3,++3a8LhdXKrKZJeNiBtuHj8znGF/eQADRi0GSnPSlqRajq...,18
4,++3dzXAmTuAQr+0il3jYZzqk8eoPk6TiffxCqNdQAKyBGp...,7


In [10]:
sid_by_offer_by_target = pd.merge(train_session, sid_by_offer, on='sid')

In [11]:
sid_by_offer_by_target.corr()

Unnamed: 0,target,offerid
target,1.0,0.103226
offerid,0.103226,1.0


In [12]:
sid_by_offer.describe()

Unnamed: 0,offerid
count,133123.0
mean,10.17927
std,14.249718
min,3.0
25%,4.0
50%,6.0
75%,11.0
max,951.0


In [13]:
successful_high_offer = len(sid_by_offer_by_target[(sid_by_offer_by_target['offerid']>6.0) 
                                                   & (sid_by_offer_by_target['target']==False)])

In [14]:
succesful = len(train_session[~train_session['target']])

In [15]:
successful_high_offer/succesful

0.4252795437779233

In [27]:
number_of_actions = train_tracking.groupby('sid')['type'].size().reset_index()

In [28]:
number_of_actions.head()

Unnamed: 0,sid,type
0,+++elmtsXqN289wWNi6auO1Fm7gyPkXmsKngig88cIqXDD...,13
1,++0tYP9PmT6jX9O1WjUhWd7w3hWV6xSRMBOdA7HMoBukKs...,5
2,++2CIH+Rnf2MBamibl+EPSMDTKmweZzRgeX/VDBussbBR8...,4
3,++3a8LhdXKrKZJeNiBtuHj8znGF/eQADRi0GSnPSlqRajq...,18
4,++3dzXAmTuAQr+0il3jYZzqk8eoPk6TiffxCqNdQAKyBGp...,7


In [29]:
to_normalize = pd.merge(sid_by_offer_by_target, number_of_actions, on='sid')

In [30]:
to_normalize.head()

Unnamed: 0,sid,target,offerid,type
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,False,4,4
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,False,13,13
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,False,4,4
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,False,4,4
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,False,3,3


In [31]:
to_normalize['rate_of_offers'] = to_normalize['offerid'] / to_normalize['type']

In [32]:
to_normalize.head()

Unnamed: 0,sid,target,offerid,type,rate_of_offers
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,False,4,4,1.0
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,False,13,13,1.0
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,False,4,4,1.0
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,False,4,4,1.0
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,False,3,3,1.0
