In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA      = Path('data')
RAW       = DATA/'raw'
INTERIM   = DATA/'interim'
PROCESSED = DATA/'processed'

In [3]:
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)

In [5]:
validation = pd.read_feather(PROCESSED/'validation.feather')

In [6]:
friday = validation[validation.TradeDateKey == 20180420]

In [8]:
challenge.shape

(484758, 6)

In [7]:
friday.shape

(4367, 6)

### Initialize dataset

In [15]:
%%time
val_friday = {}
date = friday['TradeDateKey'].unique()[0]
for idx, row in challenge.iterrows():
    val_friday[(date, row['CustomerIdx'], 
                row['IsinIdx'], row['BuySell'])] = 0
for idx, row in friday.sort_values('CustomerInterest').iterrows():
    key = (date, row['CustomerIdx'], 
           row['IsinIdx'], row['BuySell'])
    val_friday[key] = row['CustomerInterest']
    key = list(key)
    key[-1] = 'Sell' if row['BuySell'] == 'Buy' else 'Buy'
    key = tuple(key)
    if key not in val_friday:
        val_friday[key] = 0

In [18]:
val_friday = pd.Series(val_friday)

In [24]:
val_friday = pd.DataFrame(val_friday).reset_index()

In [25]:
val_friday.columns = ['TradeDateKey', 'CustomerIdx', 'IsinIdx', 'BuySell', 'CustomerInterest']

In [33]:
challenge.shape

(484758, 6)

In [42]:
%%time
from src.utils import make_val_set
val_friday = make_val_set(friday, challenge)

CPU times: user 29.7 s, sys: 96 ms, total: 29.8 s
Wall time: 29.8 s


In [43]:
val_friday.shape

(487172, 5)

In [46]:
challenge_pairs = set(zip(challenge.CustomerIdx, challenge.IsinIdx))

In [47]:
day_pairs = set(zip(friday.CustomerIdx, friday.IsinIdx))

In [45]:
val_friday.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,20180420,0,21856,Buy,0.0
1,20180420,0,21856,Sell,0.0
2,20180420,0,24944,Buy,0.0
3,20180420,0,24944,Sell,0.0
4,20180420,0,25992,Buy,0.0


In [28]:
friday.sort_values(['TradeDateKey', 'CustomerIdx', 'IsinIdx', 'BuySell']).head()

Unnamed: 0,index,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
6511752,123070,20180420,5,10842,Buy,1.0
6512460,507303,20180420,14,26004,Buy,1.0
6512496,507304,20180420,14,26005,Buy,1.0
6511794,551605,20180420,14,26777,Buy,1.0
6514380,216341,20180420,31,26495,Sell,1.0


In [53]:
val_friday.CustomerInterest.value_counts()

0.0    483985
1.0      3187
Name: CustomerInterest, dtype: int64

In [54]:
# 0.66% of positive labels
3187 / 483985

0.006584914821740343