In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise pandas' display limits so wide / long frames are shown in cell output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# Switch on seaborn's plot styling.
sns.set()

# Move the working directory two levels up before importing `src` and before
# the relative `data/...` paths are built — presumably this lands on the
# project root; confirm.
# NOTE(review): the path is relative to the current directory, so running this
# cell a second time in the same kernel moves up two more levels.
os.chdir('../..')
from src import utils

In [2]:
# Data directory layout: each stage directory sits directly under DATA.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
# Load the six raw CSV tables from the RAW directory, one DataFrame per file.
# Every read passes low_memory=False.
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
market     = pd.read_csv(RAW/'Market.csv', low_memory=False)

In [4]:
# Integer labels of the weekly files to load. The values read as YYYYMMDD
# dates spaced seven days apart; only the last four digits are used when the
# feather file names are built.
week_labels = [20180226, 20180305, 20180312, 20180319, 
               20180326, 20180402, 20180409, 20180416, 20180423]

In [5]:
%%time
weeks = []
for name in week_labels:
    weeks.append(pd.read_feather(PROCESSED/f'week_{name % 10000:04}_diffscount.feather'))

CPU times: user 4.48 s, sys: 2.7 s, total: 7.18 s
Wall time: 13.3 s


In [12]:
from src.utils import get_weeks, week_num

In [15]:
# Result of get_weeks() (imported from src.utils); it is handed to week_num
# as the first argument of every call.
all_weeks = get_weeks()
# Add a 'week' column to each weekly frame, computed element by element from
# its TradeDateKey column.
for w in weeks:
    w['week'] = w['TradeDateKey'].apply(
        lambda date_key: week_num(all_weeks, date_key)
    )

In [24]:
# Add a 'week' column to the market table, computed element by element from
# its DateKey column with the same week_num helper.
market['week'] = market['DateKey'].apply(
    lambda date_key: week_num(all_weeks, date_key)
)

In [25]:
%%time
weekly_averages = market.groupby(['IsinIdx', 'week'])[['Price', 'Yield', 'ZSpread']].agg('mean')

CPU times: user 1.52 s, sys: 680 ms, total: 2.2 s
Wall time: 2.66 s


In [None]:
def get_previous_weeks(current_week, previous_weeks):
    """Subtract earlier weeks' values from the current week's values.

    Reads the module-level ``weekly_values`` object through ``.loc``.
    NOTE(review): ``weekly_values`` is not defined in any visible cell, and a
    later cell defines another function with this same name — confirm whether
    this draft is still needed.

    Parameters
    ----------
    current_week
        Label of the week to compare from.
    previous_weeks
        Iterable of offsets; each one is subtracted from ``current_week`` to
        obtain the label of an earlier week.

    Returns
    -------
    ``weekly_values.loc[current_week]`` minus the entries selected for the
    earlier week labels.
    """
    lagged_labels = []
    for offset in previous_weeks:
        lagged_labels.append(current_week - offset)
    current_values = weekly_values.loc[current_week]
    return current_values - weekly_values.loc[lagged_labels]

In [27]:
# Work on a copy of the first loaded weekly frame so the entry in `weeks`
# is not modified by later cells.
df = weeks[0].copy()

In [None]:
# NOTE(review): no key columns are passed and the result is not assigned.
# DataFrame.set_index expects the column(s) to index by, so this cell most
# likely raises as written — confirm the intended index or remove the cell.
df.set_index()

In [78]:
from itertools import product

def get_previous_weeks(row, weekly_averages, previous_weeks):
    """Lagged differences of the weekly market averages for one row.

    For each offset in ``previous_weeks`` the averages of week
    ``row['week'] - offset`` are subtracted from the averages of
    ``row['week']`` for the instrument ``row['IsinIdx']``.

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide ``'IsinIdx'`` and ``'week'``.
    weekly_averages : pd.DataFrame
        Indexed by ``(IsinIdx, week)``; every column yields one feature per
        offset.
    previous_weeks : iterable of int
        Week offsets to look back (e.g. ``[1, 2, 4, 8, 16]``).

    Returns
    -------
    pd.Series
        One value per (offset, column) pair, labelled ``Week{offset}_{column}``
        and ordered offset-major. Offsets that point to a week with no data
        for the instrument produce NaN.

    Raises
    ------
    KeyError
        If the instrument, or its current week, is absent from
        ``weekly_averages``.
    """
    previous_weeks = list(previous_weeks)
    isin_history = weekly_averages.loc[row['IsinIdx']]
    lagged_weeks = [row['week'] - w_off for w_off in previous_weeks]
    # reindex (rather than .loc) keeps one row per requested week and fills
    # weeks that have no data with NaN instead of raising KeyError.
    diffs = isin_history.loc[row['week']] - isin_history.reindex(lagged_weeks)
    # Labels come from this call's own offsets and columns; nothing is read
    # from notebook-level variables.
    labels = [f'Week{wo}_{col}'
              for wo, col in product(previous_weeks, list(diffs.columns))]
    return pd.Series(diffs.values.reshape(-1), index=labels)

In [54]:
# Try the lag computation on the row labelled 0 with offsets 1, 2, 4, 8, 16.
# NOTE(review): the output recorded for this cell is a DataFrame indexed by
# WeekOffset, while the get_previous_weeks defined in the notebook returns a
# pd.Series; this cell's execution count (54) is lower than the definition's
# (78), so the output appears to come from an earlier version.
res = get_previous_weeks(df.loc[0], weekly_averages, [1,2,4,8,16])
res

Unnamed: 0_level_0,Price,Yield,ZSpread
WeekOffset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.1,-0.1976,-0.2276
2,0.1498,-0.3124,-0.406
4,0.0748,-0.3504,-0.5326
8,-0.063,17.3852,14.9772
16,-0.5374,2.2662,1.4282


In [79]:
# Same row and offsets again; the recorded output is the flattened Series
# with one `Week{offset}_{column}` entry per offset and column.
get_previous_weeks(df.loc[0], weekly_averages, [1,2,4,8,16])

Week1_Price        0.1000
Week1_Yield       -0.1976
Week1_ZSpread     -0.2276
Week2_Price        0.1498
Week2_Yield       -0.3124
Week2_ZSpread     -0.4060
Week4_Price        0.0748
Week4_Yield       -0.3504
Week4_ZSpread     -0.5326
Week8_Price       -0.0630
Week8_Yield       17.3852
Week8_ZSpread     14.9772
Week16_Price      -0.5374
Week16_Yield       2.2662
Week16_ZSpread     1.4282
dtype: float64

In [64]:
# Alternative set of offsets (1, 2, 3, 5, 8, 13) for the same row.
# NOTE(review): the recorded output is a DataFrame, so it appears to predate
# the Series-returning definition (execution count 64 vs 78).
get_previous_weeks(df.loc[0], weekly_averages, [1,2,3,5,8,13])

Unnamed: 0_level_0,Price,Yield,ZSpread
WeekOffset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.1,-0.1976,-0.2276
2,0.1498,-0.3124,-0.406
3,0.2248,-0.4576,-0.6214
5,0.0374,-0.3594,-0.5816
8,-0.063,17.3852,14.9772
13,-0.015,0.534,-0.1974


In [81]:
%%time
# Compute the lag features for every row of df by calling get_previous_weeks
# row by row (axis=1) with offsets 1, 2, 4, 8 and 16.
# No timing output is recorded for this cell in the notebook.
df_res = df.apply(lambda r: get_previous_weeks(r, weekly_averages, [1,2,4,8,16]), axis=1)

In [26]:
# Preview the first rows of the per-(IsinIdx, week) means.
weekly_averages.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Yield,ZSpread
IsinIdx,week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,51,104.25,7.835,5.505
1,52,103.9,7.8706,5.6304
1,53,102.2,8.0458,5.9564
1,54,102.05,8.0614,6.0388
1,55,103.375,7.9232,5.9136


In [20]:
# Summary statistics of the 'week' column of the last loaded frame; in the
# recorded output every row carries the same value (172).
weeks[-1].week.describe()

count    484758.0
mean        172.0
std           0.0
min         172.0
25%         172.0
50%         172.0
75%         172.0
max         172.0
Name: week, dtype: float64