In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [3]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [4]:
# challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
# customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
# isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
# submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
# trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
market     = pd.read_csv(RAW/'Market.csv', low_memory=False)

In [5]:
week_labels = [20180226, 20180305, 20180312, 20180319, 
               20180326, 20180402, 20180409, 20180416, 20180423]

In [6]:
df = pd.read_feather(PROCESSED/f'week_{week_labels[-1] % 10000:04}_diffscount.feather')

In [7]:
from src.utils import get_weeks, week_num

In [8]:
all_weeks = get_weeks()
df['week'] = df.TradeDateKey.apply(lambda x: week_num(all_weeks, x))

In [6]:
%%time
weeks = []
for name in week_labels:
    weeks.append(pd.read_feather(PROCESSED/f'week_{name % 10000:04}_diffscount.feather'))

CPU times: user 5.18 s, sys: 3.47 s, total: 8.66 s
Wall time: 14.8 s


In [8]:
all_weeks = get_weeks()
for w in weeks:
    w['week'] = w.TradeDateKey.apply(lambda x: week_num(all_weeks, x))

In [9]:
market['week'] = market.DateKey.apply(lambda x: week_num(all_weeks, x))

In [10]:
%%time
weekly_averages = market.groupby(['IsinIdx', 'week'])[['Price', 'Yield', 'ZSpread']].agg('mean')

CPU times: user 1.08 s, sys: 560 ms, total: 1.64 s
Wall time: 1.65 s


In [11]:
df = weeks[0].copy()

In [12]:
weekly_averages.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Yield,ZSpread
IsinIdx,week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,51,104.25,7.835,5.505
1,52,103.9,7.8706,5.6304
1,53,102.2,8.0458,5.9564
1,54,102.05,8.0614,6.0388
1,55,103.375,7.9232,5.9136


In [29]:
from collections import defaultdict
a = weekly_averages.to_dict('index')
averages = defaultdict(dict)
for k in a:
    averages[k[0]][k[1]] = a[k]

In [32]:
averages[1][51]

{'Price': 104.25, 'Yield': 7.835, 'ZSpread': 5.505}

In [35]:
from itertools import product

def slow_get_previous_weeks(row, weekly_averages, previous_weeks):
    weeks_idxs = [row.week - w_off for w_off in previous_weeks]
    result = pd.DataFrame(columns=weekly_averages.columns)
    for idx in weeks_idxs:
        if row.IsinIdx in weekly_averages.index:
            bond_data = weekly_averages.loc[row.IsinIdx]
            result.loc[idx] = bond_data.loc[row.week] - bond_data.loc[idx] \
                                if row.week in bond_data.index and idx in bond_data.index \
                                else [100,0,0]
        else:
            result.loc[idx] = [100,0,0]
    result['WeekOffset'] = previous_weeks
    result.set_index('WeekOffset', inplace=True)
    return pd.Series(result.values.reshape(-1), 
                     index=[f'Week{wo}_{col}' \
           for wo, col in product(list(result.index), list(result.columns))])

In [106]:
res = get_previous_weeks(df.loc[0], weekly_averages, [1,2,4,8,16])
res

Week1_Price        0.1000
Week1_Yield       -0.1976
Week1_ZSpread     -0.2276
Week2_Price        0.1498
Week2_Yield       -0.3124
Week2_ZSpread     -0.4060
Week4_Price        0.0748
Week4_Yield       -0.3504
Week4_ZSpread     -0.5326
Week8_Price       -0.0630
Week8_Yield       17.3852
Week8_ZSpread     14.9772
Week16_Price      -0.5374
Week16_Yield       2.2662
Week16_ZSpread     1.4282
dtype: float64

In [126]:
%%time
get_previous_weeks(df.loc[3].to_dict(), averages, [1,2,3,5,8,13])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 725 µs


{1: [0.42499999999999716, -0.09160000000000057, -0.06680000000000064],
 2: [0.5250000000000057, -0.11240000000000272, -0.14040000000000052],
 3: [0.07500000000000284, -0.014000000000001123, -0.10640000000000072],
 5: [-0.5499999999999972, 0.12139999999999773, -0.09200000000000053],
 8: [-1.2999999999999972, 0.29439999999999955, -0.21120000000000028],
 13: [-0.9749999999999943, 0.21419999999999906, -0.38420000000000076]}

In [105]:
%%time
get_previous_weeks(df.loc[0].to_dict(), averages, [1,2,3,5,8,13])

CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 23.9 ms


{'Week13_Price': -0.01500000000001478,
 'Week13_Yield': 0.5340000000000007,
 'Week13_ZSpread': -0.19740000000000046,
 'Week1_Price': 0.09999999999999432,
 'Week1_Yield': -0.19759999999999955,
 'Week1_ZSpread': -0.22759999999999936,
 'Week2_Price': 0.14979999999999905,
 'Week2_Yield': -0.31239999999999846,
 'Week2_ZSpread': -0.40600000000000014,
 'Week3_Price': 0.2248000000000019,
 'Week3_Yield': -0.45759999999999934,
 'Week3_ZSpread': -0.6214000000000008,
 'Week5_Price': 0.03740000000000521,
 'Week5_Yield': -0.35939999999999994,
 'Week5_ZSpread': -0.5816000000000003,
 'Week8_Price': -0.0630000000000166,
 'Week8_Yield': 17.385199999999998,
 'Week8_ZSpread': 14.977199999999998}

In [70]:
%%time
slow_get_previous_weeks(df.loc[0], weekly_averages, [1,2,3,5,8,13])

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 30.5 ms


Week1_Price        0.1000
Week1_Yield       -0.1976
Week1_ZSpread     -0.2276
Week2_Price        0.1498
Week2_Yield       -0.3124
Week2_ZSpread     -0.4060
Week3_Price        0.2248
Week3_Yield       -0.4576
Week3_ZSpread     -0.6214
Week5_Price        0.0374
Week5_Yield       -0.3594
Week5_ZSpread     -0.5816
Week8_Price       -0.0630
Week8_Yield       17.3852
Week8_ZSpread     14.9772
Week13_Price      -0.0150
Week13_Yield       0.5340
Week13_ZSpread    -0.1974
dtype: float64

In [151]:
df.loc[348, ['TradeDateKey', 'IsinIdx']]

TradeDateKey    20180226
IsinIdx            26196
Name: 348, dtype: object

In [156]:
get_previous_weeks(df.loc[348], weekly_averages, [1,2,3,5,8,13])

Week1_Price       100
Week1_Yield         0
Week1_ZSpread       0
Week2_Price       100
Week2_Yield         0
Week2_ZSpread       0
Week3_Price       100
Week3_Yield         0
Week3_ZSpread       0
Week5_Price       100
Week5_Yield         0
Week5_ZSpread       0
Week8_Price       100
Week8_Yield         0
Week8_ZSpread       0
Week13_Price      100
Week13_Yield        0
Week13_ZSpread      0
dtype: object

In [66]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

In [103]:
%%time
# 3%|▎         | 19916/683028 [03:16<1:52:12, 98.50it/s]
df_res = df.progress_apply(lambda r: get_previous_weeks(r, averages, [1,2,4,8,13], weekly_averages.columns), axis=1)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.4 µs


In [98]:
import multiprocessing as mp
partitions = mp.cpu_count()
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = mp.Pool()
    data = pool.map(func, data_split)
    pool.close()
    pool.join()
    return data

In [77]:
def work(r):
    return get_previous_weeks(r, averages, [1,2,4,8,13], weekly_averages.columns)

In [81]:
from functools import partial

In [99]:
def foo(r):
    a = 0
    for i in range(10000000):
        a += i
    r.week = a
    return r

In [131]:
%%time
rows = [row.to_dict() for idx, row in df[['IsinIdx', 'week']].iterrows()]

CPU times: user 33.9 s, sys: 672 ms, total: 34.6 s
Wall time: 33.9 s


In [132]:
rows[:2]

[{'IsinIdx': 21856, 'week': 164}, {'IsinIdx': 21856, 'week': 164}]

In [114]:
def get_previous_weeks(row, averages, previous_weeks):
    weeks_idxs = [row['week'] - w_off for w_off in previous_weeks]
    result = {}
    for idx, weekoff in zip(weeks_idxs, previous_weeks):
        if row['IsinIdx'] in averages and row['week'] in averages[row['IsinIdx']] \
                                      and idx in averages[row['IsinIdx']]:
            result[weekoff] = [x - y for x, y in zip(averages[row['IsinIdx']][row['week']].values(), 
                                                     averages[row['IsinIdx']][idx].values())]
        else:
            result[weekoff] = [100,0,0]
    return result

In [141]:
foo = partial(get_previous_weeks, averages=averages, previous_weeks=[1,2,4,8,13])

In [139]:
get_previous_weeks(rows[0], averages, [1,2,3,5,8,13])

{1: [0.09999999999999432, -0.19759999999999955, -0.22759999999999936],
 2: [0.14979999999999905, -0.31239999999999846, -0.40600000000000014],
 3: [0.2248000000000019, -0.45759999999999934, -0.6214000000000008],
 5: [0.03740000000000521, -0.35939999999999994, -0.5816000000000003],
 8: [-0.0630000000000166, 17.385199999999998, 14.977199999999998],
 13: [-0.01500000000001478, 0.5340000000000007, -0.19740000000000046]}

In [None]:
%%time
# 57s
pool = mp.Pool()
data = pool.map(foo, rows)
pool.close()
pool.join()

In [144]:
len(data)

683028

In [164]:
col2idx = {col:idx for idx,col in enumerate(weekly_averages.columns)}

In [167]:
%%time
for wo, col in product(list(data[0].keys()), list(weekly_averages.columns)):
    df[f'Week{wo}_{col}'] = [r[wo][col2idx[col]] for r in data]           

CPU times: user 4.65 s, sys: 200 ms, total: 4.85 s
Wall time: 4.68 s


In [161]:
data[0]

{1: [0.09999999999999432, -0.19759999999999955, -0.22759999999999936],
 2: [0.14979999999999905, -0.31239999999999846, -0.40600000000000014],
 4: [0.07480000000001041, -0.3503999999999987, -0.5326000000000009],
 8: [-0.0630000000000166, 17.385199999999998, 14.977199999999998],
 13: [-0.01500000000001478, 0.5340000000000007, -0.19740000000000046]}

In [171]:
df.head(5)

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType,week,Week1_Price,Week1_Yield,Week1_ZSpread,Week2_Price,Week2_Yield,Week2_ZSpread,Week4_Price,Week4_Yield,Week4_ZSpread,Week8_Price,Week8_Yield,Week8_ZSpread,Week13_Price,Week13_Yield,Week13_ZSpread
0,20180226,0,21856,Buy,0.0,240,240,5,4,0,0,6,6,Asset Managers & Hedge Funds,Independent Asset Manager,Asia Pacific,HONG KONG,1930,20190114,20140114,SEN,USD,FLOW LOCAL MARKET,ASIA-TOKYO,ASIA MARKET MAKING,ASIA HIGH YIELD,ASIA HIGH YIELD,B,Financial,Real Estate Oper/Develop,Euro-dollar,600000000.0,FIXED,164,0.1,-0.1976,-0.2276,0.1498,-0.3124,-0.406,0.0748,-0.3504,-0.5326,-0.063,17.3852,14.9772,-0.015,0.534,-0.1974
1,20180226,0,21856,Sell,0.0,240,240,5,4,0,0,6,6,Asset Managers & Hedge Funds,Independent Asset Manager,Asia Pacific,HONG KONG,1930,20190114,20140114,SEN,USD,FLOW LOCAL MARKET,ASIA-TOKYO,ASIA MARKET MAKING,ASIA HIGH YIELD,ASIA HIGH YIELD,B,Financial,Real Estate Oper/Develop,Euro-dollar,600000000.0,FIXED,164,0.1,-0.1976,-0.2276,0.1498,-0.3124,-0.406,0.0748,-0.3504,-0.5326,-0.063,17.3852,14.9772,-0.015,0.534,-0.1974
2,20180226,0,24944,Buy,0.0,240,240,5,3,0,0,6,8,Asset Managers & Hedge Funds,Independent Asset Manager,Asia Pacific,HONG KONG,1333,20230817,20160817,SEN,USD,FLOW LOCAL MARKET,ASIA-TOKYO,ASIA MARKET MAKING,ASIA HIGH YIELD,ASIA HIGH YIELD,NR,Financial,Property/Casualty Ins,Euro-dollar,590000000.0,FIXED,164,0.425,-0.0916,-0.0668,0.525,-0.1124,-0.1404,-0.5,0.1104,-0.0008,-1.3,0.2944,-0.2112,-0.975,0.2142,-0.3842
3,20180226,0,24944,Sell,0.0,240,240,5,3,0,0,6,8,Asset Managers & Hedge Funds,Independent Asset Manager,Asia Pacific,HONG KONG,1333,20230817,20160817,SEN,USD,FLOW LOCAL MARKET,ASIA-TOKYO,ASIA MARKET MAKING,ASIA HIGH YIELD,ASIA HIGH YIELD,NR,Financial,Property/Casualty Ins,Euro-dollar,590000000.0,FIXED,164,0.425,-0.0916,-0.0668,0.525,-0.1124,-0.1404,-0.5,0.1104,-0.0008,-1.3,0.2944,-0.2112,-0.975,0.2142,-0.3842
4,20180226,0,25992,Buy,1.0,240,240,5,3,0,0,6,28,Asset Managers & Hedge Funds,Independent Asset Manager,Asia Pacific,HONG KONG,744,20200601,20170601,SEN,USD,FLOW LOCAL MARKET,ASIA-TOKYO,ASIA MARKET MAKING,ASIA HIGH YIELD,ASIA HIGH YIELD,BB,Communications,Internet Connectiv Svcs,Euro-dollar,500000000.0,FIXED,164,0.075,-0.0258,-0.0292,0.025,0.0072,-0.073,-2.445,1.1778,1.0248,-2.95,1.4,1.0158,-2.8876,1.3616,0.8196


In [11]:
df['PreviousWeek'] = df['week'] - 1

In [12]:
len(weekly_averages), len(df)

(1980431, 484758)

In [13]:
%%time
from collections import defaultdict
weekly_dict = weekly_averages.to_dict('index')
averages = defaultdict(dict)
for k in weekly_dict:
    averages[k[0]][k[1]] = weekly_dict[k]

CPU times: user 1min 8s, sys: 272 ms, total: 1min 8s
Wall time: 1min 8s


In [14]:
averages[1][51]

{'Price': 104.25, 'Yield': 7.835, 'ZSpread': 5.505}

In [15]:
default_price = {'Price': 100, 'Yield': 0, 'ZSpread': 0}

In [16]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

In [18]:
last_prices = df.progress_apply(lambda r: \
                averages[r.IsinIdx][r.PreviousWeek] \
                if r.IsinIdx in averages and \
                   r.PreviousWeek in averages[r.IsinIdx]
                else default_price, axis=1)

In [22]:
d = pd.DataFrame(list(last_prices))

In [33]:
for c in d:
    df[c] = d[c]

In [34]:
df.head()

Unnamed: 0,PredictionIdx,CustomerIdx,IsinIdx,BuySell,CustomerInterest,TradeDateKey,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType,week,PreviousWeek,Price,Yield,ZSpread
0,a1e0d80784,1856,13323,Buy,,20180423,296,296,3,3,0,0,6291,34,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,2740,20210315,20130314,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US TMT CDS,B-,"Consumer, Non-cyclic",Printing-Commercial,Global,450000000.0,FIXED,172,171,104.9536,5.986,3.206
1,c2cc6cc2a8,1856,9230,Buy,,20180423,14,14,3,3,3,4,6291,12,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,1446,20240215,20131210,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY FIN AUTOS,B,Financial,Multi-line Insurance,US domestic,400000000.0,FIXED,172,171,79.9826,9.3346,6.475
2,a8e94f6344,1780,9157,Buy,,20180423,296,296,3,296,0,0,2783,0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,1387,20360815,20060815,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH FIN,US FIN SHORT CASH,A+,Industrial,Diversified Manufact Op,Domestic mtn,300000000.0,FLOATING,172,171,85.2818,4.8082,1.901
3,758bae1e35,2129,9131,Buy,,20180423,296,296,3,11,0,0,340,43,Asset Owners,Insurance,Americas,USA,1387,20180501,20080421,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH FIN,US FIN SHORT CASH,A+,Industrial,Diversified Manufact Op,Global,4000000000.0,FIXED,172,171,100.0,0.0,0.0
4,02ab378ee8,1758,7151,Buy,,20180423,296,296,3,33,0,0,1239,1,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,1290,20181115,20081118,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH NONFIN,US ENERGY CASH,BBB+,Utilities,Electric-Integrated,US domestic,300000000.0,FIXED,172,171,103.2588,0.1088,-2.3392


In [23]:
d.head()

Unnamed: 0,Price,Yield,ZSpread
0,104.9536,5.986,3.206
1,79.9826,9.3346,6.475
2,85.2818,4.8082,1.901
3,100.0,0.0,0.0
4,103.2588,0.1088,-2.3392


In [None]:
13323, 171

In [26]:
averages[13323][171]

{'Price': 104.95360000000001, 'Yield': 5.986, 'ZSpread': 3.2060000000000004}

In [24]:
len(d), len(df)

(484758, 484758)

In [19]:
last_prices.head()

0    {'Price': 104.95360000000001, 'Yield': 5.986, ...
1    {'Price': 79.9826, 'Yield': 9.334599999999998,...
2    {'Price': 85.2818, 'Yield': 4.808199999999999,...
3             {'Price': 100, 'Yield': 0, 'ZSpread': 0}
4    {'Price': 103.2588, 'Yield': 0.108800000000000...
dtype: object