In [1]:
import pandas as pd
from google.cloud import storage

# TRACE data cleaning: reduction to daily frequency

In [2]:
# Read the gzip-compressed CSV of cleaned TRACE trades into memory.
# The file is described as the output of cleaning plus the "Dick.N" filters
# (presumably the Dick-Nielsen TRACE filters -- TODO confirm).
final_df = pd.read_csv(
    "final_data_all_ver2.csv.gz",
    compression='gzip',
)

In [3]:
# (rows, columns) of the freshly loaded trade-level frame.
final_df.shape

(52658829, 10)

In [4]:

# Discard every trade whose yield ('yld_pt', the target column) is missing,
# then show the resulting (rows, columns).
final_df = final_df.dropna(subset=['yld_pt'])
final_df.shape

(52658829, 10)

In [5]:
# Number of distinct bonds (cusip_id values) in the trade-level data.
final_df['cusip_id'].nunique()

5228

In [6]:
# Preview the first rows to check column names and value formats.
final_df.head()

Unnamed: 0,cusip_id,bond_sym_id,company_symbol,trd_exctn_dt,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct
0,00846UAH4,A3900782,A,2012-10-01,8:44:41,6000.0,101.946,2.968464,S,CORP
1,00846UAH4,A3900782,A,2012-10-01,8:44:41,6000.0,101.946,2.968464,B,CORP
2,00846UAH4,A3900782,A,2012-10-01,8:45:11,6000.0,101.946,2.968464,B,CORP
3,00846UAH4,A3900782,A,2012-10-01,8:45:52,6000.0,101.946,2.968464,S,CORP
4,00846UAH4,A3900782,A,2012-10-02,10:26:29,2000000.0,101.358,3.037826,B,CORP


In [7]:
# Convert the trade execution date from its CSV representation to pandas datetimes
# so that later sorting and date-range comparisons operate on real dates.
final_df['trd_exctn_dt'] = pd.to_datetime(final_df['trd_exctn_dt'])


In [8]:

# Order trades by bond, then by trade date, then by traded volume with the
# largest volume first, so the top row of every bond-day is its biggest trade.
sort_columns = ['cusip_id', 'trd_exctn_dt', 'entrd_vol_qt']
sort_ascending = [True, True, False]
final_df = final_df.sort_values(by=sort_columns, ascending=sort_ascending)

final_df



Unnamed: 0,cusip_id,bond_sym_id,company_symbol,trd_exctn_dt,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct
1422120,00037BAA0,ABB3852123,ABB,2012-10-01,15:26:58,625000.0,101.84900,1.209968,S,CORP
1422119,00037BAA0,ABB3852123,ABB,2012-10-01,11:41:41,510000.0,101.72700,1.237081,S,CORP
1422121,00037BAA0,ABB3852123,ABB,2012-10-01,16:02:48,350000.0,101.91600,1.195093,S,CORP
1422118,00037BAA0,ABB3852123,ABB,2012-10-01,11:09:23,150000.0,101.79500,1.221964,S,CORP
1422117,00037BAA0,ABB3852123,ABB,2012-10-01,5:38:16,15000.0,101.86600,1.206192,S,CORP
...,...,...,...,...,...,...,...,...,...,...
39244340,98978VAS2,PFE4985552,PFE,2022-09-30,13:52:02,9000.0,79.80693,5.251437,B,CORP
39244341,98978VAS2,PFE4985552,PFE,2022-09-30,13:52:02,9000.0,79.74443,5.262887,S,CORP
39244342,98978VAS2,PFE4985552,PFE,2022-09-30,13:52:02,9000.0,79.80693,5.251437,S,CORP
39244343,98978VAS2,PFE4985552,PFE,2022-09-30,13:52:02,9000.0,79.74443,5.262887,B,CORP


In [9]:
# Spot check: number of distinct trading dates for one example bond.
final_df.loc[final_df['cusip_id'] == '00037BAA0', 'trd_exctn_dt'].nunique()

733

In [11]:
# Reduce to one trade per bond per day: keep the first row of every
# (cusip_id, trd_exctn_dt) pair. final_df is expected to be sorted with the
# largest volume first inside each bond-day (done by the earlier sort_values
# call), so the first row is the highest-volume trade of that day.
#
# GroupBy.first() is deliberately not used here: it returns the first
# non-null value of each column independently, so a NaN in the top row would
# be filled from a different trade and the result could mix several trades.
group_keys = ['cusip_id', 'trd_exctn_dt']
grouped_df = (
    final_df
    .dropna(subset=group_keys)                         # groupby also excluded rows with a missing key
    .drop_duplicates(subset=group_keys, keep='first')  # the whole first row of each bond-day
    .sort_values(group_keys)                           # same key ordering the groupby produced
)
# Keys first, remaining columns in their existing order: the layout that
# groupby(...).first().reset_index() used to return.
grouped_df = grouped_df[group_keys + [c for c in grouped_df.columns if c not in group_keys]]
grouped_df = grouped_df.reset_index(drop=True)

print(grouped_df.shape)


(4210976, 10)


In [13]:
# Display the daily (one row per bond per date) frame.
grouped_df

Unnamed: 0,cusip_id,trd_exctn_dt,bond_sym_id,company_symbol,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct
0,00037BAA0,2012-10-01,ABB3852123,ABB,15:26:58,625000.0,101.8490,1.209968,S,CORP
1,00037BAA0,2012-10-02,ABB3852123,ABB,15:19:20,125000.0,101.8380,1.212172,S,CORP
2,00037BAA0,2012-10-03,ABB3852123,ABB,15:01:23,350000.0,102.0410,1.166046,S,CORP
3,00037BAA0,2012-10-09,ABB3852123,ABB,15:25:02,3147000.0,101.8720,1.202911,S,CORP
4,00037BAA0,2012-10-11,ABB3852123,ABB,10:03:42,5000000.0,102.0460,1.163062,S,CORP
...,...,...,...,...,...,...,...,...,...,...
4210971,98978VAS2,2022-09-26,PFE4985552,PFE,8:57:01,184000.0,80.5600,5.108972,S,CORP
4210972,98978VAS2,2022-09-27,PFE4985552,PFE,16:16:30,178000.0,78.8880,5.414995,S,CORP
4210973,98978VAS2,2022-09-28,PFE4985552,PFE,11:28:28,1000000.0,80.0655,5.199817,B,CORP
4210974,98978VAS2,2022-09-29,PFE4985552,PFE,13:00:08,1136000.0,79.3620,5.332033,B,CORP


In [15]:
# Number of daily rows available for each bond, most frequent first.
grouped_df['cusip_id'].value_counts()

46625HJE1    2500
05968LAH5    2485
68389XAP0    2484
852061AS9    2484
369604BD4    2476
             ... 
505597AE4       4
00206RGY2       3
668074AV9       2
674599DS1       2
676167BS7       1
Name: cusip_id, Length: 5228, dtype: int64

In [16]:
# Number of distinct bonds remaining after the daily reduction.
grouped_df['cusip_id'].nunique()

5228

In [17]:
# Column names (and their order) of the daily frame.
grouped_df.columns

Index(['cusip_id', 'trd_exctn_dt', 'bond_sym_id', 'company_symbol',
       'trd_exctn_tm', 'entrd_vol_qt', 'rptd_pr', 'yld_pt', 'rpt_side_cd',
       'sub_prdct'],
      dtype='object')

In [18]:
# Preview the first rows of the daily frame.
grouped_df.head()

Unnamed: 0,cusip_id,trd_exctn_dt,bond_sym_id,company_symbol,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct
0,00037BAA0,2012-10-01,ABB3852123,ABB,15:26:58,625000.0,101.849,1.209968,S,CORP
1,00037BAA0,2012-10-02,ABB3852123,ABB,15:19:20,125000.0,101.838,1.212172,S,CORP
2,00037BAA0,2012-10-03,ABB3852123,ABB,15:01:23,350000.0,102.041,1.166046,S,CORP
3,00037BAA0,2012-10-09,ABB3852123,ABB,15:25:02,3147000.0,101.872,1.202911,S,CORP
4,00037BAA0,2012-10-11,ABB3852123,ABB,10:03:42,5000000.0,102.046,1.163062,S,CORP


In [19]:
# Bounds of the analysis window, described as "the past 5 years"
# (2017-01-01 through 2022-09-01). Only the bounds are defined here; the
# actual row filter is applied where start_date / end_date are used.
grouped_df['trd_exctn_dt'] = pd.to_datetime(grouped_df['trd_exctn_dt'])

start_date, end_date = (pd.to_datetime(bound) for bound in ("2017-01-01", "2022-09-01"))


In [22]:
# Keep daily rows whose trade date lies in [start_date, end_date], both ends included.
in_window = grouped_df['trd_exctn_dt'].between(start_date, end_date)
test = grouped_df[in_window]


In [23]:
# (rows, columns) of the date-filtered daily frame.
test.shape

(3228782, 10)

In [24]:
# Number of daily rows per bond inside the date window.
test['cusip_id'].value_counts()

88167AAE1    1439
88167AAD3    1436
345370CR9    1435
172967KA8    1434
88165FAG7    1432
             ... 
674599DS1       2
46625HJV3       2
668074AV9       2
205887BQ4       2
76720AAC0       1
Name: cusip_id, Length: 5057, dtype: int64

In [25]:

# Attach to every row the number of daily rows its bond ('cusip_id') has in `test`.
# assign() builds a new frame instead of writing a column into `test`, which is a
# filtered slice of another frame; direct column assignment on such a slice is
# what triggers pandas' SettingWithCopyWarning.
test = test.assign(
    no_of_traces=test.groupby('cusip_id')['cusip_id'].transform('count')
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['no_of_traces'] = test.groupby('cusip_id')['cusip_id'].transform('count')


In [26]:
# How many rows carry each no_of_traces value (a view of the per-bond history lengths).
test['no_of_traces'].value_counts()

1419    18447
1422    17064
1418    17016
1415    14150
1412    14120
        ...  
11         11
10         10
2           8
5           5
1           1
Name: no_of_traces, Length: 1358, dtype: int64

In [27]:
# Keep only the bonds that have more than 700 daily rows inside the date window;
# bonds with 700 or fewer rows are discarded.
has_enough_history = test['no_of_traces'] > 700
df_2yr = test.loc[has_enough_history]

df_2yr.shape

(2048710, 11)

In [28]:
# Number of bonds that passed the more-than-700-rows filter.
df_2yr['cusip_id'].nunique()

1957

In [29]:
# Daily row count per retained bond (every value should exceed 700).
df_2yr['cusip_id'].value_counts()

88167AAE1    1439
88167AAD3    1436
345370CR9    1435
172967KA8    1434
88165FAG7    1432
             ... 
655664AT7     702
881609BA8     702
10373QAW0     701
92553PAM4     701
31620MAS5     701
Name: cusip_id, Length: 1957, dtype: int64

In [30]:
# Display the frame restricted to bonds with more than 700 daily rows.
df_2yr

Unnamed: 0,cusip_id,trd_exctn_dt,bond_sym_id,company_symbol,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct,no_of_traces
1616,00037BAB8,2017-01-03,ABB3852125,ABB,10:42:08,45000.0,100.550,2.763052,B,CORP,1218
1617,00037BAB8,2017-01-04,ABB3852125,ABB,8:47:50,65000.0,100.670,2.738611,B,CORP,1218
1618,00037BAB8,2017-01-05,ABB3852125,ABB,9:24:05,250000.0,100.997,2.672547,B,CORP,1218
1619,00037BAB8,2017-01-06,ABB3852125,ABB,11:57:18,75000.0,100.850,2.702097,B,CORP,1218
1620,00037BAB8,2017-01-09,ABB3852125,ABB,9:45:30,500000.0,101.302,2.610951,B,CORP,1218
...,...,...,...,...,...,...,...,...,...,...,...
4209853,98978VAN3,2022-08-26,PFE4666688,PFE,15:10:09,210000.0,98.387,4.208056,B,CORP,764
4209854,98978VAN3,2022-08-29,PFE4666688,PFE,13:37:01,1500000.0,98.036,4.276044,B,CORP,764
4209855,98978VAN3,2022-08-30,PFE4666688,PFE,11:41:04,125000.0,98.041,4.275075,S,CORP,764
4209856,98978VAN3,2022-08-31,PFE4666688,PFE,9:06:10,480000.0,97.872,4.308006,B,CORP,764


In [31]:
# Trim every bond to its 700 most recent daily rows.
#   1. order all rows newest-first by trade date,
#   2. groupby('cusip_id').head(700) then keeps the latest 700 dates of each bond,
#   3. re-sort by bond and ascending date so the frame is easy to work with.
# sort_values returns a new frame, so df_2yr itself is left untouched.
df = (
    df_2yr
    .sort_values('trd_exctn_dt', ascending=False)
    .groupby('cusip_id')
    .head(700)
    .sort_values(['cusip_id', 'trd_exctn_dt'])
)


In [32]:
# Sanity check: every bond should now have exactly 700 rows.
df['cusip_id'].value_counts()

00037BAB8    700
65504LAP2    700
655044AP0    700
655044AH8    700
654902AE5    700
            ... 
247361ZT8    700
247361ZP6    700
247361ZN1    700
247361ZJ0    700
98978VAN3    700
Name: cusip_id, Length: 1957, dtype: int64

In [33]:
# (rows, columns) after trimming each bond to 700 rows.
df.shape

(1369900, 11)

In [34]:
# Display the trimmed frame (index labels are still those inherited from grouped_df).
df

Unnamed: 0,cusip_id,trd_exctn_dt,bond_sym_id,company_symbol,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct,no_of_traces
2134,00037BAB8,2019-03-20,ABB3852125,ABB,10:39:53,75000.0,99.96200,2.887133,B,CORP,1218
2135,00037BAB8,2019-03-21,ABB3852125,ABB,13:57:18,1305000.0,100.36900,2.750151,B,CORP,1218
2136,00037BAB8,2019-03-22,ABB3852125,ABB,16:33:02,1170000.0,100.30700,2.770905,B,CORP,1218
2137,00037BAB8,2019-03-25,ABB3852125,ABB,13:51:00,600000.0,100.78626,2.609905,B,CORP,1218
2138,00037BAB8,2019-03-26,ABB3852125,ABB,17:19:46,100000.0,100.66900,2.649010,B,CORP,1218
...,...,...,...,...,...,...,...,...,...,...,...
4209853,98978VAN3,2022-08-26,PFE4666688,PFE,15:10:09,210000.0,98.38700,4.208056,B,CORP,764
4209854,98978VAN3,2022-08-29,PFE4666688,PFE,13:37:01,1500000.0,98.03600,4.276044,B,CORP,764
4209855,98978VAN3,2022-08-30,PFE4666688,PFE,11:41:04,125000.0,98.04100,4.275075,S,CORP,764
4209856,98978VAN3,2022-08-31,PFE4666688,PFE,9:06:10,480000.0,97.87200,4.308006,B,CORP,764


In [35]:
# Replace the index labels inherited from earlier filtering with a fresh 0..n-1 index.
# drop=True discards the old labels directly, rather than materialising them as a
# column and then removing that column by position (which silently drops real data
# if the column layout ever differs).
df = df.reset_index(drop=True)

In [36]:
# Display the frame again to confirm the index now runs from 0 upwards.
df

Unnamed: 0,cusip_id,trd_exctn_dt,bond_sym_id,company_symbol,trd_exctn_tm,entrd_vol_qt,rptd_pr,yld_pt,rpt_side_cd,sub_prdct,no_of_traces
0,00037BAB8,2019-03-20,ABB3852125,ABB,10:39:53,75000.0,99.96200,2.887133,B,CORP,1218
1,00037BAB8,2019-03-21,ABB3852125,ABB,13:57:18,1305000.0,100.36900,2.750151,B,CORP,1218
2,00037BAB8,2019-03-22,ABB3852125,ABB,16:33:02,1170000.0,100.30700,2.770905,B,CORP,1218
3,00037BAB8,2019-03-25,ABB3852125,ABB,13:51:00,600000.0,100.78626,2.609905,B,CORP,1218
4,00037BAB8,2019-03-26,ABB3852125,ABB,17:19:46,100000.0,100.66900,2.649010,B,CORP,1218
...,...,...,...,...,...,...,...,...,...,...,...
1369895,98978VAN3,2022-08-26,PFE4666688,PFE,15:10:09,210000.0,98.38700,4.208056,B,CORP,764
1369896,98978VAN3,2022-08-29,PFE4666688,PFE,13:37:01,1500000.0,98.03600,4.276044,B,CORP,764
1369897,98978VAN3,2022-08-30,PFE4666688,PFE,11:41:04,125000.0,98.04100,4.275075,S,CORP,764
1369898,98978VAN3,2022-08-31,PFE4666688,PFE,9:06:10,480000.0,97.87200,4.308006,B,CORP,764


In [37]:
# Persist the daily TRACE frame (700 rows per bond) as a pickle in the working directory.
df.to_pickle('final_data_daily_ver1.pkl')

In [38]:
# Load the Mergent bond-characteristics table from a local pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
mergent_data = pd.read_pickle("mergent_final_6k.pkl")

In [39]:
# Narrow the Mergent table to the ten fields used from here on;
# 'complete_cusip' is the key later matched against TRACE's 'cusip_id'.
mergent_columns = [
    'maturity', 'coupon_type', 'gross_spread', 'offering_amt',
    'offering_date', 'offering_price', 'offering_yield', 'principal_amt',
    'complete_cusip', 'coupon',
]
mergent_data = mergent_data[mergent_columns]
mergent_data.shape

(6087, 10)

In [40]:
# Subset of the daily TRACE columns (leaves out trd_exctn_tm, sub_prdct and no_of_traces).
# NOTE(review): the merge with Mergent is performed on `df`, not on `trace_data` --
# confirm whether this frame is still needed.
trace_columns = [
    'trd_exctn_dt', 'cusip_id', 'bond_sym_id', 'company_symbol',
    'entrd_vol_qt', 'rptd_pr', 'yld_pt', 'rpt_side_cd',
]
trace_data = df[trace_columns]

trace_data.shape

(1369900, 8)

In [41]:
# Discard Mergent records that have no offering price, then show (rows, columns).
mergent_data = mergent_data.dropna(subset=['offering_price'])
mergent_data.shape

(5987, 10)

In [42]:
# Inner join of daily TRACE rows with Mergent characteristics on the CUSIP:
# only bonds present in both tables survive.
final_merged_data = df.merge(
    mergent_data,
    how='inner',
    left_on='cusip_id',
    right_on='complete_cusip',
)
final_merged_data.shape

(1345400, 21)

In [43]:
# Number of bonds that found a match in the Mergent table.
final_merged_data['cusip_id'].nunique()

1922

In [44]:
# Distinct-value count for every column of the merged frame.
final_merged_data.nunique()

cusip_id             1922
trd_exctn_dt         1580
bond_sym_id          2010
company_symbol        526
trd_exctn_tm        47320
entrd_vol_qt        18890
rptd_pr            128012
yld_pt            1114810
rpt_side_cd             2
sub_prdct               1
no_of_traces          671
maturity              888
coupon_type             2
gross_spread           76
offering_amt           95
offering_date         842
offering_price        751
offering_yield       1765
principal_amt           3
complete_cusip       1922
coupon                253
dtype: int64

In [45]:
# Convert the two Mergent date fields to pandas datetimes.
for date_col in ('offering_date', 'maturity'):
    final_merged_data[date_col] = pd.to_datetime(final_merged_data[date_col])

In [46]:
# Remove the columns that are not carried into the saved dataset
# (including 'complete_cusip', which duplicates 'cusip_id' after the join).
dropped_columns = ['rpt_side_cd', 'coupon_type', 'complete_cusip', 'no_of_traces', 'trd_exctn_tm', 'sub_prdct']
final_merged_data = final_merged_data.drop(columns=dropped_columns)
final_merged_data.shape

(1345400, 15)

In [47]:
# Remaining column names of the merged frame.
final_merged_data.columns

Index(['cusip_id', 'trd_exctn_dt', 'bond_sym_id', 'company_symbol',
       'entrd_vol_qt', 'rptd_pr', 'yld_pt', 'maturity', 'gross_spread',
       'offering_amt', 'offering_date', 'offering_price', 'offering_yield',
       'principal_amt', 'coupon'],
      dtype='object')

In [48]:
# Display the final merged TRACE + Mergent frame.
final_merged_data

Unnamed: 0,cusip_id,trd_exctn_dt,bond_sym_id,company_symbol,entrd_vol_qt,rptd_pr,yld_pt,maturity,gross_spread,offering_amt,offering_date,offering_price,offering_yield,principal_amt,coupon
0,00037BAB8,2019-03-20,ABB3852125,ABB,75000.0,99.96200,2.887133,2022-05-08,4.5,1250000.0,2012-05-03,97.833,3.12905,1000.0,2.875
1,00037BAB8,2019-03-21,ABB3852125,ABB,1305000.0,100.36900,2.750151,2022-05-08,4.5,1250000.0,2012-05-03,97.833,3.12905,1000.0,2.875
2,00037BAB8,2019-03-22,ABB3852125,ABB,1170000.0,100.30700,2.770905,2022-05-08,4.5,1250000.0,2012-05-03,97.833,3.12905,1000.0,2.875
3,00037BAB8,2019-03-25,ABB3852125,ABB,600000.0,100.78626,2.609905,2022-05-08,4.5,1250000.0,2012-05-03,97.833,3.12905,1000.0,2.875
4,00037BAB8,2019-03-26,ABB3852125,ABB,100000.0,100.66900,2.649010,2022-05-08,4.5,1250000.0,2012-05-03,97.833,3.12905,1000.0,2.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345395,98978VAN3,2022-08-26,PFE4666688,PFE,210000.0,98.38700,4.208056,2028-08-20,6.5,500000.0,2018-08-13,99.811,3.92303,1000.0,3.900
1345396,98978VAN3,2022-08-29,PFE4666688,PFE,1500000.0,98.03600,4.276044,2028-08-20,6.5,500000.0,2018-08-13,99.811,3.92303,1000.0,3.900
1345397,98978VAN3,2022-08-30,PFE4666688,PFE,125000.0,98.04100,4.275075,2028-08-20,6.5,500000.0,2018-08-13,99.811,3.92303,1000.0,3.900
1345398,98978VAN3,2022-08-31,PFE4666688,PFE,480000.0,97.87200,4.308006,2028-08-20,6.5,500000.0,2018-08-13,99.811,3.92303,1000.0,3.900


In [49]:
# Persist the merged Mergent + TRACE daily dataset as a pickle in the working directory.
final_merged_data.to_pickle('final_merged_data_daily_ver1.pkl')