In [1]:
import pandas as pd
import numpy as np

In [3]:
# load the data generated from Open Source Bond Asset Pricing
# the data is the output from MakeondDailyMetrics.py
data = pd.read_csv\
    (r'BondDailyPublic.csv.gzip',
     compression='gzip')
data.columns = data.columns.str.lower()
data['trd_exctn_dt'] = pd.to_datetime(data['trd_exctn_dt'])
if 'unnamed: 0' in data.columns:
    data.drop(columns=['unnamed: 0'], inplace=True)

In [74]:
data.head(5)

Unnamed: 0,cusip_id,trd_exctn_dt,prclean,prfull,acclast,accpmt,accall,ytm,ytmt,qvolume,dvolume,coupon,mod_dur,convexity,cs_dur,cs
698934,000325AA8,2002-07-01,118.25,121.652083,3.402083,53.274653,56.676736,-0.174177,-0.174177,1000000.0,1182500.0,8.875,0.663875,0.815456,-0.195077,-0.195077
698935,000325AA8,2002-07-05,118.25,121.8,3.55,53.274653,56.824653,-0.180752,-0.180752,1000000.0,1182500.0,8.875,0.648025,0.787306,-0.201552,-0.201552
698936,000325AA8,2002-07-15,100.75,104.497223,3.747222,53.274653,57.021875,0.075215,0.075215,18618000.0,18757635.0,8.875,0.544183,0.568071,0.055515,0.055515
698937,000325AA8,2002-07-24,100.6028,104.571897,3.969097,53.274653,57.24375,0.077392,0.077392,20650000.0,20774469.0,8.875,0.519523,0.529683,0.058492,0.058492
698938,000325AA8,2002-07-26,100.625,104.692708,4.067708,53.274653,57.342361,0.076781,0.076781,200000.0,201250.0,8.875,0.508981,0.513842,0.058781,0.058781


In [11]:
# Convert 'trd_exctn_dt' to datetime
data['trd_exctn_dt'] = pd.to_datetime(data['trd_exctn_dt'])

# Sort by 'cusip_id' and 'trd_exctn_dt'
data.sort_values(by=['cusip_id', 'trd_exctn_dt'], inplace=True)

In [75]:
# use the data in 2002 and 2003 for testing purposes
data_copy = data[['cusip_id','trd_exctn_dt','prclean']]

## Filter based on business days between trades (<= 5 days)

In [77]:
# Calculate the number of calendar days between trades
data_copy['days_since_last_trade'] = data_copy.groupby('cusip_id')['trd_exctn_dt'].diff().dt.days.fillna(0).astype(int)

# Define a function to calculate the number of business days between two dates
def calculate_business_days(row):
    if row['days_since_last_trade'] == 0:  # no difference means no business days
        return 0
    # Generate a date range that excludes weekends
    business_days = pd.bdate_range(start=row['trd_exctn_dt'] - pd.Timedelta(days=row['days_since_last_trade']),
                                   end=row['trd_exctn_dt'] - pd.Timedelta(days=1))
    return len(business_days)

# Apply the function to each row
data_copy['business_days_since_last_trade'] = data_copy.apply(calculate_business_days, axis=1)

# filter the trades has less than five business days since last trade
data_copy = data_copy[data_copy['business_days_since_last_trade'] <= 5]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy['days_since_last_trade'] = data_copy.groupby('cusip_id')['trd_exctn_dt'].diff().dt.days.fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy['business_days_since_last_trade'] = data_copy.apply(calculate_business_days, axis=1)


## filter out bonds with less than five trades per month

In [78]:
# Group by cusip_id and month, and filter out bonds with less than five trades per month
data_copy['year_month'] = data_copy['trd_exctn_dt'].dt.to_period('M')

monthly_trade_counts = data_copy.groupby(['cusip_id', 'year_month']).size().reset_index(name='monthly_trades')

eligible_bonds = monthly_trade_counts[monthly_trade_counts['monthly_trades'] >= 5]

# Join the eligible bonds back to the data
data_copy = data_copy.merge(eligible_bonds[['cusip_id', 'year_month']], on=['cusip_id', 'year_month'], how='inner')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy['year_month'] = data_copy['trd_exctn_dt'].dt.to_period('M')


## calculate the daily returns, remove large return reversals and exclude returns with absolute value > 20%

In [79]:
# Calculate daily returns
data_copy['daily_return'] = data_copy.groupby('cusip_id')['prclean'].pct_change()

# Remove large return reversals (20% or more followed by 20% or more in the opposite direction)
data_copy['previous_return'] = data_copy.groupby('cusip_id')['daily_return'].shift()
data_copy = data_copy[~((abs(data_copy['daily_return']) >= 0.2) & (data_copy['daily_return'] * data_copy['previous_return'] < 0))]

# Exclude returns with absolute value > 20%
data_copy = data_copy[abs(data_copy['daily_return']) <= 0.2]

  data_copy['daily_return'] = data_copy.groupby('cusip_id')['prclean'].pct_change()


In [80]:
# Select relevant columns to display the results
result = data_copy[['cusip_id', 'trd_exctn_dt', 'prclean', 'daily_return']]
result['daily_return_bps'] = result['daily_return'] * 10000
result['daily_return_bps'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['daily_return_bps'] = result['daily_return'] * 10000


0.6486045618024078

In [82]:
result.to_csv(r"C:\Users\baiyu\Desktop\finm32900-project\daily_return.csv", index=False)