In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../../data/definitive_dataset.csv')
data['date'] = pd.to_datetime(data['date'])

# keep only the date and not the time of the day
data['date'] = data['date'].dt.date

# add a column of nans
data['target'] = np.nan

In [4]:
subset_date = '2007-12-31'
subset_date = pd.to_datetime(subset_date).date()

# subset the data after the date
data_subset = data[data['date'] > subset_date]

In [5]:
data_subset['date']

0          2008-01-02
1          2008-01-02
2          2008-01-02
3          2008-01-02
4          2008-01-02
              ...    
2347308    2023-12-29
2347309    2023-12-29
2347310    2023-12-29
2347311    2023-12-29
2347312    2023-12-29
Name: date, Length: 2347313, dtype: object

In [6]:
# find the unique values of the date column
unique_dates = data_subset['date'].unique()

unique_dates = pd.to_datetime(unique_dates)

print(unique_dates)

n_dates = len(unique_dates)
print(n_dates)

DatetimeIndex(['2008-01-02', '2008-01-03', '2008-01-04', '2008-01-07',
               '2008-01-08', '2008-01-09', '2008-01-10', '2008-01-11',
               '2008-01-14', '2008-01-15',
               ...
               '2023-12-15', '2023-12-18', '2023-12-19', '2023-12-20',
               '2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27',
               '2023-12-28', '2023-12-29'],
              dtype='datetime64[ns]', length=4024, freq=None)
4024


In [7]:
# add a column to the data_subset that contains the date and the permno
data_subset['date_permno'] = data_subset['date'].astype(str) + '_' + data_subset['permno'].astype(str)

In [9]:
lagged_data = pd.DataFrame(columns=['date_permno', 'lagged_price'])

# loop through the unique dates
for i in range(n_dates):
    # get the date
    date = unique_dates[i].date()

    #print('Date:', date)

    # get the return date
    return_date = date + pd.DateOffset(days=90)
    return_date = return_date.date()

    # find the return date if it wasn't a trading day going back 1 day at a time
    while return_date not in unique_dates:
        return_date = return_date - pd.DateOffset(days=1)
    
    return_date = return_date.date()
    
    # find the indexes the return date
    return_date_mask = data_subset['date'] == return_date

    # get the lagged price
    lagged_price = data_subset[return_date_mask]['prc'].values

    # get the permno at the return date
    lagged_permnos = data_subset[return_date_mask]['permno'].copy()

    # find the lagged date_permno
    date_permnos = str(date) + '_' + lagged_permnos.astype(str)

    # add to the lagged_data dataframe
    lagged_data = pd.concat([lagged_data, pd.DataFrame({'date_permno': date_permnos.values, 'lagged_price': lagged_price})])

    # print the progress deleting the previous line
    print(f'{i+1}/{n_dates} dates processed', end='\r', flush=True)

1/4024 dates processed

  lagged_data = pd.concat([lagged_data, pd.DataFrame({'date_permno': date_permnos.values, 'lagged_price': lagged_price})])


4024/4024 dates processed

In [10]:
# merge the lagged data with the data_subset on the date_permno
data_merged = data_subset.merge(lagged_data, on='date_permno', how='left')

In [11]:
data_merged.head()

Unnamed: 0,date,permno,stat_divyeld,12_month_return,3_month_return,fed_funds_adj_close,fed_funds_volume,CAPEI,bm,evm,...,mktcap,prc_adj,naics_processed,ret_industry_tot,ret_industry_relative,MACD_index,rsi,target,date_permno,lagged_price
0,2008-01-02,10104,0.0,0.479792,0.202459,0.0,0.0,38.538,0.067646,10.3621,...,8.248108,6.529796,51.0,-0.942159,0.712659,0.395601,57.86879,,2008-01-02_10104,6.81139
1,2008-01-02,10107,0.011153,-0.169548,0.0949,0.0,0.0,30.821,0.062908,8.272798,...,4.242842,6.094239,51.0,-0.942159,0.201484,0.544679,46.02046,,2008-01-02_10107,4.819871
2,2008-01-02,10137,-0.00903,0.255381,0.069118,0.0,0.0,-565.06,0.185281,11.118,...,4.99078,18.182907,22.0,-1.022388,0.32338,1.182111,47.711559,,2008-01-02_10137,15.657113
3,2008-01-02,10138,0.012088,0.038009,-0.155647,0.0,0.0,33.137747,0.139315,11.79599,...,6.171009,19.560493,52.0,-1.359673,-0.156626,0.133547,37.203035,,2008-01-02_10138,20.845747
4,2008-01-02,10145,0.00036,-0.079194,-0.03388,0.0,0.0,30.245043,0.197,3.540256,...,5.452684,11.368027,33.0,-1.169556,-0.815948,0.871301,44.456469,,2008-01-02_10145,12.587665


In [12]:
# calculate the difference between the lagged price and the price
data_merged['return'] = data_merged['lagged_price'] - data_merged['prc']

# set the target to 1 if the return is greater than 0, to 0 if the return is less than 0
data_merged['target'] = np.where(data_merged['return'] > 0, 1, data_merged['target'])
data_merged['target'] = np.where(data_merged['return'] < 0, 0, data_merged['target'])

In [13]:
# remove printing limits
pd.set_option('display.max_columns', None)

# select n random rows
n = 100
columns_to_print = ['permno', 'date', 'prc', 'lagged_price', 'return', 'target']

# print the random rows
print(data_merged[columns_to_print].sample(n))

         permno        date         prc  lagged_price     return  target
645775    23114  2012-02-28   48.283470     49.106980   0.823510     1.0
1051905   35044  2014-10-16    3.753597      4.001785   0.248188     1.0
957152    93436  2014-03-06  227.312738    183.664076 -43.648662     0.0
26238     21573  2008-03-06   24.909001     20.246875  -4.662126     0.0
1414775   78034  2017-03-23   39.788347     42.520820   2.732473     1.0
...         ...         ...         ...           ...        ...     ...
1698501   53065  2019-03-21   19.698384     19.753822   0.055439     1.0
1632113   59176  2018-09-27  107.850000     89.500000 -18.350000     0.0
809288    11552  2013-03-22  113.130000    116.890000   3.760000     1.0
974921    10145  2014-04-17   21.545646     21.617808   0.072163     1.0
1405184   91937  2017-02-28   11.445655     12.923791   1.478136     1.0

[100 rows x 6 columns]


In [17]:
# drop the date_permno column
data_merged.drop('date_permno', axis=1, inplace=True)

KeyError: "['date_permno'] not found in axis"

In [21]:
# when return = 0, target = 0
data_merged['target'] = np.where(data_merged['return'] == 0, 0, data_merged['target'])

# drop lagged_price and return columns
data_merged.drop(['lagged_price', 'return'], axis=1, inplace=True)

In [23]:
# save the data to a csv file
data_merged.to_csv('../../data/definitive_dataset.csv', index=False)