In [10]:
import pandas as pd
import numpy as np

In [11]:
# Load the raw panel; the path is relative to the notebook's working directory.
data = pd.read_csv('../../data/stationary_data.csv')

# Parse the 'date' strings and drop the time-of-day part in a single chain.
# The column ends up holding plain datetime.date objects (object dtype).
data['date'] = pd.to_datetime(data['date']).dt.date

# Placeholder label column: starts out entirely NaN.
data['target'] = np.nan

In [None]:
# Cut-off day: only rows strictly after this date are kept.
# Converted to datetime.date because data['date'] holds datetime.date objects.
subset_date = '2007-12-31'
subset_date = pd.to_datetime(subset_date).date()

# Subset the data after the date.
# .copy() makes data_subset an independent frame, so adding columns to it
# later neither raises SettingWithCopyWarning nor depends on whether pandas
# returned a view of `data`.
data_subset = data[data['date'] > subset_date].copy()

In [None]:
# Quick visual check of the dates that survived the cut-off filter.
data_subset.loc[:, 'date']

In [None]:
# Distinct days present in the subset, in order of first appearance,
# converted from datetime.date objects to a DatetimeIndex.
unique_dates = pd.to_datetime(data_subset['date'].unique())

# Number of distinct days.
n_dates = len(unique_dates)

print(unique_dates)
print(n_dates)

In [None]:
# Add a composite "<date>_<permno>" key column to data_subset.
# .assign builds a new frame and rebinds the name, so the column is never
# written into a filtered slice of `data` (which is what triggers
# SettingWithCopyWarning with plain item assignment).
data_subset = data_subset.assign(
    date_permno=data_subset['date'].astype(str) + '_' + data_subset['permno'].astype(str)
)

print(data_subset['date_permno'])

In [None]:
# Calendar-day distance between an observation and the price it is compared to.
HORIZON_DAYS = 90


def build_forward_prices(frame, trading_days, horizon_days=HORIZON_DAYS):
    """Build a lookup of the price observed about `horizon_days` after each date.

    For every trading day ``d`` the lookup day is the latest trading day that
    falls on or before ``d + horizon_days`` calendar days. Each row of `frame`
    on that lookup day contributes one output row keyed as
    ``"<d>_<permno>"`` and carrying that row's 'prc' as 'lagged_price'.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns 'date' (datetime.date objects), 'permno'
        and 'prc'.
    trading_days : array-like of datetime-like
        The days for which a key should be produced (order and duplicates do
        not matter).
    horizon_days : int, default HORIZON_DAYS
        Non-negative number of calendar days to look ahead.

    Returns
    -------
    pd.DataFrame
        Columns 'date_permno' and 'lagged_price', one row per
        (day, permno present on the lookup day).

    Raises
    ------
    ValueError
        If `horizon_days` is negative.

    Notes
    -----
    NOTE(review): for days closer than `horizon_days` to the end of the
    sample the lookup falls back to the last available trading day, so the
    effective horizon is shorter there (zero for the final day). This mirrors
    the original day-by-day walk-back; confirm whether those rows should be
    left without a price instead.
    """
    if horizon_days < 0:
        raise ValueError('horizon_days must be non-negative')

    # Sorted, de-duplicated calendar so a binary search can replace the
    # original "step back one day at a time" loop.
    days = pd.DatetimeIndex(trading_days).unique().sort_values()
    shifted = days + pd.Timedelta(days=horizon_days)

    # side='right' then -1 gives the last trading day <= d + horizon.
    # The position is never negative because d itself is in `days`.
    positions = np.searchsorted(days.values, shifted.values, side='right') - 1

    # Day -> lookup day, both as datetime.date to match frame['date'].
    calendar = pd.DataFrame({
        'date': days.date,
        'return_date': days[positions].date,
    })

    # Prices re-labelled by the day on which they were observed.
    future = frame[['date', 'permno', 'prc']].rename(
        columns={'date': 'return_date', 'prc': 'lagged_price'}
    )

    # One join instead of growing a DataFrame with concat inside a loop.
    table = calendar.merge(future, on='return_date', how='inner')
    table['date_permno'] = table['date'].astype(str) + '_' + table['permno'].astype(str)

    return table[['date_permno', 'lagged_price']].reset_index(drop=True)


lagged_data = build_forward_prices(data_subset, unique_dates)

In [None]:
# Attach the looked-up price to every row of data_subset via the composite key.
# A left join keeps every data_subset row; rows without a match get NaN.
data_merged = pd.merge(
    data_subset,
    lagged_data,
    how='left',
    on='date_permno',
)

In [None]:
# Peek at the first five merged rows.
data_merged.head(5)

In [None]:
# Price change between the looked-up price and the current price.
price_change = data_merged['lagged_price'] - data_merged['prc']
data_merged['return'] = price_change

# Label: 1 for a strictly positive change, 0 for a strictly negative one.
# Rows with a zero or missing change keep whatever 'target' already held.
went_up = price_change > 0
went_down = price_change < 0
data_merged['target'] = np.where(
    went_down,
    0,
    np.where(went_up, 1, data_merged['target']),
)

In [None]:
# Remove the column display limit (kept global, as before, so later cells
# are unaffected by this edit).
pd.set_option('display.max_columns', None)

# Number of random rows to inspect and the columns worth looking at.
n = 100
columns_to_print = ['permno', 'date', 'prc', 'lagged_price', 'return', 'target']

# Never ask for more rows than exist: DataFrame.sample raises ValueError
# when n exceeds the frame length (sampling without replacement).
n_shown = min(n, len(data_merged))

# Fixed seed so a Restart & Run All shows the same rows every time.
preview = data_merged[columns_to_print].sample(n_shown, random_state=0)

# The default display.max_rows would collapse a 100-row frame to a handful of
# head/tail rows; lift the row limit only for this print.
with pd.option_context('display.max_rows', None):
    print(preview)