In [10]:
import pandas as pd
import numpy as np

In [11]:
data = pd.read_csv('../../data/stationary_data.csv')
data['date'] = pd.to_datetime(data['date'])

# keep only the date and not the time of the day
data['date'] = data['date'].dt.date

# add a column of nans
data['target'] = np.nan

In [12]:
subset_date = '2007-12-31'
subset_date = pd.to_datetime(subset_date).date()

# subset the data after the date
data_subset = data[data['date'] > subset_date]

In [13]:
data_subset['date']

1123866    2008-01-02
1123867    2008-01-02
1123868    2008-01-02
1123869    2008-01-02
1123870    2008-01-02
              ...    
3483629    2023-12-29
3483630    2023-12-29
3483631    2023-12-29
3483632    2023-12-29
3483633    2023-12-29
Name: date, Length: 2359768, dtype: object

In [14]:
# find the unique values of the date column
unique_dates = data_subset['date'].unique()

unique_dates = pd.to_datetime(unique_dates)

print(unique_dates)

n_dates = len(unique_dates)
print(n_dates)

DatetimeIndex(['2008-01-02', '2008-01-03', '2008-01-04', '2008-01-07',
               '2008-01-08', '2008-01-09', '2008-01-10', '2008-01-11',
               '2008-01-14', '2008-01-15',
               ...
               '2023-12-15', '2023-12-18', '2023-12-19', '2023-12-20',
               '2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27',
               '2023-12-28', '2023-12-29'],
              dtype='datetime64[ns]', length=4027, freq=None)
4027


In [15]:
# add a column to the data_subset that contains the date and the permno
data_subset['date_permno'] = data_subset['date'].astype(str) + '_' + data_subset['permno'].astype(str)

1123866    2008-01-02_89954
1123867    2008-01-02_60943
1123868    2008-01-02_24643
1123869    2008-01-02_60206
1123870    2008-01-02_75186
                 ...       
3483629    2023-12-29_32803
3483630    2023-12-29_32870
3483631    2023-12-29_32942
3483632    2023-12-29_60986
3483633    2023-12-29_93436
Name: date_permno, Length: 2359768, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset['date_permno'] = data_subset['date'].astype(str) + '_' + data_subset['permno'].astype(str)


In [16]:
lagged_data = pd.DataFrame(columns=['date_permno', 'lagged_price'])

# loop through the unique dates
for i in range(n_dates):
    # get the date
    date = unique_dates[i].date()

    #print('Date:', date)

    # get the return date
    return_date = date + pd.DateOffset(days=90)
    return_date = return_date.date()

    # find the return date if it wasn't a trading day going back 1 day at a time
    while return_date not in unique_dates:
        return_date = return_date - pd.DateOffset(days=1)
    
    return_date = return_date.date()
    
    # find the indexes the return date
    return_date_mask = data_subset['date'] == return_date

    # get the lagged price
    lagged_price = data_subset[return_date_mask]['prc'].values

    # get the permno at the return date
    lagged_permnos = data_subset[return_date_mask]['permno'].copy()

    # find the lagged date_permno
    date_permnos = str(date) + '_' + lagged_permnos.astype(str)

    # add to the lagged_data dataframe
    lagged_data = pd.concat([lagged_data, pd.DataFrame({'date_permno': date_permnos.values, 'lagged_price': lagged_price})])

    # print the progress deleting the previous line
    print(f'{i+1}/{n_dates} dates processed', end='\r', flush=True)

2/4027 dates processed

  lagged_data = pd.concat([lagged_data, pd.DataFrame({'date_permno': date_permnos.values, 'lagged_price': lagged_price})])


4027/4027 dates processed

In [17]:
# merge the lagged data with the data_subset on the date_permno
data_merged = data_subset.merge(lagged_data, on='date_permno', how='left')

In [18]:
data_merged.head()

Unnamed: 0,date,permno,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,...,naics_processed,ret_industry_tot,ret_industry_relative,MACD_index,rsi,12_month_return,3_month_return,target,date_permno,lagged_price
0,2008-01-02,89954,-11.591,0.0,0.0,-1.495,-1.495,-1.47,-1.458,-0.136,...,0.0,-0.104223,-0.670402,-0.02173,-7.976799,0.009346,-0.084295,,2008-01-02_89954,0.15
1,2008-01-02,60943,-2.102,0.0,0.0,-2.397,-2.416,-2.308,-2.107,-0.118,...,0.0,-0.614095,-1.779303,-0.047955,-6.054644,0.02521,-0.007067,,2008-01-02_60943,0.37
2,2008-01-02,24643,0.109,0.0,0.0,0.206,0.209,0.215,0.206,0.005,...,0.0,-0.614095,0.469282,0.081724,-5.632758,-0.012389,-0.009388,,2008-01-02_24643,-0.05
3,2008-01-02,60206,-0.381,0.0,0.0,-0.202,-0.205,-0.222,-0.233,-0.013,...,0.0,-0.614095,-0.770565,0.021984,-14.848607,0.012122,-0.050671,,2008-01-02_60206,1.86
4,2008-01-02,75186,2.65,0.0,0.0,1.477,1.477,1.477,0.602,0.285,...,0.0,-1.656499,-1.138517,0.006348,-19.941739,0.029637,-0.067404,,2008-01-02_75186,0.79


In [19]:
# calculate the difference between the lagged price and the price
data_merged['return'] = data_merged['lagged_price'] - data_merged['prc']

# set the target to 1 if the return is greater than 0, to 0 if the return is less than 0
data_merged['target'] = np.where(data_merged['return'] > 0, 1, data_merged['target'])
data_merged['target'] = np.where(data_merged['return'] < 0, 0, data_merged['target'])

In [20]:
# remove printing limits
pd.set_option('display.max_columns', None)

# select n random rows
n = 100
columns_to_print = ['permno', 'date', 'prc', 'lagged_price', 'return', 'target']

# print the random rows
print(data_merged[columns_to_print].sample(n))

         permno        date      prc  lagged_price   return  target
1663369   90454  2018-11-20  2.38001          2.75  0.36999     1.0
825904    15560  2013-04-18  0.02000         -0.07 -0.09000     0.0
2246808   91233  2023-02-24 -2.41000         -5.25 -2.84000     0.0
1710371   42200  2019-03-26  1.45000         -0.71 -2.16000     0.0
2294678   77606  2023-07-06 -0.24000         -0.81 -0.57000     0.0
...         ...         ...      ...           ...      ...     ...
1329550   13688  2016-08-01  0.21000          0.29  0.08000     1.0
2332479   36003  2023-10-17  0.07000         -0.10 -0.17000     0.0
687869    88860  2012-05-25  0.34000         -0.29 -0.63000     0.0
892184    38093  2013-09-20  0.13000          0.32  0.19000     1.0
1157086   66800  2015-06-10  1.69000         -1.37 -3.06000     0.0

[100 rows x 6 columns]


In [None]:
# drop the date_permno column
data_merged.drop('date_permno', axis=1, inplace=True)

# save the data to a csv file
data_merged.to_csv('../../data/data_stationary_labeled.csv', index=False)