In [2]:
import pandas as pd
import numpy as np

In [3]:
# this contains the dataset to which we are going to add the target variable

definitive_dataset = pd.read_csv('../../data/definitive_dataset.csv')

In [13]:
# this contains the not differentiated dataset, from which we are going to build the target variable from

data = pd.read_csv('../../data/merged_fin_with_rets.csv')
data['date'] = pd.to_datetime(data['date'])

# keep only the relevant columns
data = data[['date', 'permno', 'prc_adj']]

# keep only the date and not the time of the day
data['date'] = data['date'].dt.date

# add a column of nans
data['target'] = np.nan

In [14]:
subset_date = '2007-12-31'
subset_date = pd.to_datetime(subset_date).date()

# subset the data after the date
data_subset = data[data['date'] > subset_date]

In [15]:
data_subset['date']

1124389    2008-01-02
1124390    2008-01-02
1124391    2008-01-02
1124392    2008-01-02
1124393    2008-01-02
              ...    
3484152    2023-12-29
3484153    2023-12-29
3484154    2023-12-29
3484155    2023-12-29
3484156    2023-12-29
Name: date, Length: 2359768, dtype: object

In [16]:
# find the unique values of the date column
unique_dates = data_subset['date'].unique()

unique_dates = pd.to_datetime(unique_dates)

print(unique_dates)

n_dates = len(unique_dates)
print(n_dates)

DatetimeIndex(['2008-01-02', '2008-01-03', '2008-01-04', '2008-01-07',
               '2008-01-08', '2008-01-09', '2008-01-10', '2008-01-11',
               '2008-01-14', '2008-01-15',
               ...
               '2023-12-15', '2023-12-18', '2023-12-19', '2023-12-20',
               '2023-12-21', '2023-12-22', '2023-12-26', '2023-12-27',
               '2023-12-28', '2023-12-29'],
              dtype='datetime64[ns]', length=4027, freq=None)
4027


In [17]:
# add a column to the data_subset that contains the date and the permno
data_subset['date_permno'] = data_subset['date'].astype(str) + '_' + data_subset['permno'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset['date_permno'] = data_subset['date'].astype(str) + '_' + data_subset['permno'].astype(str)


In [19]:
# drop the rows which have NaN values in the date
data_subset = data_subset.dropna(subset=['date'])

Unnamed: 0,date,permno,prc_adj,target,date_permno
1124389,2008-01-02,89954,3.338141,,2008-01-02_89954
1124390,2008-01-02,60943,9.308660,,2008-01-02_60943
1124391,2008-01-02,24643,52.771613,,2008-01-02_24643
1124392,2008-01-02,60206,10.718681,,2008-01-02_60206
1124393,2008-01-02,75186,9.883326,,2008-01-02_75186
...,...,...,...,...,...
3484152,2023-12-29,32803,24.182703,,2023-12-29_32803
3484153,2023-12-29,32870,7.053167,,2023-12-29_32870
3484154,2023-12-29,32942,71.905941,,2023-12-29_32942
3484155,2023-12-29,60986,4.804874,,2023-12-29_60986


In [21]:
lagged_data = pd.DataFrame(columns=['date_permno', 'lagged_price'])

# loop through the unique dates
for i in range(n_dates):
    # get the date
    date = unique_dates[i].date()

    #print('Date:', date)

    # get the return date
    return_date = date + pd.DateOffset(days=90)
    return_date = return_date.date()

    # find the return date if it wasn't a trading day going back 1 day at a time
    while return_date not in unique_dates:
        return_date = return_date - pd.DateOffset(days=1)
    
    return_date = return_date.date()
    
    # find the indexes the return date
    return_date_mask = data_subset['date'] == return_date

    # get the lagged price
    lagged_price = data_subset[return_date_mask]['prc_adj'].values

    # get the permno at the return date
    lagged_permnos = data_subset[return_date_mask]['permno'].copy()

    # find the lagged date_permno
    date_permnos = str(date) + '_' + lagged_permnos.astype(str)

    # add to the lagged_data dataframe
    lagged_data = pd.concat([lagged_data, pd.DataFrame({'date_permno': date_permnos.values, 'lagged_price': lagged_price})])

    # print the progress deleting the previous line
    if i % 20 == 0:
        print(f'{i+1}/{n_dates} dates processed', end='\r', flush=True)

1/4027 dates processed

  lagged_data = pd.concat([lagged_data, pd.DataFrame({'date_permno': date_permnos.values, 'lagged_price': lagged_price})])


4021/4027 dates processed

In [22]:
# merge the lagged data with the data_subset on the date_permno
data_merged = data_subset.merge(lagged_data, on='date_permno', how='left')

In [23]:
data_merged.head()

Unnamed: 0,date,permno,prc_adj,target,date_permno,lagged_price
0,2008-01-02,89954,3.338141,,2008-01-02_89954,3.696637
1,2008-01-02,60943,9.30866,,2008-01-02_60943,7.783001
2,2008-01-02,24643,52.771613,,2008-01-02_24643,52.442252
3,2008-01-02,60206,10.718681,,2008-01-02_60206,10.759713
4,2008-01-02,75186,9.883326,,2008-01-02_75186,6.08934


In [30]:
# calculate the difference between the lagged price and the price
data_merged['return'] = data_merged['lagged_price'] - data_merged['prc_adj']

# set the target to 1 if the return is greater than 0, to 0 if the return is less than 0
data_merged['target'] = np.where(data_merged['return'] > 0, 1, data_merged['target'])
data_merged['target'] = np.where(data_merged['return'] <= 0, 0, data_merged['target'])

In [29]:
data_merged[data_merged['target'].notna()]

Unnamed: 0,date,permno,prc_adj,target,date_permno,lagged_price,return
0,2008-01-02,89954,3.338141,1.0,2008-01-02_89954,3.696637,0.358496
1,2008-01-02,60943,9.308660,0.0,2008-01-02_60943,7.783001,-1.525660
2,2008-01-02,24643,52.771613,0.0,2008-01-02_24643,52.442252,-0.329361
3,2008-01-02,60206,10.718681,1.0,2008-01-02_60206,10.759713,0.041032
4,2008-01-02,75186,9.883326,0.0,2008-01-02_75186,6.089340,-3.793986
...,...,...,...,...,...,...,...
2359234,2023-12-28,11533,192.936563,0.0,2023-12-28_11533,187.900676,-5.035887
2359235,2023-12-28,81055,84.630299,1.0,2023-12-28_81055,85.316579,0.686281
2359236,2023-12-28,17743,15.094473,1.0,2023-12-28_17743,15.519594,0.425121
2359237,2023-12-28,11600,19.447312,0.0,2023-12-28_11600,19.304233,-0.143080


In [38]:
# remove printing limits
pd.set_option('display.max_columns', None)

# select n random rows
n = 100
columns_to_print = ['permno', 'date', 'prc_adj', 'lagged_price', 'return', 'target']

# print the random rows
print(data_merged[columns_to_print].sample(n))

         permno       date     prc_adj  lagged_price     return  target
1122478   90441 2015-03-19    8.231664      6.737883  -1.493781     0.0
581248    91611 2011-09-19   53.567855     17.097343 -36.470512     0.0
1208165   86356 2015-10-08    9.690427      9.063742  -0.626685     0.0
2068855   89006 2021-10-26   15.955897     18.041576   2.085679     1.0
1885589   90664 2020-06-24   27.601672     25.587618  -2.014054     0.0
...         ...        ...         ...           ...        ...     ...
1412801   35554 2017-02-24  130.381725    124.154817  -6.226909     0.0
451490    91937 2010-11-19    5.328936      6.118272   0.789337     1.0
1047733   80286 2014-09-23   12.415291     17.162489   4.747198     1.0
10078     54181 2008-01-25   29.002696     29.370334   0.367638     1.0
1330575   81540 2016-08-03   19.825916      4.247663 -15.578253     0.0

[100 rows x 6 columns]


In [39]:
# drop the date_permno column
data_merged.drop('date_permno', axis=1, inplace=True)

In [41]:
# when return = 0, target = 0
data_merged['target'] = np.where(data_merged['return'] == 0, 0, data_merged['target'])

# drop lagged_price and return columns
data_merged.drop(['lagged_price', 'return'], axis=1, inplace=True)

In [44]:
data_merged[data_merged['target'].isna()]

Unnamed: 0,date,permno,prc_adj,target
286633,2009-10-30,10078,3.620103,
287154,2009-11-02,10078,3.535139,
287751,2009-11-03,10078,3.640167,
288117,2009-11-04,10078,3.742664,
289162,2009-11-05,10078,3.696347,
...,...,...,...,...
2328625,2023-10-06,79678,20.231223,
2329274,2023-10-09,79678,20.835352,
2330046,2023-10-10,79678,20.695816,
2330314,2023-10-11,79678,20.591058,


In [49]:
# merge data_merged[['date', 'permno', 'target']] with definitive_dataset on date and permno
definitive_dataset['date'] = pd.to_datetime(definitive_dataset['date']).dt.date
data_merged['date'] = pd.to_datetime(data_merged['date']).dt.date
definitive_dataset_2 = definitive_dataset.drop(columns=['target'])
definitive_dataset_2 = definitive_dataset_2.merge(data_merged[['date', 'permno', 'target']], on=['date', 'permno'], how='inner')

In [50]:
definitive_dataset_2

Unnamed: 0,date,permno,stat_divyeld,12_month_return,3_month_return,fed_funds_adj_close,fed_funds_volume,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,npm,opmbd,opmad,gpm,ptpm,cfm,roa,roe,roce,aftret_eq,aftret_invcapx,aftret_equity,GProf,equity_invcap,debt_invcap,totdebt_invcap,capital_ratio,cash_lt,debt_at,debt_ebitda,short_debt,lt_debt,cash_debt,fcf_ocf,lt_ppent,dltt_be,debt_assets,debt_capital,de_ratio,at_turn,rect_turn,pay_turn,sale_invcap,sale_equity,rd_sale,adv_sale,staff_sale,accrual,ptb,divyield,prc,vol,ret,retx,mktcap,prc_adj,naics_processed,ret_industry_tot,ret_industry_relative,MACD_index,rsi,target
0,2008-01-02,10104,0.000000,0.479792,0.202459,0.0,0.0,38.538000,0.067646,10.362100,25.659000,25.954000,26.565000,26.565000,6.126000,17.579000,0.211365,0.361570,0.304006,0.704252,0.332000,0.268929,0.092920,0.172698,0.076605,0.282000,0.196075,0.282000,0.419133,0.092205,0.033413,0.044476,0.033923,0.215495,0.025506,0.155053,0.053966,0.086741,0.118194,0.249890,4.327674,0.128230,0.060067,0.036729,0.907000,0.535159,5.572000,9.027789,0.746525,1.147000,0.020723,0.003210,0.000000,-0.062060,4.923465,0.000000,8.306649,42776121.0,-0.003986,-0.003986,8.248108,6.529796,51.0,-0.942159,0.712659,0.395601,57.868790,0.0
1,2008-01-02,10107,0.011153,-0.169548,0.094900,0.0,0.0,30.821000,0.062908,8.272798,19.371866,19.881841,19.881841,19.881841,5.228371,16.933000,0.210503,0.305421,0.236256,0.215021,0.300828,0.195810,0.147958,0.190613,0.213345,0.190613,0.305000,0.190613,0.209975,0.127531,0.000000,0.000000,0.000000,0.253822,0.000000,0.000000,0.059915,0.000000,0.146605,0.190829,1.980098,0.000000,0.089026,0.010585,0.199720,0.181071,5.593000,2.176983,0.495293,0.495293,0.085386,0.008028,0.000000,-0.065658,7.487779,0.004169,6.221671,63083943.0,-0.010674,-0.010674,4.242842,6.094239,51.0,-0.942159,0.201484,0.544679,46.020460,0.0
2,2008-01-02,10137,-0.009030,0.255381,0.069118,0.0,0.0,-565.060000,0.185281,11.118000,25.064342,25.414219,25.414219,21.528704,0.818877,10.517903,0.041628,0.124141,0.092177,0.124141,0.068761,0.074336,0.038289,0.035819,0.041995,0.052185,0.040433,0.051876,0.037363,0.343581,0.551349,0.581030,0.552248,0.011829,0.130308,3.498000,0.032742,0.409524,0.044230,0.101436,0.266267,0.825937,0.230972,0.198241,0.919565,0.134177,9.455024,3.625649,0.246153,0.534848,0.000000,0.000000,0.000000,-0.024882,0.860896,0.002360,19.415231,1189100.0,-0.010376,-0.010376,4.990780,18.182907,22.0,-1.022388,0.323380,1.182111,47.711559,0.0
3,2008-01-02,10138,0.012088,0.038009,-0.155647,0.0,0.0,33.137747,0.139315,11.795990,25.688000,27.058000,27.058000,27.058000,6.873424,19.311604,0.131519,0.303024,0.240342,0.424530,0.256473,0.172595,0.224700,0.137108,0.345380,0.137108,0.163710,0.137108,0.302208,0.371680,0.000000,0.000000,0.000000,1.858217,0.000000,0.000000,0.017328,0.000000,1.125424,0.747424,0.760277,0.000000,0.089238,0.054348,0.103362,0.639493,8.057970,2.518849,0.826000,0.742927,0.000000,0.008449,0.277098,-0.023658,6.023000,0.010668,27.117651,1761583.0,-0.034001,-0.034001,6.171009,19.560493,52.0,-1.359673,-0.156626,0.133547,37.203035,0.0
4,2008-01-02,10145,0.000360,-0.079194,-0.033880,0.0,0.0,30.245043,0.197000,3.540256,14.633683,14.803675,20.731000,20.731000,0.388762,8.719412,0.037637,0.031443,0.026022,0.059851,0.094000,0.050542,0.137612,0.180650,0.234751,0.246000,0.176288,0.246000,0.292000,0.290585,0.153735,0.183982,0.153735,0.024531,0.045322,0.505857,0.270728,0.129042,0.135814,0.353679,1.495760,0.224381,0.389817,0.200707,1.688457,0.952494,5.554000,6.729000,2.262060,2.943979,0.042000,0.000000,0.000000,-0.043000,5.261000,0.016200,13.653035,4162000.0,-0.027124,-0.027124,5.452684,11.368027,33.0,-1.169556,-0.815948,0.871301,44.456469,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2347308,2023-12-29,93096,0.015650,-0.800546,0.389117,0.0,0.0,5.878350,0.061757,7.542772,12.046030,13.434000,12.082907,12.082907,0.398957,14.679554,0.030110,0.037168,0.035546,0.122655,0.064759,0.033768,0.035397,0.057867,0.041995,0.067697,0.084967,0.067697,0.114379,0.050077,0.135394,0.146893,0.135394,0.013491,0.109984,0.977999,0.034657,0.180945,0.037323,0.016366,0.456695,0.426211,0.148006,0.124500,0.630629,0.405127,225.702726,1.867568,0.463062,1.206115,0.000000,0.001613,0.000000,0.008988,1.442120,0.007998,37.689338,1778807.0,0.002951,0.002951,5.328285,33.819272,45.0,-0.303988,0.438682,2.277614,69.874987,0.0
2347309,2023-12-29,93132,-0.015650,0.748245,-0.377831,0.0,0.0,43.983990,0.002000,24.248300,33.910000,34.579000,36.248000,36.248000,5.009518,15.978327,0.082513,0.047295,0.037302,0.145780,0.043400,0.130186,0.044238,6.919000,1.150494,-5.062804,0.601165,-7.092884,0.580000,0.006506,0.210346,0.226826,0.210346,0.415000,0.024709,0.178469,0.008189,0.026708,0.239248,0.582935,1.538777,7.426000,0.996000,0.976000,223.026000,0.738000,4.659000,4.274357,1.538681,106.182774,0.018173,0.000000,0.000000,-0.168000,19.774144,0.000000,32.189560,2861468.0,-0.014647,-0.014647,3.274497,14.241130,33.0,-0.221914,-0.355005,1.201211,60.657426,0.0
2347310,2023-12-29,93246,0.000000,0.187142,0.182600,0.0,0.0,20.350000,0.198941,8.894814,24.561596,24.969907,29.474172,29.474172,0.797110,23.534000,0.035977,0.035132,0.028408,0.106222,0.023416,0.068892,0.037911,0.058941,0.051617,0.062003,0.055879,0.062003,0.141947,0.606000,0.118883,0.152389,0.119809,0.035310,0.093871,2.671299,0.046693,0.113847,0.011291,0.676000,4.922566,0.186816,0.241710,0.171716,1.211000,0.237148,3.325289,1.805473,0.385061,0.918892,0.009108,0.009775,0.000000,-0.014124,1.560880,0.000000,41.322224,479268.0,-0.012908,-0.012908,4.206589,36.948577,33.0,-0.221914,-0.150663,5.177728,58.654692,0.0
2347311,2023-12-29,93429,0.011063,-0.067255,-0.043228,0.0,0.0,28.762195,0.107970,8.907735,17.668453,17.722381,24.678487,24.678487,3.240811,17.155000,0.057434,0.049577,0.049706,0.264538,0.068944,0.071021,0.042529,0.171791,0.058669,0.062684,0.144000,0.062684,0.093663,0.184379,0.093253,0.091661,0.093253,0.211845,0.063661,0.328963,0.068823,0.125676,0.048604,0.951000,3.755647,0.108039,0.515000,0.091661,1.063000,0.227047,9.308000,15.470000,0.273557,0.468313,0.000000,0.000000,0.019950,-0.049469,3.085447,0.012100,40.576484,521913.0,0.004049,0.004049,16.751922,35.234049,51.0,-0.030868,0.393936,0.589684,59.175364,0.0


In [51]:
# save the data to a csv file
definitive_dataset_2.to_csv('../../data/definitive_dataset.csv', index=False)

In [52]:
data_merged.to_csv('../../data/non_diff_adjusted_price_data.csv', index=False)