In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [124]:
datasetlabel = pd.read_csv('../../data/datasetlabel.csv') # non stationary data
fed_funds = pd.read_csv('../../data/1m-fed_fund_futures.csv') # fed funds data
other_macro = pd.read_csv('../../data/mdpurg9fqkqhobkm.csv') # CPI and yields
fracdiff_data = pd.read_csv('../../data/DATA_FINAL.csv') # fracdiff data
alternative_dataset = pd.read_csv('../../data/fixed_divyield.csv') # Ettore's data

The base is fracdiff_data, which contains the fractionally differentiated data. From that we add the features from fed_funds, other_macro and alternative_dataset. We then .diff() all the features which are not differentiated. 

In [2]:
fracdiff_data = pd.read_csv('../../data/DATA_FINAL.csv') # fracdiff data

In [3]:
fracdiff_data.columns

Index(['permno', 'CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi',
       'pe_inc', 'ps', 'pcf', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm',
       'roa', 'roe', 'roce', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap',
       'capital_ratio', 'cash_lt', 'debt_at', 'debt_ebitda', 'short_debt',
       'lt_debt', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'at_turn', 'rect_turn', 'pay_turn',
       'sale_invcap', 'sale_equity', 'rd_sale', 'adv_sale', 'staff_sale',
       'accrual', 'ptb', 'divyield', 'date', 'prc', 'vol', 'ret', 'retx',
       'mktcap', 'prc_adj', 'naics_processed', 'ret_industry_tot',
       'ret_industry_relative', 'MACD_index', 'rsi'],
      dtype='object')

In [125]:
# Interpolating the data from fracdiff_data 

fracdiff_data = fracdiff_data[fracdiff_data['date'] >= '2008-01-01']
fracdiff_data = fracdiff_data.interpolate(method='linear')

  fracdiff_data = fracdiff_data.interpolate(method='linear')


In [126]:
# test to see if the interpolation worked
np.sum(fracdiff_data.isna().sum(axis=1) > 0)

0

We first merge all the non differentiated features into one and then run .diff() on them 

In [127]:
# convert every 'Date' to datetime
datasetlabel['date'] = pd.to_datetime(datasetlabel['date'])
fed_funds['Date'] = pd.to_datetime(fed_funds['Date'])
fed_funds.rename(columns={'Date': 'date'}, inplace=True)
other_macro['caldt'] = pd.to_datetime(other_macro['caldt'])
other_macro.rename(columns={'caldt': 'date'}, inplace=True)
fracdiff_data['date'] = pd.to_datetime(fracdiff_data['date'])
alternative_dataset['date'] = pd.to_datetime(alternative_dataset['date'])
alternative_dataset['date'] = pd.to_datetime(alternative_dataset['date'])

In [128]:
fed_funds = fed_funds[['date', 'Adj Close', 'Volume']]
fed_funds.rename(columns={'Adj Close': 'fed_funds_adj_close', 'Volume': 'fed_funds_volume'}, inplace=True)

# merge fed_funds with other_macro

# CAN'T USE OTHER_MACRO BECAUSE IT STARTS IN 2009
#macro_data = pd.merge(fed_funds, other_macro, on='date', how='inner')

In [129]:
set(datasetlabel.columns) - set(fracdiff_data.columns)

{'stat_divyeld', 'target'}

In [130]:
alternative_dataset = pd.read_csv('../../data/fixed_divyield.csv') # Ettore's data

In [131]:
# keep only the columns of alternative_dataset that are not in fracdiff_data
cols_to_keep = ['date', 'permno', '12_month_return', '3_month_return']
alternative_dataset = alternative_dataset[cols_to_keep]
alternative_dataset.columns

Index(['date', 'permno', '12_month_return', '3_month_return'], dtype='object')

In [132]:
print(np.sum(alternative_dataset[alternative_dataset['date']>='2008-01-01'].isna().sum(axis=1) > 0))

# find out the columsn that have the most missing values in alternative_dataset[alternative_dataset['date']>='2008-01-01']
alternative_dataset[alternative_dataset['date']>='2008-01-01'].isna().sum().sort_values(ascending=False)

27090


12_month_return    27090
3_month_return      6679
date                   0
permno                 0
dtype: int64

In [136]:
datasetlabel = datasetlabel[['date', 'permno', 'stat_divyeld']]
datasetlabel['date'] = pd.to_datetime(datasetlabel['date'])
alternative_dataset['date'] = pd.to_datetime(alternative_dataset['date'])

In [137]:
# join the single feature of datasetlabel with alternative_dataset on 'date' and 'permno'
non_diff_dataset = pd.merge(datasetlabel, alternative_dataset, on=['date', 'permno'], how='inner')

In [138]:
# merge non_diff_dataset with fed_funds on date
non_diff_dataset = pd.merge(non_diff_dataset, fed_funds, on='date', how='inner')

# interpolate the missing values in non_diff_dataset
non_diff_dataset = non_diff_dataset.interpolate(method='linear')

In [139]:
# run .diff() on the columns that are not 'date' and 'permno' of non_diff_dataset
diff_dataset = non_diff_dataset.copy()
diff_dataset[diff_dataset.columns.difference(['date', 'permno'])] = non_diff_dataset[non_diff_dataset.columns.difference(['date', 'permno'])].diff()

In [140]:
diff_dataset.dropna(inplace=True) # drop the first row of NaNs since it's not possible to calculate the difference

In [147]:
# merge diff_dataset with fracdiff_data on 'date' and 'permno'
definitive_dataset = pd.merge(diff_dataset, fracdiff_data, on=['date', 'permno'], how='inner')

In [None]:
# adding the target variable to the definitive_dataset

In [149]:
definitive_dataset.to_csv('../../data/definitive_dataset.csv', index=False)

### Fixing the returns and MACD, RSI

In [3]:
definitive_dataset = pd.read_csv('../../data/definitive_dataset.csv')

In [4]:
definitive_dataset['date'] = pd.to_datetime(definitive_dataset['date'])

In [5]:
price_data = pd.read_csv('../../data/backtest_prices.csv')

In [26]:
# drop the problematic columns
columns_to_drop = ['3_month_return', '12_month_return', 'prc', 'rsi', 'MACD_index', 'prc_adj']
definitive_dataset.drop(columns=columns_to_drop, inplace=True)

In [28]:
# in price_data calculate the past 3 month return and the past 12 month return
price_data['date'] = pd.to_datetime(price_data['date'])
price_data['permno'] = price_data['permno'].astype(int)

# sort price_data by date and permno
price_data.sort_values(['date', 'permno'], inplace=True)
price_data.reset_index(drop=True, inplace=True)

#get the 3 month return, which corresponds to 252/4 trading days
price_data['3_month_return'] = price_data.groupby('permno')['adj_prc'].pct_change(63)*100
#get the 12 month return, which corresponds to 252 trading days
price_data['12_month_return'] = price_data.groupby('permno')['adj_prc'].pct_change(252)*100

# set to zero the nans in the 3_month_return and 12_month_return columns
price_data['3_month_return'] = price_data[(price_data['date'] >= '2008-01-01')]['3_month_return'].fillna(0)
price_data['12_month_return'] = price_data[(price_data['date'] >= '2008-01-01')]['12_month_return'].fillna(0)

# merge price_data with definitive_dataset on 'date' and 'permno'
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', '3_month_return', '12_month_return']], on=['date', 'permno'], how='left')
price_data.drop(columns=['3_month_return', '12_month_return'], inplace=True)

In [29]:
# calculate the macd_index for price_data

def calculate_macd(group, short_window=12, long_window=26, signal_window=9):
    group.sort_values('date', inplace=True)
    short_ema = group['adj_prc'].ewm(span=short_window, min_periods=1, adjust=False).mean()
    long_ema = group['adj_prc'].ewm(span=long_window, min_periods=1, adjust=False).mean()
    macd_line = short_ema - long_ema
    signal_line = macd_line.ewm(span=signal_window, min_periods=1, adjust=False).mean()
    group['MACD_index'] = signal_line

    return group

price_data = price_data.groupby('permno').apply(calculate_macd)
# drop the multiindex
price_data.reset_index(drop=True, inplace=True)

# merge price_data with definitive_dataset on 'date' and 'permno'
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', 'MACD_index']], on=['date', 'permno'], how='left')

  price_data = price_data.groupby('permno').apply(calculate_macd)


In [30]:
# calculate the RSI for price_data
def calculate_RSI(data: pd.DataFrame, price_col: str, window: int):
    data.sort_values('date', inplace=True)
    # calculate the change up and change down
    change = data[price_col].diff()
    change_up, change_down = change.copy(), change.copy()
    change_up[change_up < 0] = 0
    change_down[change_down > 0] = 0

    # check we did not make mistakes
    assert change.equals(change_up + change_down)

    # calculate EWMAs
    avg_up = change_up.ewm(span=window, adjust=False).mean()
    avg_down = change_down.ewm(span=window, adjust=False).mean().abs()

    # calculate RSI
    rsi = 100 * avg_up / (avg_up + avg_down)
    data["rsi"] = rsi
    # mask = data["permno"] != data["permno"].shift(1)
    # data.loc[mask, "rsi"] = np.nan
    return data

price_data = price_data.groupby('permno').apply(calculate_RSI, price_col='adj_prc', window=14)
price_data.reset_index(drop=True, inplace=True)

# merge price_data with definitive_dataset on 'date' and 'permno'
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', 'rsi']], on=['date', 'permno'], how='left')

  price_data = price_data.groupby('permno').apply(calculate_RSI, price_col='adj_prc', window=14)


In [34]:
# add prc_adj and prc to definitive_dataset
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', 'adj_prc', 'prc']], on=['date', 'permno'], how='left')

In [25]:
price_data[['date', 'permno', 'adj_prc', 'prc']][(price_data['permno'] == 10078)&(price_data['date'] >= '2008-01-01')]

Unnamed: 0,date,permno,adj_prc,prc
409,2008-01-02,10078,17.42,17.42
410,2008-01-03,10078,17.15,17.15
411,2008-01-04,10078,16.31,16.31
412,2008-01-07,10078,16.11,16.11
413,2008-01-08,10078,15.86,15.86
...,...,...,...,...
932,2010-01-20,10078,9.43,9.43
933,2010-01-21,10078,9.47,9.47
934,2010-01-22,10078,9.46,9.46
935,2010-01-25,10078,9.48,9.48


In [11]:
# Run .diff() on columns_to_drop
definitive_dataset.rename(columns={'adj_prc': 'prc_adj'}, inplace=True)
definitive_dataset[columns_to_drop] = definitive_dataset[columns_to_drop].diff()

In [23]:
definitive_dataset[definitive_dataset['permno'] == 10078]

Unnamed: 0,date,permno,stat_divyeld,fed_funds_adj_close,fed_funds_volume,CAPEI,bm,evm,pe_op_basic,pe_op_dil,...,naics_processed,ret_industry_tot,ret_industry_relative,target,3_month_return,12_month_return,MACD_index,rsi,prc_adj,prc
684,2008-01-03,10078,-0.000468,0.000000,-8245.0,-16.225,0.300409,14.264,90.650000,95.421000,...,33.0,0.083588,-0.820743,0.0,-48.618234,-33.894174,-2.025819,-27.932541,-39.52,-39.52
1325,2008-01-04,10078,-0.000538,0.015007,5521.0,-16.225,0.300409,14.264,90.650000,95.421000,...,33.0,-1.993803,-0.319939,0.0,-55.549499,-37.870560,-1.991136,-35.684055,-40.62,-40.62
1964,2008-01-07,10078,-0.000534,0.004997,-9271.0,-16.225,0.300409,14.264,90.650000,95.421000,...,33.0,-0.411120,-0.168356,0.0,-52.276800,-36.039413,-1.940411,-26.770286,-39.89,-39.89
2576,2008-01-08,10078,-0.000533,0.000000,3182.0,-16.225,0.300409,14.264,90.650000,95.421000,...,33.0,-1.386230,0.653228,1.0,-52.718437,-39.283250,-1.895894,-40.880023,-41.02,-41.02
3188,2008-01-09,10078,-0.000531,0.000000,-721.0,-16.225,0.300409,14.264,90.650000,95.421000,...,33.0,0.885814,-1.481740,1.0,-52.210615,-36.250494,-1.845784,-34.551936,-40.55,-40.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386444,2010-01-20,10078,-0.004061,0.000000,-2206.0,-17.621,0.444317,16.450,14.376647,14.584882,...,33.0,-0.550692,0.583925,,-4.815637,154.864865,-0.501854,-4.065474,-37.47,-37.47
387098,2010-01-21,10078,-0.004061,0.000000,1511.0,-17.621,0.444317,16.450,16.633118,16.858706,...,33.0,-0.726364,0.792833,,1.591045,150.529101,-0.542970,20.529450,-36.63,-36.63
387722,2010-01-22,10078,-0.004061,-0.002495,5700.0,-17.621,0.444317,16.450,18.889588,19.132529,...,33.0,-1.226940,1.210381,,8.717610,137.092732,-0.553970,29.841723,-35.66,-35.66
388410,2010-01-25,10078,-0.004061,0.000000,-4041.0,-17.621,0.444317,16.450,21.146059,21.406353,...,33.0,0.391625,-0.358471,,7.987692,95.061728,-0.534522,39.740161,-35.05,-35.05


In [13]:
g = definitive_dataset[['date', 'permno'] + columns_to_drop].copy()

In [14]:
g.groupby('permno').apply(print)

             date  permno  3_month_return  12_month_return    prc        rsi  \
684    2008-01-03   10078      -48.618234       -33.894174 -39.52 -27.932541   
1325   2008-01-04   10078      -55.549499       -37.870560 -40.62 -35.684055   
1964   2008-01-07   10078      -52.276800       -36.039413 -39.89 -26.770286   
2576   2008-01-08   10078      -52.718437       -39.283250 -41.02 -40.880023   
3188   2008-01-09   10078      -52.210615       -36.250494 -40.55 -34.551936   
...           ...     ...             ...              ...    ...        ...   
386444 2010-01-20   10078       -4.815637       154.864865 -37.47  -4.065474   
387098 2010-01-21   10078        1.591045       150.529101 -36.63  20.529450   
387722 2010-01-22   10078        8.717610       137.092732 -35.66  29.841723   
388410 2010-01-25   10078        7.987692        95.061728 -35.05  39.740161   
389099 2010-01-26   10078       10.736761       112.780269 -34.13  49.082540   

        MACD_index  prc_adj  
684      

  g.groupby('permno').apply(print)


In [15]:
def diff_columns(group, columns_to_diff, key='permno'):
    group.sort_values('date', inplace=True)
    group[columns_to_diff] = group[columns_to_diff].diff()
    return group

In [16]:
f = definitive_dataset[['date', 'permno'] + columns_to_drop].copy()

In [17]:
g = g.groupby('permno').apply(diff_columns, columns_to_diff=columns_to_drop)
g.reset_index(drop=True, inplace=True)
g.rename(columns={col: col + '_diff' for col in columns_to_drop}, inplace=True)

  g = g.groupby('permno').apply(diff_columns, columns_to_diff=columns_to_drop)


In [18]:
g

Unnamed: 0,date,permno,3_month_return_diff,12_month_return_diff,prc_diff,rsi_diff,MACD_index_diff,prc_adj_diff
0,2008-01-03,10078,,,,,,
1,2008-01-04,10078,-6.931265,-3.976386,-1.10000,-7.751514,0.034682,-1.100000
2,2008-01-07,10078,3.272699,1.831148,0.73000,8.913769,0.050726,0.730000
3,2008-01-08,10078,-0.441637,-3.243837,-1.13000,-14.109738,0.044516,-1.130000
4,2008-01-09,10078,0.507823,3.032756,0.47000,6.328087,0.050111,0.470000
...,...,...,...,...,...,...,...,...
2404193,2021-12-27,93436,-2.055487,1.149868,26.13994,0.160576,0.657357,8.179980
2404194,2021-12-28,93436,0.678060,-4.195234,-5.83997,-3.093723,0.978283,-2.193323
2404195,2021-12-29,93436,2.072714,1.826457,-1.11003,5.401283,1.145288,0.409990
2404196,2021-12-30,93436,-2.306881,-8.570877,-16.10996,-6.506184,1.109472,-5.543313


In [22]:
f[f['permno'] == 10078]

Unnamed: 0,date,permno,3_month_return,12_month_return,prc,rsi,MACD_index,prc_adj
684,2008-01-03,10078,-48.618234,-33.894174,-39.52,-27.932541,-2.025819,-39.52
1325,2008-01-04,10078,-55.549499,-37.870560,-40.62,-35.684055,-1.991136,-40.62
1964,2008-01-07,10078,-52.276800,-36.039413,-39.89,-26.770286,-1.940411,-39.89
2576,2008-01-08,10078,-52.718437,-39.283250,-41.02,-40.880023,-1.895894,-41.02
3188,2008-01-09,10078,-52.210615,-36.250494,-40.55,-34.551936,-1.845784,-40.55
...,...,...,...,...,...,...,...,...
386444,2010-01-20,10078,-4.815637,154.864865,-37.47,-4.065474,-0.501854,-37.47
387098,2010-01-21,10078,1.591045,150.529101,-36.63,20.529450,-0.542970,-36.63
387722,2010-01-22,10078,8.717610,137.092732,-35.66,29.841723,-0.553970,-35.66
388410,2010-01-25,10078,7.987692,95.061728,-35.05,39.740161,-0.534522,-35.05


In [20]:
price_data[(price_data['permno'] == 10078)&(price_data['date'] >= '2008-01-01')]

Unnamed: 0,date,permno,prc,cfacpr,cfacshr,divamt,adj_prc,adj_prc_w_dividend,Ticker,MACD_index,rsi
409,2008-01-02,10078,17.42,1.0,1.0,0.0,17.42,17.42,JAVA,-0.580383,10.787354
410,2008-01-03,10078,17.15,1.0,1.0,0.0,17.15,17.15,JAVA,-0.608884,9.108154
411,2008-01-04,10078,16.31,1.0,1.0,0.0,16.31,16.31,JAVA,-0.652024,5.843081
412,2008-01-07,10078,16.11,1.0,1.0,0.0,16.11,16.11,JAVA,-0.703784,5.319227
413,2008-01-08,10078,15.86,1.0,1.0,0.0,15.86,15.86,JAVA,-0.760617,4.710165
...,...,...,...,...,...,...,...,...,...,...,...
932,2010-01-20,10078,9.43,1.0,1.0,0.0,9.43,9.43,JAVA,0.159468,71.718390
933,2010-01-21,10078,9.47,1.0,1.0,0.0,9.47,9.47,JAVA,0.153479,78.558728
934,2010-01-22,10078,9.46,1.0,1.0,0.0,9.46,9.46,JAVA,0.147682,73.435238
935,2010-01-25,10078,9.48,1.0,1.0,0.0,9.48,9.48,JAVA,0.142293,76.910335
