In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
datasetlabel = pd.read_csv('../../data/datasetlabel.csv') # non stationary data
fed_funds = pd.read_csv('../../data/1m-fed_fund_futures.csv') # fed funds data
other_macro = pd.read_csv('../../data/mdpurg9fqkqhobkm.csv') # CPI and yields
fracdiff_data = pd.read_csv('../../data/DATA_FINAL.csv') # fracdiff data
alternative_dataset = pd.read_csv('../../data/fixed_divyield.csv') # Ettore's data

The base is fracdiff_data, which contains the fractionally differentiated data. From that we add the features from fed_funds, other_macro and alternative_dataset. We then .diff() all the features which are not differentiated. 

In [3]:
fracdiff_data = pd.read_csv('../../data/DATA_FINAL.csv') # fracdiff data

In [4]:
fracdiff_data.columns

Index(['permno', 'CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi',
       'pe_inc', 'ps', 'pcf', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm',
       'roa', 'roe', 'roce', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap',
       'capital_ratio', 'cash_lt', 'debt_at', 'debt_ebitda', 'short_debt',
       'lt_debt', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'at_turn', 'rect_turn', 'pay_turn',
       'sale_invcap', 'sale_equity', 'rd_sale', 'adv_sale', 'staff_sale',
       'accrual', 'ptb', 'divyield', 'date', 'prc', 'vol', 'ret', 'retx',
       'mktcap', 'prc_adj', 'naics_processed', 'ret_industry_tot',
       'ret_industry_relative', 'MACD_index', 'rsi'],
      dtype='object')

In [5]:
# Interpolating the data from fracdiff_data 

fracdiff_data = fracdiff_data[fracdiff_data['date'] >= '2008-01-01']
fracdiff_data = fracdiff_data.interpolate(method='linear')

  fracdiff_data = fracdiff_data.interpolate(method='linear')


In [6]:
# test to see if the interpolation worked
np.sum(fracdiff_data.isna().sum(axis=1) > 0)

0

We first merge all the non differentiated features into one and then run .diff() on them 

In [7]:
# convert every 'Date' to datetime
datasetlabel['date'] = pd.to_datetime(datasetlabel['date'])
fed_funds['Date'] = pd.to_datetime(fed_funds['Date'])
fed_funds.rename(columns={'Date': 'date'}, inplace=True)
other_macro['caldt'] = pd.to_datetime(other_macro['caldt'])
other_macro.rename(columns={'caldt': 'date'}, inplace=True)
fracdiff_data['date'] = pd.to_datetime(fracdiff_data['date'])
alternative_dataset['date'] = pd.to_datetime(alternative_dataset['date'])
alternative_dataset['date'] = pd.to_datetime(alternative_dataset['date'])

In [8]:
fed_funds = fed_funds[['date', 'Adj Close', 'Volume']]
fed_funds.rename(columns={'Adj Close': 'fed_funds_adj_close', 'Volume': 'fed_funds_volume'}, inplace=True)

# merge fed_funds with other_macro

# CAN'T USE OTHER_MACRO BECAUSE IT STARTS IN 2009
#macro_data = pd.merge(fed_funds, other_macro, on='date', how='inner')

In [9]:
set(datasetlabel.columns) - set(fracdiff_data.columns)

{'stat_divyeld', 'target'}

In [10]:
alternative_dataset = pd.read_csv('../../data/fixed_divyield.csv') # Ettore's data

In [11]:
# keep only the columns of alternative_dataset that are not in fracdiff_data
cols_to_keep = ['date', 'permno', '12_month_return', '3_month_return']
alternative_dataset = alternative_dataset[cols_to_keep]
alternative_dataset.columns

Index(['date', 'permno', '12_month_return', '3_month_return'], dtype='object')

In [12]:
print(np.sum(alternative_dataset[alternative_dataset['date']>='2008-01-01'].isna().sum(axis=1) > 0))

# find out the columsn that have the most missing values in alternative_dataset[alternative_dataset['date']>='2008-01-01']
alternative_dataset[alternative_dataset['date']>='2008-01-01'].isna().sum().sort_values(ascending=False)

27090


12_month_return    27090
3_month_return      6679
date                   0
permno                 0
dtype: int64

In [13]:
datasetlabel = datasetlabel[['date', 'permno', 'stat_divyeld']]
datasetlabel['date'] = pd.to_datetime(datasetlabel['date'])
alternative_dataset['date'] = pd.to_datetime(alternative_dataset['date'])

In [14]:
# join the single feature of datasetlabel with alternative_dataset on 'date' and 'permno'
non_diff_dataset = pd.merge(datasetlabel, alternative_dataset, on=['date', 'permno'], how='inner')

In [15]:
# merge non_diff_dataset with fed_funds on date
non_diff_dataset = pd.merge(non_diff_dataset, fed_funds, on='date', how='inner')

# interpolate the missing values in non_diff_dataset
non_diff_dataset = non_diff_dataset.interpolate(method='linear')

In [16]:
# run .diff() on the columns that are not 'date' and 'permno' of non_diff_dataset
diff_dataset = non_diff_dataset.copy()

def diff_columns(group, columns_to_diff, key='permno'):
    group.sort_values('date', inplace=True)
    group[columns_to_diff] = group[columns_to_diff].diff()
    return group

diff_dataset[diff_dataset.columns.difference(['date', 'permno'])] = non_diff_dataset[non_diff_dataset.columns.difference(['date', 'permno'])].diff()
diff_dataset = diff_dataset.groupby('permno').apply(diff_columns, columns_to_diff=diff_dataset.columns.difference(['date', 'permno']))

  diff_dataset = diff_dataset.groupby('permno').apply(diff_columns, columns_to_diff=diff_dataset.columns.difference(['date', 'permno']))


In [17]:
diff_dataset.dropna(inplace=True) # drop the first row of NaNs since it's not possible to calculate the difference

In [20]:
diff_dataset.reset_index(drop=True, inplace=True)

In [22]:
# merge diff_dataset with fracdiff_data on 'date' and 'permno'
definitive_dataset = pd.merge(diff_dataset, fracdiff_data, on=['date', 'permno'], how='inner')

In [None]:
# adding the target variable to the definitive_dataset

In [None]:
definitive_dataset.to_csv('../../data/definitive_dataset.csv', index=False)

### Fixing the returns and MACD, RSI

In [3]:
definitive_dataset = pd.read_csv('../../data/definitive_dataset.csv')

In [23]:
definitive_dataset['date'] = pd.to_datetime(definitive_dataset['date'])

In [24]:
price_data = pd.read_csv('../../data/backtest_prices.csv')

In [25]:
# drop the problematic columns
columns_to_drop = ['3_month_return', '12_month_return', 'prc', 'rsi', 'MACD_index', 'prc_adj']
definitive_dataset.drop(columns=columns_to_drop, inplace=True)

In [26]:
# in price_data calculate the past 3 month return and the past 12 month return
price_data['date'] = pd.to_datetime(price_data['date'])
price_data['permno'] = price_data['permno'].astype(int)

# sort price_data by date and permno
price_data.sort_values(['date', 'permno'], inplace=True)
price_data.reset_index(drop=True, inplace=True)

#get the 3 month return, which corresponds to 252/4 trading days
price_data['3_month_return'] = price_data.groupby('permno')['adj_prc'].pct_change(63)*100
#get the 12 month return, which corresponds to 252 trading days
price_data['12_month_return'] = price_data.groupby('permno')['adj_prc'].pct_change(252)*100

# set to zero the nans in the 3_month_return and 12_month_return columns
price_data['3_month_return'] = price_data[(price_data['date'] >= '2008-01-01')]['3_month_return'].fillna(0)
price_data['12_month_return'] = price_data[(price_data['date'] >= '2008-01-01')]['12_month_return'].fillna(0)

# merge price_data with definitive_dataset on 'date' and 'permno'
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', '3_month_return', '12_month_return']], on=['date', 'permno'], how='left')
price_data.drop(columns=['3_month_return', '12_month_return'], inplace=True)

In [27]:
# calculate the macd_index for price_data

def calculate_macd(group, short_window=12, long_window=26, signal_window=9):
    group.sort_values('date', inplace=True)
    short_ema = group['adj_prc'].ewm(span=short_window, min_periods=1, adjust=False).mean()
    long_ema = group['adj_prc'].ewm(span=long_window, min_periods=1, adjust=False).mean()
    macd_line = short_ema - long_ema
    signal_line = macd_line.ewm(span=signal_window, min_periods=1, adjust=False).mean()
    group['MACD_index'] = signal_line

    return group

price_data = price_data.groupby('permno').apply(calculate_macd)
# drop the multiindex
price_data.reset_index(drop=True, inplace=True)

# merge price_data with definitive_dataset on 'date' and 'permno'
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', 'MACD_index']], on=['date', 'permno'], how='left')

  price_data = price_data.groupby('permno').apply(calculate_macd)


In [28]:
# calculate the RSI for price_data
def calculate_RSI(data: pd.DataFrame, price_col: str, window: int):
    data.sort_values('date', inplace=True)
    # calculate the change up and change down
    change = data[price_col].diff()
    change_up, change_down = change.copy(), change.copy()
    change_up[change_up < 0] = 0
    change_down[change_down > 0] = 0

    # check we did not make mistakes
    assert change.equals(change_up + change_down)

    # calculate EWMAs
    avg_up = change_up.ewm(span=window, adjust=False).mean()
    avg_down = change_down.ewm(span=window, adjust=False).mean().abs()

    # calculate RSI
    rsi = 100 * avg_up / (avg_up + avg_down)
    data["rsi"] = rsi
    # mask = data["permno"] != data["permno"].shift(1)
    # data.loc[mask, "rsi"] = np.nan
    return data

price_data = price_data.groupby('permno').apply(calculate_RSI, price_col='adj_prc', window=14)
price_data.reset_index(drop=True, inplace=True)

# merge price_data with definitive_dataset on 'date' and 'permno'
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', 'rsi']], on=['date', 'permno'], how='left')

  price_data = price_data.groupby('permno').apply(calculate_RSI, price_col='adj_prc', window=14)


In [29]:
# add prc_adj and prc to definitive_dataset
definitive_dataset = pd.merge(definitive_dataset, price_data[['date', 'permno', 'adj_prc', 'prc']], on=['date', 'permno'], how='left')

In [30]:
# Run .diff() on columns_to_drop
definitive_dataset.rename(columns={'adj_prc': 'prc_adj'}, inplace=True)
definitive_dataset[columns_to_drop] = definitive_dataset[columns_to_drop].diff()

In [31]:
def diff_columns(group, columns_to_diff, key='permno'):
    group.sort_values('date', inplace=True)
    group[columns_to_diff] = group[columns_to_diff].diff()
    return group

In [32]:
definitive_dataset = definitive_dataset.groupby('permno').apply(diff_columns, columns_to_diff=columns_to_drop)
definitive_dataset.reset_index(drop=True, inplace=True)

  definitive_dataset = definitive_dataset.groupby('permno').apply(diff_columns, columns_to_diff=columns_to_drop)


In [37]:
price_data[price_data['permno'] == 93436]

Unnamed: 0,date,permno,prc,cfacpr,cfacshr,divamt,adj_prc,adj_prc_w_dividend,Ticker,MACD_index,rsi
2672571,2010-06-29,93436,23.89000,15.0,15.0,0.0,1.592667,1.592667,TSLA,0.000000,
2672572,2010-06-29,93436,23.89000,15.0,15.0,0.0,1.592667,1.592667,TSLA,0.000000,
2672573,2010-06-29,93436,23.89000,15.0,15.0,0.0,1.592667,1.592667,TSLA,0.000000,
2672574,2010-06-30,93436,23.83000,15.0,15.0,0.0,1.588667,1.588667,TSLA,-0.000064,0.000000
2672575,2010-06-30,93436,23.83000,15.0,15.0,0.0,1.588667,1.588667,TSLA,-0.000164,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
2675492,2021-12-27,93436,1093.93994,3.0,3.0,0.0,364.646647,364.646647,TSLA,-5.787332,67.448767
2675493,2021-12-28,93436,1088.46997,3.0,3.0,0.0,362.823323,362.823323,TSLA,-4.808797,65.954489
2675494,2021-12-29,93436,1086.18994,3.0,3.0,0.0,362.063313,362.063313,TSLA,-3.664446,65.259141
2675495,2021-12-30,93436,1070.33997,3.0,3.0,0.0,356.779990,356.779990,TSLA,-2.552095,60.170766


In [45]:
definitive_dataset.dropna(inplace=True)

In [47]:
definitive_dataset.to_csv('../../data/definitive_dataset.csv', index=False)