In [22]:
import pandas as pd 
import numpy as np 
from fractional_differentiation import find_stat_series
import itertools
from statsmodels.tsa.stattools import adfuller

# Get the two datasets

In [2]:
data_non_stat = pd.read_csv('../../data/dataset_non_stat.csv')

In [3]:
data_non_stat.columns

Index(['date', 'permno', 'CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil',
       'pe_exi', 'pe_inc', 'ps', 'pcf', 'dpr', 'npm', 'opmbd', 'opmad', 'gpm',
       'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq',
       'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat',
       'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap',
       'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act',
       'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt',
       'lt_debt', 'profit_lct', 'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent',
       'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov',
       'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'inv_turn',
       'at_turn', 'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'prc', 'vol',
       'ret', 'retx', 'mktcap', 'prc_adj', 'naics_processed',
       'ret_industry_tot', 'ret_industry_relative', '

In [4]:
data_stat = pd.read_csv('../../data/datasetlabel.csv')  

Get the columns that are missing from the stationarized dataset

In [34]:
missing_cols = set(data_non_stat.columns).difference(set(data_stat.columns))
data_to_stationarize = data_non_stat[['permno', 'date'] + list(missing_cols)]

data_to_stationarize.head()

Unnamed: 0,permno,date,profit_lct,curr_ratio,intcov_ratio,pretret_earnat,inv_turn,int_totdebt,12_month_return,efftax,...,intcov,ocf_lct,int_debt,quick_ratio,rect_act,invt_act,dpr,pretret_noa,cash_ratio,curr_debt
0,10078,2000-02-01,0.988,2.071,2433.28,0.306,18.324,0.028,,0.358,...,1528.9,0.868,,1.951,0.368,0.058,0.0,0.509,0.9,0.811
1,85072,2000-02-01,0.884,2.056,77.617,0.258,2.409,0.034,,0.407,...,33.82,0.227,0.125,0.854,0.237,0.585,0.0,0.395,0.114,0.748
2,70536,2000-02-01,2.043,5.658,263.685,0.172,0.914,0.047,,0.314,...,181.812,1.673,1.903,2.452,0.304,0.567,0.188,0.192,0.345,0.693
3,16432,2000-02-01,0.202,1.342,3.563,0.066,4.769,0.065,,0.276,...,2.979,0.271,0.114,0.745,0.435,0.445,0.572,0.106,0.067,0.5
4,85035,2000-02-01,2.111,5.174,11.032,0.262,4.208,0.133,,0.2,...,10.073,0.735,0.172,4.163,0.18,0.195,0.0,0.307,3.197,0.694


In [28]:
len(missing_cols)

19

# Stationarize the subset of columns

In [35]:
missing_data_stationary = data_to_stationarize.copy()

diffs = np.linspace(0.05, 1, 40)

missing_data_stationary.shape

(3484790, 21)

In [None]:
from ipywidgets import IntProgress
from IPython.display import display

permnos = missing_data_stationary.index
features = missing_data_stationary.columns.drop(['date', 'permno'])
print(features)
# just to display progress
f = IntProgress(min=0, max=np.prod([len(features), len(permnos)]))
display(f)

# iterate through all permnos and features
for permno, feature in itertools.product(permnos, features):
    f.value += 1

    # select the data relative to the permno and feature
    mask = data_to_stationarize["permno"] == permno
    data = data_to_stationarize.loc[mask, [feature]]
    original_index = data.index
    data = data.dropna()

    # handle features which are empty
    if data.empty:
        print(f"there is no data for {permno} - {feature}")
        continue

    if np.max(data) - np.min(data) < 1e-6:
        print(f"there is no variation in {permno} - {feature}")
        continue

    print(permno, feature)
    print("-----")
    # check if the series is stationary
    try:
        if adfuller(data, regression="ct")[1] > 0.01:
            stat_series = find_stat_series(data, diffs=diffs)
            stat_series = stat_series.reindex(original_index)
            missing_data_stationary.loc[mask, feature] = stat_series.values

    except Exception as e:
        print(f"error in {permno} - {feature}")
        print(e)
        continue

Index(['profit_lct', 'curr_ratio', 'intcov_ratio', 'pretret_earnat',
       'inv_turn', 'int_totdebt', '12_month_return', 'efftax',
       '3_month_return', 'intcov', 'ocf_lct', 'int_debt', 'quick_ratio',
       'rect_act', 'invt_act', 'dpr', 'pretret_noa', 'cash_ratio',
       'curr_debt'],
      dtype='object')


IntProgress(value=0, max=66211010)

there is no data for 0 - profit_lct
there is no data for 0 - curr_ratio
there is no data for 0 - intcov_ratio
there is no data for 0 - pretret_earnat
there is no data for 0 - inv_turn
there is no data for 0 - int_totdebt
there is no data for 0 - 12_month_return
there is no data for 0 - efftax
there is no data for 0 - 3_month_return
there is no data for 0 - intcov
there is no data for 0 - ocf_lct
there is no data for 0 - int_debt
there is no data for 0 - quick_ratio
there is no data for 0 - rect_act
there is no data for 0 - invt_act
there is no data for 0 - dpr
there is no data for 0 - pretret_noa
there is no data for 0 - cash_ratio
there is no data for 0 - curr_debt
there is no data for 1 - profit_lct
there is no data for 1 - curr_ratio
there is no data for 1 - intcov_ratio
there is no data for 1 - pretret_earnat
there is no data for 1 - inv_turn
there is no data for 1 - int_totdebt
there is no data for 1 - 12_month_return
there is no data for 1 - efftax
there is no data for 1 - 3_mont

In [None]:
find_stat_series[data_to_stationarize['profit_lct']]

# Merge the two dataframes

In [14]:
data_stat.groupby('divyield').count()['permno']

divyield
-0.048688     1
-0.035120     1
-0.029468     1
-0.028288     1
-0.026635     1
             ..
 0.292069     1
 0.295038     1
 0.301288     1
 0.302000    41
 0.527000    21
Name: permno, Length: 328332, dtype: int64

In [9]:
data_non_stat['divyield'].unique()

array([0.    , 0.0131, 0.0525, ..., 0.308 , 0.0825, 0.0868])

# Flag for S&P500

In [165]:
constituents = pd.read_csv(
    "../../data/sp500_constituents_2010.csv",
    index_col=0,
    parse_dates=["from_date", "thru_date"],
)
constituents.head()

Unnamed: 0_level_0,from_date,thru_date
permno,Unnamed: 1_level_1,Unnamed: 2_level_1
26825,1961-04-26,2023-12-29
46877,1975-06-12,2018-01-02
24109,1951-09-06,2023-12-29
25320,1957-03-01,2023-12-29
70519,1988-05-26,2023-12-29


In [166]:
# check that the permnos in the two DataFrames are the same
print(set(prices["permno"].unique()) - set(constituents.index.unique()))
print(set(constituents.index.unique()) - set(prices["permno"].unique()))

set()
set()


In [170]:
prices["in_sp500"] = 0

permnos = prices["permno"].unique()


def set_in_sp500(permno, from_date, thru_date):
    mask = (
        (prices["permno"] == permno)
        & (prices["date"] >= from_date)
        & (prices["date"] <= thru_date)
    )
    prices.loc[mask, "in_sp500"] = 1


for permno in permnos:

    row = constituents.loc[permno]

    # if there is a single observation process it normally
    if isinstance(row, pd.Series):
        from_date = row.from_date
        thru_date = row.thru_date

        set_in_sp500(permno, from_date, thru_date)

    # if the stock went in and out of the S&P, then process each row separately
    elif isinstance(row, pd.DataFrame):
        for row in row.itertuples():
            from_date = row.from_date
            thru_date = row.thru_date

            set_in_sp500(permno, from_date, thru_date)

In [175]:
prices.tail()

Unnamed: 0,permno,date,nameendt,shrcd,exchcd,siccd,ticker,comnam,tsymbol,naics,...,mktcap,naics_processed,industry_return,industry_relative_return,ret_industry_tot,ret_industry_relative,prc_adj,MACD_index,rsi,in_sp500
3892525,93436,2023-12-22,,11,3,9999,TSLA,TESLA INC,TSLA,336110.0,...,802804700.0,33.0,0.003517,-0.22048,0.003517,-0.22048,252.53999,4.42123,58.473892,1
3892526,93436,2023-12-26,,11,3,9999,TSLA,TESLA INC,TSLA,336110.0,...,815742900.0,33.0,0.315202,0.144926,0.315202,0.144926,256.60999,4.636162,63.398606,1
3892527,93436,2023-12-27,,11,3,9999,TSLA,TESLA INC,TSLA,336110.0,...,831097100.0,33.0,0.106531,0.43104,0.106531,0.43104,261.44,4.915272,68.511971,1
3892528,93436,2023-12-28,,11,3,9999,TSLA,TESLA INC,TSLA,336110.0,...,804839200.0,33.0,0.008914,-0.930388,0.008914,-0.930388,253.17999,5.076989,53.706654,1
3892529,93436,2023-12-29,,11,3,9999,TSLA,TESLA INC,TSLA,336110.0,...,789898300.0,33.0,-0.221914,-0.319851,-0.221914,-0.319851,248.48,5.070092,47.033612,1


Some sanity checks


In [173]:
prices["in_sp500"].sum() / prices.shape[0]

0.6985015602803338