These functions are used in the 'feature_engineer' function

To run these functions, we need variables 'data', 'factors', and 'cs_factors'

In [None]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features

In [None]:
def momentum_235(data, factors, cs_factors):
    '''
    Adds per-ticker momentum of the open price ('prcod') over 2, 3 and 5 rows.

    Momentum is the fractional change of 'prcod' relative to the value
    `periods` rows earlier within the same 'tic' group. The new columns
    ('Mom_2day', 'Mom_3day', 'Mom_5day') are written into `data` in place and
    their names are appended to both `factors` and `cs_factors`.

    Returns the same (data, factors, cs_factors) objects that were passed in.
    '''
    horizons = ((2, 'Mom_2day'), (3, 'Mom_3day'), (5, 'Mom_5day'))
    for lag, column in horizons:
        open_by_ticker = data.groupby('tic')['prcod']
        data[column] = open_by_ticker.pct_change(periods=lag)
        # Register the new column right after creating it.
        factors.append(column)
        cs_factors.append(column)
    return data, factors, cs_factors

In [None]:
def MA_510(data, factors, cs_factors):
    '''
    Adds 5- and 10-row simple moving averages of the open price ('prcod').

    Averages are computed separately for each 'tic' group. min_periods=1 is
    used, so early rows average over however many observations exist so far.
    Columns 'MA_5day' and 'MA_10day' are written into `data` in place and
    their names are appended to both `factors` and `cs_factors`.

    Returns the same (data, factors, cs_factors) objects that were passed in.
    '''
    def _trailing_mean(window):
        # Build the per-group transform for a given window length.
        return lambda prices: prices.rolling(window, min_periods=1).mean()

    data['MA_5day'] = data.groupby('tic')['prcod'].transform(_trailing_mean(5))
    data['MA_10day'] = data.groupby('tic')['prcod'].transform(_trailing_mean(10))

    for column in ('MA_5day', 'MA_10day'):
        factors.append(column)
        cs_factors.append(column)
    return data, factors, cs_factors

In [None]:
def price_vs_MA(data, factors, cs_factors):
    '''
    Adds ratios of the close ('prccd') and open ('prcod') prices to the
    5- and 10-day moving averages.

    Requires the columns 'MA_5day' and 'MA_10day' to already exist in `data`.
    Writes 'close/MA10', 'close/MA5', 'open/MA10' and 'open/MA5' into `data`
    in place and appends those names to both `factors` and `cs_factors`.

    Returns the same (data, factors, cs_factors) objects that were passed in.
    '''
    # (price column, name of ratio vs. MA_10day, name of ratio vs. MA_5day)
    ratio_specs = (
        ('prccd', 'close/MA10', 'close/MA5'),
        ('prcod', 'open/MA10', 'open/MA5'),
    )
    for price_col, vs_ma10, vs_ma5 in ratio_specs:
        price = data[price_col]
        data[vs_ma10] = price.div(data['MA_10day'])
        data[vs_ma5] = price.div(data['MA_5day'])
        for column in (vs_ma10, vs_ma5):
            factors.append(column)
            cs_factors.append(column)
    return data, factors, cs_factors

In [None]:
def STD_10(data, factors, cs_factors):
    '''
    Adds the 10-row rolling standard deviation of the open price ('prcod').

    The statistic is computed separately for each 'tic' group with
    min_periods=1, so the first row of every group is NaN (a single
    observation has no sample standard deviation). The column 'STD_10day' is
    written into `data` in place and its name is appended to both `factors`
    and `cs_factors`.

    Returns the same (data, factors, cs_factors) objects that were passed in.
    '''
    def _trailing_std(prices):
        return prices.rolling(10, min_periods=1).std()

    column = 'STD_10day'
    data[column] = data.groupby('tic')['prcod'].transform(_trailing_std)
    for registry in (factors, cs_factors):
        registry.append(column)
    return data, factors, cs_factors

In [None]:
def H_L(data, factors, cs_factors):
    '''
    Adds the daily price spread: high ('prchd') minus low ('prcld').

    The column 'H-L' is written into `data` in place and its name is appended
    to both `factors` and `cs_factors`.

    Returns the same (data, factors, cs_factors) objects that were passed in.
    '''
    column = 'H-L'
    data[column] = data['prchd'].sub(data['prcld'])
    for registry in (factors, cs_factors):
        registry.append(column)
    return data, factors, cs_factors

In [None]:
def RSI_14(data, factors, cs_factors):
    '''
    Calculates the relative strength index (RSI) of the open price ('prcod')
    using a 14-row window, separately for each 'tic'.

    RSI = 100 - 100 / (1 + avg_gain / avg_loss), where avg_gain and avg_loss
    are simple rolling means (window=14, min_periods=1) of the positive and
    negative parts of the row-to-row price change.

    Rows where RSI is undefined (e.g. the first row of each ticker, or a
    window with neither gains nor losses) are set to 0, and then every 0 in a
    ticker's RSI series is replaced by the mean of that ticker's non-zero RSI
    values.
    NOTE: this also replaces a genuinely computed RSI of exactly 0.

    Only the 'RSI' column is added to `data` (in place); intermediate series
    stay local, and missing values in other columns are left untouched.
    'RSI' is appended to both `factors` and `cs_factors`.

    Returns (data, factors, cs_factors).
    '''
    tickers = data['tic']

    def _rolling_mean_14(values):
        return values.rolling(window=14, min_periods=1).mean()

    # Keep intermediates as local Series so the caller's frame is not
    # polluted with (or stripped of) columns named 'delta', 'gain', etc.
    delta = data.groupby('tic')['prcod'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.groupby(tickers).transform(_rolling_mean_14)
    avg_loss = loss.groupby(tickers).transform(_rolling_mean_14)

    # avg_loss == 0 with avg_gain > 0 gives inf, which maps to RSI == 100.
    rsi = 100 - (100 / (1 + avg_gain / avg_loss))

    # Zero-fill RSI only; a frame-wide fillna(0) would silently erase
    # missing values in every other column.
    rsi = rsi.fillna(0)
    data['RSI'] = rsi.groupby(tickers).transform(
        lambda x: x.replace(0, x[x != 0].mean())
    )

    factors.append('RSI')
    cs_factors.append('RSI')
    return data, factors, cs_factors

In [None]:
def MACD_Line(data, factors, cs_factors):
    '''
    Calculates the Moving Average Convergence Divergence (MACD) of the open
    price ('prcod'), separately for each 'tic'.

    MACD = EMA12 - EMA26
    MACD_Signal_Line = 9-period exponential moving average of MACD

    All EMAs use adjust=False and min_periods=1. They are computed within
    each ticker so that one ticker's prices never feed another ticker's
    averages, and results are aligned on the frame's own index.

    Columns 'MACD' and 'MACD_Signal_Line' are added to `data` in place; any
    missing values in these two columns are set to 0. No other column is
    modified or required (in particular, no 'RSI' column is needed). Both
    names are appended to `factors` and `cs_factors`.

    Returns (data, factors, cs_factors).
    '''
    def _ema(span):
        # Build the per-ticker EMA transform for the given span.
        return lambda x: x.ewm(span=span, adjust=False, min_periods=1).mean()

    open_by_ticker = data.groupby('tic')['prcod']
    ema12 = open_by_ticker.transform(_ema(12))
    ema26 = open_by_ticker.transform(_ema(26))

    data['MACD'] = ema12 - ema26
    data['MACD_Signal_Line'] = data.groupby('tic')['MACD'].transform(_ema(9))

    # Zero-fill only the columns created here; a frame-wide fillna(0) would
    # silently erase missing values in every other column.
    new_columns = ['MACD', 'MACD_Signal_Line']
    data[new_columns] = data[new_columns].fillna(0)

    factors.extend(new_columns)
    cs_factors.extend(new_columns)

    return data, factors, cs_factors

In [None]:
def all_features_ta(data, factors, cs_factors):
    '''
    Calculates all features in library ta (Technical Analysis Library),
    separately for each 'tic'.

    `add_all_ta_features` is called on every ticker's rows with
    open='prcod', high='prchd', low='prcld', close='prccd', volume='cshtrd'
    and fillna=True. Every column present in the result but not in `data` is
    appended to both `factors` and `cs_factors`, in the order the columns
    appear in the result, so the factor order is reproducible between runs.

    Returns (data_out, factors, cs_factors), where data_out is the new frame
    that includes the added feature columns.
    '''
    # group_keys=False keeps the rows' own index instead of prepending 'tic'
    # as an extra index level (the default on pandas >= 2.0), which would
    # make 'tic' both an index level and a column.
    data_out = data.groupby("tic", group_keys=False).apply(
        lambda x: add_all_ta_features(
            x,
            open="prcod",
            high="prchd",
            low="prcld",
            close="prccd",
            volume="cshtrd",
            fillna=True,
        )
    )

    # Ordered, de-duplicated difference; a plain set difference would give a
    # different factor order from run to run.
    existing = set(data.columns)
    new_features = [
        col for col in dict.fromkeys(data_out.columns) if col not in existing
    ]
    factors.extend(new_features)
    cs_factors.extend(new_features)

    return data_out, factors, cs_factors

In [None]:
def feature_engineer(data, factors, cs_factors):
    '''
    Input: data, factors, cs_factors (factors that are standardized cross-sectionally)
    1. Run every feature builder defined above, in order, threading
       (data, factors, cs_factors) through each of them
    2. For every name in factors, fill remaining missing values with the
       median of that column over rows sharing the same 'datadate'
    Output: data, factors, cs_factors
    '''
    feature_steps = (
        momentum_235,
        MA_510,
        price_vs_MA,  # needs the MA columns created by MA_510
        STD_10,
        H_L,
        RSI_14,
        MACD_Line,
    )
    for step in feature_steps:
        data, factors, cs_factors = step(data, factors, cs_factors)

    def _fill_with_median(values):
        return values.fillna(values.median())

    # Fill missing data with the cross-sectional (per-date) median
    for name in factors:
        data[name] = data.groupby('datadate')[name].transform(_fill_with_median)

    return data, factors, cs_factors
