In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import scipy
import talib

# Kept last so that any names it exports shadow earlier imports exactly as before.
from alpha101.Alpha_code_1 import *
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# --- What to process -------------------------------------------------------
# Trading pairs to build features for; each name is used both as a sub-folder
# and as part of the input/output file names.
target_coin_list = [
    'BTC-USDT',
    'ETH-USDT',
]
# Candle interval tag as it appears in the input file names.
interval = '5min'
# --- Where to read from / write to -----------------------------------------
# Root folder holding one sub-folder of raw candle CSVs per trading pair.
root_file_path = './data/binance_data/spot_data'
# Folder that receives the feature-enriched tables.
outfile_path = './data_with_features'

In [3]:
def _add_talib_features(df):
    """Add the TA-Lib indicator columns to ``df`` in place.

    ``df`` must already carry the ``open``, ``high``, ``low``, ``close`` and
    ``volume`` columns. Columns are appended in a fixed order so the layout of
    the written file stays stable. All inputs are handed to TA-Lib as pandas
    Series (instead of a mix of Series and raw ndarrays) so every call is
    written the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle table to enrich; modified in place.
    """
    open_, high, low = df['open'], df['high'], df['low']
    close, volume = df['close'], df['volume']

    # Parabolic SAR settings. The previous code passed 0 for every
    # acceleration argument (the placeholder values shown in the TA-Lib
    # function listing). With a zero acceleration factor the stop never
    # accelerates towards price, so the indicator loses its "parabolic"
    # behaviour. These are TA-Lib's documented defaults.
    sar_step, sar_max = 0.02, 0.2

    df['RSI'] = talib.RSI(close, timeperiod=12)  # Relative Strength Index
    df['MOM'] = talib.MOM(close, timeperiod=5)  # Momentum
    df['EMA7'] = talib.EMA(close, timeperiod=7)  # EMA 7
    df['EMA21'] = talib.EMA(close, timeperiod=21)  # EMA 21
    df['ATR'] = talib.ATR(high, low, close, timeperiod=14)  # Average True Range
    df['TSF'] = talib.TSF(close, timeperiod=14)  # Time Series Forecast
    df['Boll_upper'], df['Boll_middle'], df['Boll_lower'] = talib.BBANDS(
        close, timeperiod=5, matype=talib.MA_Type.EMA)
    df['DEMA'] = talib.DEMA(close, timeperiod=30)
    df['KAMA'] = talib.KAMA(close, timeperiod=30)
    df['MIDPOINT'] = talib.MIDPOINT(close, timeperiod=14)
    df['SAR'] = talib.SAR(high, low, acceleration=sar_step, maximum=sar_max)
    # NOTE(review): vfactor=0 is kept as in the original notebook; TA-Lib's
    # documented default is 0.7 - confirm that 0 is intended here.
    df['T3'] = talib.T3(close, timeperiod=5, vfactor=0)
    df['TEMA'] = talib.TEMA(close, timeperiod=30)
    df['SAREXT'] = talib.SAREXT(high, low, startvalue=0, offsetonreverse=0,
                                accelerationinitlong=sar_step, accelerationlong=sar_step,
                                accelerationmaxlong=sar_max,
                                accelerationinitshort=sar_step, accelerationshort=sar_step,
                                accelerationmaxshort=sar_max)
    df['NATR'] = talib.NATR(high, low, close, timeperiod=14)
    df['AD'] = talib.AD(high, low, close, volume)
    df['ADOSC'] = talib.ADOSC(high, low, close, volume, fastperiod=3, slowperiod=10)
    df['OBV'] = talib.OBV(close, volume)
    df['HT_DCPERIOD'] = talib.HT_DCPERIOD(close)
    df['HT_DCPHASE'] = talib.HT_DCPHASE(close)
    df['HT_TRENDMODE'] = talib.HT_TRENDMODE(close)
    df['AVGPRICE'] = talib.AVGPRICE(open_, high, low, close)
    df['MEDPRICE'] = talib.MEDPRICE(high, low)
    df['TYPPRICE'] = talib.TYPPRICE(high, low, close)
    df['WCLPRICE'] = talib.WCLPRICE(high, low, close)
    df['ADXR'] = talib.ADXR(high, low, close, timeperiod=14)
    df['APO'] = talib.APO(close, fastperiod=12, slowperiod=26, matype=0)
    df['AROONOSC'] = talib.AROONOSC(high, low, timeperiod=14)
    df['BOP'] = talib.BOP(open_, high, low, close)
    df['CCI'] = talib.CCI(high, low, close, timeperiod=14)
    df['CMO'] = talib.CMO(close, timeperiod=14)
    df['DX'] = talib.DX(high, low, close, timeperiod=14)
    df['MFI'] = talib.MFI(high, low, close, volume, timeperiod=14)
    df['MINUS_DI'] = talib.MINUS_DI(high, low, close, timeperiod=14)
    df['MINUS_DM'] = talib.MINUS_DM(high, low, timeperiod=14)
    df['PLUS_DI'] = talib.PLUS_DI(high, low, close, timeperiod=14)
    df['PLUS_DM'] = talib.PLUS_DM(high, low, timeperiod=14)
    df['PPO'] = talib.PPO(close, fastperiod=12, slowperiod=26, matype=0)
    df['ROC'] = talib.ROC(close, timeperiod=10)
    df['TRIX'] = talib.TRIX(close, timeperiod=30)
    df['ULTOSC'] = talib.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14,
                                timeperiod3=28)
    df['WILLR'] = talib.WILLR(high, low, close, timeperiod=14)
    df['BETA'] = talib.BETA(high, low, timeperiod=5)
    df['CORREL'] = talib.CORREL(high, low, timeperiod=30)


# Create the output folder before doing any work, so a missing directory does
# not surface only after the features have been computed.
os.makedirs(outfile_path, exist_ok=True)

# Build one feature table per trading pair and write it to outfile_path.
# After the loop, `df` still holds the table of the last pair processed.
for coin in target_coin_list:
    df = pd.read_csv(f'{root_file_path}/{coin}/{coin}({interval}).csv')
    # Overwrite the header with the names used below; this raises if the file
    # does not have exactly seven columns.
    df.columns = ['datetime', 'open', 'high', 'low', 'close', 'volume', 'amount']
    # Relative change of the close versus the previous row.
    df['price_change_pct'] = (df['close'] - df['close'].shift(1)) / df['close'].shift(1)
    # Classification target: +1 if the NEXT row's change is positive, else -1
    # (a flat next row is labelled -1). The last row has no successor and also
    # gets -1 here, but it is removed by dropna() below because its next_close
    # is NaN.
    df['price_direction'] = np.where(df['price_change_pct'].shift(-1) > 0, 1, -1)
    # Regression target: the next row's close.
    df['next_close'] = df['close'].shift(-1)
    # Technical indicators from TA-Lib.
    _add_talib_features(df)
    print('coming to alpha101')
    # Alpha101 features - not all 101, only a selection. get_alpha comes from
    # the alpha101 star import; its return value is unused, so it presumably
    # adds its columns to df in place - verify against that module.
    get_alpha(df)
    # Treat infinities as missing, then drop every row with any missing value
    # (indicator warm-up rows and the final row without a next_close).
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    # The file name deliberately stays without an extension and the index is
    # still written, so existing readers of these files keep working.
    df.to_csv(f'{outfile_path}/{coin}_{interval}_data_with_features')

coming to alpha101
coming to alpha101


In [4]:
# `df` is whatever the feature loop above bound last, i.e. the table built for
# the final entry of target_coin_list after rows with inf/NaN were dropped.
df

Unnamed: 0,datetime,open,high,low,close,volume,amount,price_change_pct,price_direction,next_close,...,alpha005,alpha006,alpha007,alpha008,alpha011,alpha015,alpha016,alpha018,alpha026,alpha040
88,2017-08-17 19:25:00,308.92,310.40,308.92,310.40,7.98512,2.478159e+03,0.004791,1,310.51,...,-0.238242,-0.226292,-1.0,-0.749131,0.402433,-0.731074,-0.277044,-0.765921,-0.126745,-0.037573
89,2017-08-17 19:30:00,310.40,310.68,310.40,310.51,8.22478,2.554476e+03,0.000354,-1,309.56,...,-0.239286,-0.190266,-1.0,-0.784843,0.422744,-0.753835,-0.282891,-0.632002,-0.186816,-0.015471
90,2017-08-17 19:35:00,310.51,310.51,308.39,309.56,58.62900,1.814294e+04,-0.003059,-1,309.56,...,-0.239771,-0.292496,-34.0,-0.632376,0.450052,-0.781421,-0.467189,-0.240421,-0.186816,-0.051973
91,2017-08-17 19:40:00,309.56,309.56,309.56,309.56,8.65148,2.678152e+03,0.000000,-1,309.56,...,-0.239127,0.040401,-1.0,-0.543286,0.415468,-0.968873,-0.384758,-0.626026,-0.186816,0.085511
92,2017-08-17 19:45:00,309.56,309.56,308.39,309.56,9.27844,2.865200e+03,0.000000,-1,308.95,...,-0.239507,0.061365,-1.0,-0.614842,0.415136,-1.913946,-0.495000,-0.626961,0.123282,0.091463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556259,2022-12-06 20:35:00,1258.40,1260.00,1258.20,1259.22,589.68630,7.425671e+05,0.000644,-1,1258.76,...,-0.220776,-0.138942,-1.0,-0.743799,0.588357,-1.554586,-0.413506,-0.712491,-0.042220,-0.080390
556260,2022-12-06 20:40:00,1259.22,1259.22,1258.00,1258.76,883.95150,1.112741e+06,-0.000365,-1,1258.25,...,-0.220932,-0.124593,-1.0,-0.679953,0.948762,-1.288754,-0.570058,-0.401089,-0.042220,-0.048374
556261,2022-12-06 20:45:00,1258.76,1259.44,1258.04,1258.25,1052.99700,1.325388e+06,-0.000405,1,1258.74,...,-0.220908,-0.242061,-1.0,-0.779173,0.730312,-0.720155,-0.490856,-0.367612,-0.042220,-0.087033
556262,2022-12-06 20:50:00,1258.25,1258.76,1257.46,1258.74,1435.28650,1.805731e+06,0.000389,-1,1257.79,...,-0.220992,-0.301781,-30.0,-0.571694,1.124784,-0.440651,-0.189488,-0.662277,0.097312,-0.099674
