# Price Direction Model
The goal with this model is to predict the price direction in a 3h timespan, on a 45 min timeframe.
I will test it on ADA/USDT trading pair.

In [57]:
import pandas as pd
import numpy as np
import talib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("../Data/ADA/ADA_USDT_45m.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df.set_index("datetime", inplace=True)

df.head(5)

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-22 00:00:00,2.141,2.169,2.133,2.169,2871.3
2021-10-22 00:45:00,2.169,2.178,2.156,2.17,2385.7
2021-10-22 01:30:00,2.17,2.178,2.161,2.171,1496.3
2021-10-22 02:15:00,2.171,2.176,2.165,2.167,920.1
2021-10-22 03:00:00,2.167,2.194,2.167,2.19,3111.7


In [26]:
# 4 candlesticks = 45m * 4 = 3h
prediction_candles = 4

# set labels as nan to begin with
df["price_direction"] = np.nan

for i in range(len(df) - prediction_candles):
    current_price = df.iloc[i]['close']
    future_price = df.iloc[i + prediction_candles]['close']
    
    # Label as 1 if the price goes up, 0 if it goes down
    df.at[df.index[i], 'price_direction'] = 1 if future_price > current_price else 0

df.dropna(inplace=True)

df["price_direction"] = df["price_direction"].astype(int)

df["price_direction"].value_counts()

price_direction
0    13119
1    12061
Name: count, dtype: int64

In [27]:
# Technical indicators - used ChatGPT to add all of them to the dataset. Later, I will optimize to see which ones are best.

# Overlap studies in order of importance (tested *before* parameter optimization)
df['SAREXT'] = talib.SAREXT(df['high'], df['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df['SAR'] = talib.SAR(df['high'], df['low'], acceleration=0.02, maximum=0.2)
df['KAMA'] = talib.KAMA(df['close'], timeperiod=30)
df['SMA'] = talib.SMA(df['close'], timeperiod=30)
df['MA'] = talib.MA(df['close'], timeperiod=30)
df['TRIMA'] = talib.TRIMA(df['close'], timeperiod=30)
df['EMA'] = talib.EMA(df['close'], timeperiod=30)
_, df['FAMA'] = talib.MAMA(df['close'], fastlimit=0.5, slowlimit=0.05)
df['T3'] = talib.T3(df['close'], timeperiod=3, vfactor=0.7)
df['WMA'] = talib.WMA(df['close'], timeperiod=10)
df['DEMA'] = talib.DEMA(df['close'], timeperiod=10)
df['MAMA'], _ = talib.MAMA(df['close'], fastlimit=0.5, slowlimit=0.05)
df['TEMA'] = talib.TEMA(df['close'], timeperiod=30)
df['MIDPRICE'] = talib.MIDPRICE(df['high'], df['low'], timeperiod=14)
df['MIDPOINT'] = talib.MIDPOINT(df['close'], timeperiod=14)

# Momentum indicators in order of importance (tested *before* parameter optimization)
df['TRIX'] = talib.TRIX(df['close'], timeperiod=30)
df['ADX'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=14)
df['ADXR'] = talib.ADXR(df['high'], df['low'], df['close'], timeperiod=14)
df['ULTOSC'] = talib.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=7, timeperiod2=14, timeperiod3=28)
df['MFI'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=14)
_, _, df['MACDHISTEXT'] = talib.MACDEXT(df['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df['PLUS_DI'] = talib.PLUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['MINUS_DI'] = talib.MINUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
_, df['STOCH_SLOWD'] = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df['DX'] = talib.DX(df['high'], df['low'], df['close'], timeperiod=14)
df['PLUS_DM'] = talib.PLUS_DM(df['high'], df['low'], timeperiod=14)
df['CCI'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=14)
df['WILLR'] = talib.WILLR(df['high'], df['low'], df['close'], timeperiod=14)
_, df['MACDSIGNALEXT'], _ = talib.MACDEXT(df['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df['RSI'] = talib.RSI(df['close'], timeperiod=14)
df['CMO'] = talib.CMO(df['close'], timeperiod=14)
df['MINUS_DM'] = talib.MINUS_DM(df['high'], df['low'], timeperiod=14)
df['PPO'] = talib.PPO(df['close'], fastperiod=12, slowperiod=26, matype=0)
_, df['STOCHF_FASTD'] = talib.STOCHF(df['high'], df['low'], df['close'], fastk_period=5, fastd_period=3, fastd_matype=0)
_, df['MACDSIGNALFIX'], _ = talib.MACDFIX(df['close'], signalperiod=9)
_, df['MACDSIGNAL'], _ = talib.MACD(df['close'], fastperiod=12, slowperiod=26, signalperiod=9)
df['MACDEXT'], _, _ = talib.MACDEXT(df['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df['STOCH'], _ = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
_, _, df['MACDHISTFIX'] = talib.MACDFIX(df['close'], signalperiod=9)
df['BOP'] = talib.BOP(df['open'], df['high'], df['low'], df['close'])
df['STOCHF'], _ = talib.STOCHF(df['high'], df['low'], df['close'], fastk_period=5, fastd_period=3, fastd_matype=0)
_, _, df['MACDHIST'] = talib.MACD(df['close'], fastperiod=12, slowperiod=26, signalperiod=9)
_, df['STOCHRSI_FASTD'] = talib.STOCHRSI(df['close'], timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)

# Statistical functions in order of importance (tested *before* parameter optimization)
df['BETA'] = talib.BETA(df['high'], df['low'], timeperiod=5)
df['CORREL'] = talib.CORREL(df['high'], df['low'], timeperiod=30)
df['STDDEV'] = talib.STDDEV(df['close'], timeperiod=5, nbdev=1)
df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['close'], timeperiod=14)
df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['close'], timeperiod=14)
df['VAR'] = talib.VAR(df['close'], timeperiod=5, nbdev=1)
df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['close'], timeperiod=14)
df['TSF'] = talib.TSF(df['close'], timeperiod=14)
df['LINEARREG'] = talib.LINEARREG(df['close'], timeperiod=14)

# Volume indicators in order of importance (tested *before* parameter optimization)

df['AD'] = talib.AD(df['high'], df['low'], df['close'], df['volume'])
df['OBV'] = talib.OBV(df['close'], df['volume'])
df['ADOSC'] = talib.ADOSC(df['high'], df['low'], df['close'], df['volume'], fastperiod=3, slowperiod=10)

# Volatility indicators in order of importance (tested *before* parameter optimization)

df['NATR'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=14)
df['ATR'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=14)
df['TRANGE'] = talib.TRANGE(df['high'], df['low'], df['close'])

# Price transform in order of importance (tested *before* parameter optimization)

df['TYPPRICE'] = talib.TYPPRICE(df['high'], df['low'], df['close'])
df['WCLPRICE'] = talib.WCLPRICE(df['high'], df['low'], df['close'])
df['AVGPRICE'] = talib.AVGPRICE(df['open'], df['high'], df['low'], df['close'])
df['MEDPRICE'] = talib.MEDPRICE(df['high'], df['low'])

# Cycle indicators in order of importance (tested *before* parameter optimization)

df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['close'])
_, df['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df['close'])
df['HT_DCPHASE'] = talib.HT_DCPHASE(df['close'])
df['HT_SINE'], _ = talib.HT_SINE(df['close'])
_, df['HT_LEADSINE'] = talib.HT_SINE(df['close'])
df['HT_PHASOR_INPHASE'], _ = talib.HT_PHASOR(df['close'])
df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['close'])

# Bands in order of importance (tested *before* parameter optimization)

_, _, df['BBANDS_LOWER'] = talib.BBANDS(df['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
df['BBANDS_UPPER'], _, _ = talib.BBANDS(df['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
_, df['BBANDS_MIDDLE'], _ = talib.BBANDS(df['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)

df.dropna(inplace=True)

X = df.drop(columns=["price_direction"]) 
y = df['price_direction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# I put the model in a separate cell because for some reason, eventhough I use a random state, the datasets are different each time. 
# But not when I run the train_test_split and model in separate cells.
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation metrics
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Precision:", precision)
print("F1 score:", f1)
print("Recall:", recall)
print("Accuracy:", accuracy)

Precision: 0.7553889409559512
F1 score: 0.7128012381162945
Recall: 0.6747593135203014
Accuracy: 0.7411835026897788


## Parameter optimization for indicators

I will now try to optimize the parameters for the indicators. I have these categories:  

Overlap  
Momentum  
Volume  
Volatility  
Price transform  
Cycle  
Statistical functions  
Bollinger bands  

Price transform and cycle indicators only have ohlc, so those can't be optimized.

In [29]:
# Overlap indicators
def optimize_overlap_indicators(X_train, y_train, X_test, y_test, indicators):
    best_params = {}
    for indicator, param_grid in indicators.items():
        best_score = 0
        best_param = None
        for params in param_grid:
            df_temp = X_train.copy()
            if indicator == 'SAREXT':
                df_temp[indicator] = talib.SAREXT(df_temp['high'], df_temp['low'], **params)
            elif indicator == 'SAR':
                df_temp[indicator] = talib.SAR(df_temp['high'], df_temp['low'], **params)
            elif indicator in ['KAMA', 'SMA', 'MA', 'TRIMA', 'EMA', 'WMA', 'DEMA', 'TEMA']:
                df_temp[indicator] = getattr(talib, indicator)(df_temp['close'], **params)
            elif indicator == 'MIDPRICE':
                df_temp[indicator] = talib.MIDPRICE(df_temp['high'], df_temp['low'], **params)
            elif indicator == 'MIDPOINT':
                df_temp[indicator] = talib.MIDPOINT(df_temp['close'], **params)

            model = RandomForestClassifier(random_state=42)
            model.fit(df_temp, y_train)
            y_pred = model.predict(X_test[df_temp.columns])
            score = f1_score(y_test, y_pred, average='macro')

            if score > best_score:
                best_score = score
                best_param = params
        
        best_params[indicator] = (best_param, best_score)
        print(f'Best parameters for {indicator}: {best_param} with F1 score: {best_score}')
    return best_params

overlap_indicators = {
    'SAREXT': [{'startvalue':0, 'offsetonreverse':0,
         'accelerationinitlong': a, 'accelerationlong': a, 
         'accelerationmaxlong': m, 'accelerationinitshort': a,
         'accelerationshort': a, 'accelerationmaxshort': m}
        for a in [0.02, 0.05, 0.1] for m in [0.2, 0.5, 1]],
    'SAR': [{'acceleration': a, 'maximum': m} for a in [0.02, 0.05, 0.1] for m in [0.2, 0.5, 1]],
    'T3': [{'timeperiod': p, 'vfactor': v} for p in [3, 5, 10] for v in [0.5, 0.7, 0.9]],
    'MAMA': [{'fastlimit': f, 'slowlimit': s} for f in [0.01, 0.05, 0.1] for s in [0.01, 0.05, 0.1]],
    'FAMA': [{'fastlimit': f, 'slowlimit': s} for f in [0.01, 0.05, 0.1] for s in [0.01, 0.05, 0.1]],
    'KAMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'SMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'EMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'WMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'DEMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'TEMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'TRIMA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'MA': [{'timeperiod': p} for p in range(10, 50, 5)],
    'MIDPRICE': [{'timeperiod': p} for p in range(10, 30, 5)],
    'MIDPOINT': [{'timeperiod': p} for p in range(10, 30, 5)]
}

X_train, X_test, y_train, y_test = train_test_split(df.drop('price_direction', axis=1), df['price_direction'], test_size=0.2, random_state=42)

best_settings = optimize_overlap_indicators(X_train, y_train, X_test, y_test, overlap_indicators)

Best parameters for SAREXT: {'startvalue': 0, 'offsetonreverse': 0, 'accelerationinitlong': 0.02, 'accelerationlong': 0.02, 'accelerationmaxlong': 0.2, 'accelerationinitshort': 0.02, 'accelerationshort': 0.02, 'accelerationmaxshort': 0.2} with F1 score: 0.7291006348947482
Best parameters for SAR: {'acceleration': 0.1, 'maximum': 0.5} with F1 score: 0.7361204650897779
Best parameters for T3: {'timeperiod': 3, 'vfactor': 0.5} with F1 score: 0.7386309001098245
Best parameters for MAMA: {'fastlimit': 0.01, 'slowlimit': 0.01} with F1 score: 0.7386309001098245
Best parameters for FAMA: {'fastlimit': 0.01, 'slowlimit': 0.01} with F1 score: 0.7386309001098245
Best parameters for KAMA: {'timeperiod': 15} with F1 score: 0.7337365537111085
Best parameters for SMA: {'timeperiod': 10} with F1 score: 0.735255390812431
Best parameters for EMA: {'timeperiod': 40} with F1 score: 0.730110337415502
Best parameters for WMA: {'timeperiod': 15} with F1 score: 0.7353982868595883
Best parameters for DEMA: {'t

In [41]:
# Momentum indicators
def optimize_momentum_indicators(X_train, y_train, X_test, y_test, indicators):
    best_params = {}
    for indicator, param_grid in indicators.items():
        best_score = 0
        best_param = None
        for params in param_grid:
            df_temp = X_train.copy()
            try:
                if indicator in ['SAREXT', 'SAR', 'ADX', 'ADXR', 'PLUS_DI', 'MINUS_DI', 'DX']:
                    df_temp[indicator] = getattr(talib, indicator)(df_temp['high'], df_temp['low'], df_temp['close'], **params)
                elif indicator == 'ULTOSC':
                    df_temp[indicator] = talib.ULTOSC(df_temp['high'], df_temp['low'], df_temp['close'], **params)
                elif indicator == 'MFI':
                    df_temp[indicator] = talib.MFI(df_temp['high'], df_temp['low'], df_temp['close'], df_temp['volume'], **params)
                elif indicator in ['TRIX', 'KAMA', 'SMA', 'MA', 'TRIMA', 'EMA', 'WMA', 'DEMA', 'TEMA', 'RSI', 'CMO', 'PPO']:
                    df_temp[indicator] = getattr(talib, indicator)(df_temp['close'], **params)
                elif indicator in ['MIDPRICE', 'MIDPOINT']:
                    df_temp[indicator] = getattr(talib, indicator)(df_temp['high'], df_temp['low'], **params)

                model = RandomForestClassifier(random_state=42)
                model.fit(df_temp, y_train)
                y_pred = model.predict(X_test[df_temp.columns.intersection(X_train.columns)])
                score = f1_score(y_test, y_pred, average='macro')

                if score > best_score:
                    best_score = score
                    best_param = params
            except Exception as e:
                print(f"Error processing {indicator} with params {params}: {str(e)}")
        
        best_params[indicator] = (best_param, best_score)
        print(f'Best parameters for {indicator}: {best_param} with F1 score: {best_score}')
    return best_params

momentum_indicators = {
    'TRIX': [{'timeperiod': p} for p in range(10, 31, 5)],
    'ADX': [{'timeperiod': p} for p in range(10, 21, 2)],
    'ADXR': [{'timeperiod': p} for p in range(10, 21, 2)],
    'ULTOSC': [
        {'timeperiod1': tp1, 'timeperiod2': tp2, 'timeperiod3': tp3} 
        for tp1, tp2, tp3 in [(7, 14, 28), (6, 12, 24), (8, 16, 32)]
    ],
    'MFI': [{'timeperiod': p} for p in range(10, 21, 2)],
    'PLUS_DI': [{'timeperiod': p} for p in range(10, 21, 2)],
    'MINUS_DI': [{'timeperiod': p} for p in range(10, 21, 2)],
    'DX': [{'timeperiod': p} for p in range(10, 21, 2)],
    'CCI': [{'timeperiod': p} for p in range(10, 31, 5)],
    'WILLR': [{'timeperiod': p} for p in range(10, 21, 2)],
    'RSI': [{'timeperiod': p} for p in range(10, 21, 2)],
    'CMO': [{'timeperiod': p} for p in range(10, 21, 2)],
    'PPO': [
        {'fastperiod': fp, 'slowperiod': sp, 'matype': 0}
        for fp, sp in [(12, 26), (10, 30), (5, 35)]
    ],
    'MACDSIGNALFIX': [
        {'signalperiod': 9}, {'signalperiod': 12}, {'signalperiod': 20}
    ],
    'MACDSIGNAL': [
        {'fastperiod': 12, 'slowperiod': 26, 'signalperiod': 9},
        {'fastperiod': 5, 'slowperiod': 35, 'signalperiod': 5},
        {'fastperiod': 10, 'slowperiod': 50, 'signalperiod': 10}
    ],
    'BOP': [{}],
    'MACDHIST': [
        {'fastperiod': 12, 'slowperiod': 26, 'signalperiod': 9},
        {'fastperiod': 5, 'slowperiod': 35, 'signalperiod': 5},
        {'fastperiod': 10, 'slowperiod': 50, 'signalperiod': 10}
    ]
}

X_train, X_test, y_train, y_test = train_test_split(df.drop('price_direction', axis=1), df['price_direction'], test_size=0.2, random_state=42)

best_settings = optimize_momentum_indicators(X_train, y_train, X_test, y_test, momentum_indicators)

Best parameters for TRIX: {'timeperiod': 20} with F1 score: 0.730363325152448
Best parameters for ADX: {'timeperiod': 16} with F1 score: 0.735501326363461
Best parameters for ADXR: {'timeperiod': 10} with F1 score: 0.7304003756744095
Best parameters for ULTOSC: {'timeperiod1': 8, 'timeperiod2': 16, 'timeperiod3': 32} with F1 score: 0.736441880124659
Best parameters for MFI: {'timeperiod': 18} with F1 score: 0.7326790240722898
Best parameters for PLUS_DI: {'timeperiod': 18} with F1 score: 0.739688439906883
Best parameters for MINUS_DI: {'timeperiod': 18} with F1 score: 0.7383638486016278
Best parameters for DX: {'timeperiod': 20} with F1 score: 0.7360881463205837
Best parameters for CCI: {'timeperiod': 10} with F1 score: 0.7386309001098245
Best parameters for WILLR: {'timeperiod': 10} with F1 score: 0.7386309001098245
Best parameters for RSI: {'timeperiod': 14} with F1 score: 0.7377789174245764
Best parameters for CMO: {'timeperiod': 20} with F1 score: 0.7309119335829963
Best parameters

I had some errors with these momentum indicators:

MACDEXT  
STOCH  
STOCHF  
STOCHRSI  
PLUS_DM  
MINUS_DM  

So I will do them separately.

In [31]:
# MACDEXT
def optimize_macdext(X_train, y_train, X_test, y_test):
    param_grid = [
        {'fastperiod': fp, 'slowperiod': sp, 'signalperiod': sig, 'fastmatype': mat, 'slowmatype': mat, 'signalmatype': mat}
        for fp in [12, 5, 10]
        for sp in [26, 35, 50]
        for sig in [9, 5, 10]
        for mat in range(0, 8)
    ]
    best_score = 0
    best_params = None
    for params in param_grid:
        df_temp = X_train.copy()
        df_temp['MACDEXT'], _, _ = talib.MACDEXT(df_temp['close'], **params)
        model = RandomForestClassifier(random_state=42)
        model.fit(df_temp[['MACDEXT']], y_train)
        y_pred = model.predict(X_test[['MACDEXT']])
        score = f1_score(y_test, y_pred, average='macro')
        if score > best_score:
            best_score = score
            best_params = params

    print(f'Best parameters for MACDEXT: {best_params} with F1 score: {best_score}')
    return best_params

best_macdext = optimize_macdext(X_train, y_train, X_test, y_test)

Best parameters for MACDEXT: {'fastperiod': 10, 'slowperiod': 26, 'signalperiod': 10, 'fastmatype': 1, 'slowmatype': 1, 'signalmatype': 1} with F1 score: 0.5217865534872489


In [32]:
# PLUS_DM, MINUS_DM
def optimize_dm(X_train, y_train, X_test, y_test):
    param_grid = [{'timeperiod': tp} for tp in range(10, 21, 2)]
    best_scores = {}
    for indicator in ['PLUS_DM', 'MINUS_DM']:
        best_score = 0
        best_params = None
        for params in param_grid:
            df_temp = X_train.copy()
            df_temp[indicator] = getattr(talib, indicator)(df_temp['high'], df_temp['low'], **params)
            model = RandomForestClassifier(random_state=42)
            model.fit(df_temp[[indicator]], y_train)
            y_pred = model.predict(X_test[[indicator]])
            score = f1_score(y_test, y_pred, average='macro')
            if score > best_score:
                best_score = score
                best_params = params
        best_scores[indicator] = (best_params, best_score)
        print(f'Best parameters for {indicator}: {best_params} with F1 score: {best_score}')
    return best_scores

best_dm = optimize_dm(X_train, y_train, X_test, y_test)

Best parameters for PLUS_DM: {'timeperiod': 10} with F1 score: 0.3224892008639309
Best parameters for MINUS_DM: {'timeperiod': 10} with F1 score: 0.3224892008639309


In [40]:
from itertools import product

def optimize_stoch(X_train, y_train, X_test, y_test):
    param_grid = {
        'STOCH': [{'fastk_period': fk, 'slowk_period': sk, 'slowd_period': sd, 'slowk_matype': mat, 'slowd_matype': mat}
                  for fk, sk, sd in product([5, 14, 20], [3, 9, 14], [3, 9])
                  for mat in range(0, 8)],
        'STOCHF': [{'fastk_period': fk, 'fastd_period': fd, 'fastd_matype': mat}
                   for fk, fd in product([5, 14, 20], [3, 9])
                   for mat in range(0, 8)],
        'STOCHRSI': [{'timeperiod': tp, 'fastk_period': fk, 'fastd_period': fd, 'fastd_matype': mat}
                     for tp in [14, 20]
                     for fk, fd in product([5, 14], [3, 9])
                     for mat in range(0, 8)]
    }
    best_scores = {}
    for indicator, params_list in param_grid.items():
        best_score = 0
        best_params = None
        for params in params_list:
            df_temp = X_train.copy()
            df_test_temp = X_test.copy()
            feature = None
            if indicator == 'STOCH':
                df_temp['slowk'], df_temp['slowd'] = talib.STOCH(df_temp['high'], df_temp['low'], df_temp['close'], **params)
                df_test_temp['slowk'], df_test_temp['slowd'] = talib.STOCH(df_test_temp['high'], df_test_temp['low'], df_test_temp['close'], **params)
                feature = 'slowd' 
            elif indicator == 'STOCHF':
                df_temp['fastk'], df_temp['fastd'] = talib.STOCHF(df_temp['high'], df_temp['low'], df_temp['close'], **params)
                df_test_temp['fastk'], df_test_temp['fastd'] = talib.STOCHF(df_test_temp['high'], df_test_temp['low'], df_test_temp['close'], **params)
                feature = 'fastd'
            elif indicator == 'STOCHRSI':
                df_temp['fastk'], df_temp['fastd'] = talib.STOCHRSI(df_temp['close'], **params)
                df_test_temp['fastk'], df_test_temp['fastd'] = talib.STOCHRSI(df_test_temp['close'], **params)
                feature = 'fastd'

            model = RandomForestClassifier(random_state=42)
            model.fit(df_temp[[feature]], y_train)
            y_pred = model.predict(df_test_temp[[feature]])
            score = f1_score(y_test, y_pred, average='macro')

            if score > best_score:
                best_score = score
                best_params = params

        best_scores[indicator] = (best_params, best_score)
        print(f'Best parameters for {indicator}: {best_params} with F1 score: {best_score}')
    return best_scores

# Example of running the optimization
best_stoch = optimize_stoch(X_train, y_train, X_test, y_test)

Best parameters for STOCH: {'fastk_period': 5, 'slowk_period': 9, 'slowd_period': 9, 'slowk_matype': 4, 'slowd_matype': 4} with F1 score: 0.51700807687173
Best parameters for STOCHF: {'fastk_period': 5, 'fastd_period': 3, 'fastd_matype': 1} with F1 score: 0.5173542915912162
Best parameters for STOCHRSI: {'timeperiod': 14, 'fastk_period': 5, 'fastd_period': 9, 'fastd_matype': 0} with F1 score: 0.5123394677838156


In [None]:
# Statistical indicators
def optimize_statistical_indicators(X_train, y_train, X_test, y_test, indicators):
    best_params = {}
    for indicator, param_grid in indicators.items():
        best_score = 0
        best_param = None
        for params in param_grid:
            df_temp = X_train.copy()
            try:
                if indicator in ['BETA', 'CORREL']:
                    df_temp[indicator] = getattr(talib, indicator)(df_temp['high'], df_temp['low'], **params)
                elif indicator in ['STDDEV', 'VAR', 'LINEARREG', 'LINEARREG_ANGLE', 'LINEARREG_SLOPE', 'LINEARREG_INTERCEPT', 'TSF']:
                    df_temp[indicator] = getattr(talib, indicator)(df_temp['close'], **params)

                model = RandomForestClassifier(random_state=42)
                model.fit(df_temp, y_train)
                y_pred = model.predict(X_test[df_temp.columns.intersection(X_train.columns)])
                score = f1_score(y_test, y_pred, average='macro')

                if score > best_score:
                    best_score = score
                    best_param = params
            except Exception as e:
                print(f"Error processing {indicator} with params {params}: {str(e)}")
        
        best_params[indicator] = (best_param, best_score)
        print(f'Best parameters for {indicator}: {best_param} with F1 score: {best_score}')
    return best_params

statistical_indicators = {
    'BETA': [{'timeperiod': p} for p in range(5, 20, 3)],
    'CORREL': [{'timeperiod': p} for p in range(10, 31, 5)],
    'STDDEV': [{'timeperiod': p, 'nbdev': n} for p in range(5, 15, 2) for n in [1, 2, 3]],
    'VAR': [{'timeperiod': p, 'nbdev': n} for p in range(5, 15, 2) for n in [1, 2, 3]],
    'LINEARREG': [{'timeperiod': p} for p in range(10, 30, 5)],
    'LINEARREG_ANGLE': [{'timeperiod': p} for p in range(10, 30, 5)],
    'LINEARREG_SLOPE': [{'timeperiod': p} for p in range(10, 30, 5)],
    'LINEARREG_INTERCEPT': [{'timeperiod': p} for p in range(10, 30, 5)],
    'TSF': [{'timeperiod': p} for p in range(10, 30, 5)]
}

X_train, X_test, y_train, y_test = train_test_split(df.drop('price_direction', axis=1), df['price_direction'], test_size=0.2, random_state=42)

best_settings = optimize_statistical_indicators(X_train, y_train, X_test, y_test, statistical_indicators)

Best parameters for BETA: {'timeperiod': 5} with F1 score: 0.7348966121276552
Best parameters for CORREL: {'timeperiod': 10} with F1 score: 0.7335560640952139
Best parameters for STDDEV: {'timeperiod': 5, 'nbdev': 1} with F1 score: 0.7366880241433851
Best parameters for VAR: {'timeperiod': 5, 'nbdev': 1} with F1 score: 0.7385906675399289
Best parameters for LINEARREG: {'timeperiod': 10} with F1 score: 0.7400407430203393
Best parameters for LINEARREG_ANGLE: {'timeperiod': 25} with F1 score: 0.7381201571326026
Best parameters for LINEARREG_SLOPE: {'timeperiod': 25} with F1 score: 0.7380700177674429
Best parameters for LINEARREG_INTERCEPT: {'timeperiod': 15} with F1 score: 0.7336533960098248
Best parameters for TSF: {'timeperiod': 25} with F1 score: 0.7353658399775631


In [None]:
# Volume, volatility and BBands
def optimize_other_indicators(X_train, y_train, X_test, y_test, indicators):
    best_params = {}
    for indicator, param_grid in indicators.items():
        best_score = 0
        best_param = None
        for params in param_grid:
            df_temp = X_train.copy()
            try:
                if indicator == 'ADOSC':
                    df_temp[indicator] = talib.ADOSC(df_temp['high'], df_temp['low'], df_temp['close'], df_temp['volume'], **params)
                elif indicator in ['NATR', 'ATR']:
                    df_temp[indicator] = getattr(talib, indicator)(df_temp['high'], df_temp['low'], df_temp['close'], **params)
                elif indicator == 'BBANDS':
                    upper, middle, lower = talib.BBANDS(df_temp['close'], **params)
                    df_temp['BBANDS_UPPER'] = upper
                    df_temp['BBANDS_MIDDLE'] = middle
                    df_temp['BBANDS_LOWER'] = lower
                
                model = RandomForestClassifier(random_state=42)
                model.fit(df_temp, y_train)
                y_pred = model.predict(X_test[df_temp.columns.intersection(X_train.columns)])
                score = f1_score(y_test, y_pred, average='macro')

                if score > best_score:
                    best_score = score
                    best_param = params
            except Exception as e:
                print(f"Error processing {indicator} with params {params}: {str(e)}")
        
        best_params[indicator] = (best_param, best_score)
        print(f'Best parameters for {indicator}: {best_param} with F1 score: {best_score}')
    return best_params

other_indicators = {
    'ADOSC': [{'fastperiod': fp, 'slowperiod': sp} for fp in range(3, 10, 2) for sp in range(10, 20, 5)],
    'NATR': [{'timeperiod': p} for p in range(10, 21, 2)],
    'ATR': [{'timeperiod': p} for p in range(10, 21, 2)],
    'BBANDS': [
        {'timeperiod': tp, 'nbdevup': nu, 'nbdevdn': nd, 'matype': 0}
        for tp in [5, 10, 15] for nu, nd in [(2, 2), (2.5, 2.5)]
    ]
}

X_train, X_test, y_train, y_test = train_test_split(df.drop('price_direction', axis=1), df['price_direction'], test_size=0.2, random_state=42)

best_settings = optimize_other_indicators(X_train, y_train, X_test, y_test, other_indicators)

Best parameters for ADOSC: {'fastperiod': 5, 'slowperiod': 10} with F1 score: 0.7370774953486415
Best parameters for NATR: {'timeperiod': 16} with F1 score: 0.7428662130052404
Best parameters for ATR: {'timeperiod': 12} with F1 score: 0.7376645618833119
Best parameters for BBANDS: {'timeperiod': 10, 'nbdevup': 2.5, 'nbdevdn': 2.5, 'matype': 0} with F1 score: 0.7266960312717823


# Results

#### **Overlap indicator parameters**

Best parameters for SAREXT: {'startvalue': 0, 'offsetonreverse': 0, 'accelerationinitlong': 0.02, 'accelerationlong': 0.02, 'accelerationmaxlong': 0.2, 'accelerationinitshort': 0.02, 'accelerationshort': 0.02, 'accelerationmaxshort': 0.2} with F1 score: 0.7291006348947482  
Best parameters for SAR: {'acceleration': 0.1, 'maximum': 0.5} with F1 score: 0.7361204650897779  
Best parameters for T3: {'timeperiod': 3, 'vfactor': 0.5} with F1 score: 0.7386309001098245  
Best parameters for MAMA: {'fastlimit': 0.01, 'slowlimit': 0.01} with F1 score: 0.7386309001098245  
Best parameters for FAMA: {'fastlimit': 0.01, 'slowlimit': 0.01} with F1 score: 0.7386309001098245  
Best parameters for KAMA: {'timeperiod': 15} with F1 score: 0.7337365537111085  
Best parameters for SMA: {'timeperiod': 10} with F1 score: 0.735255390812431  
Best parameters for EMA: {'timeperiod': 40} with F1 score: 0.730110337415502  
Best parameters for WMA: {'timeperiod': 15} with F1 score: 0.7353982868595883  
Best parameters for DEMA: {'timeperiod': 30} with F1 score: 0.7390822193474198  
Best parameters for TEMA: {'timeperiod': 35} with F1 score: 0.7332720137355899  
Best parameters for TRIMA: {'timeperiod': 45} with F1 score: 0.7392397076195109  
Best parameters for MA: {'timeperiod': 10} with F1 score: 0.7377270376006468  
Best parameters for MIDPRICE: {'timeperiod': 20} with F1 score: 0.7331641263381782  
Best parameters for MIDPOINT: {'timeperiod': 20} with F1 score: 0.7388217108911425  

#### **Momentum indicator parameters**

Best parameters for TRIX: {'timeperiod': 20} with F1 score: 0.730363325152448  
Best parameters for ADX: {'timeperiod': 16} with F1 score: 0.735501326363461  
Best parameters for ADXR: {'timeperiod': 10} with F1 score: 0.7304003756744095  
Best parameters for ULTOSC: {'timeperiod1': 8, 'timeperiod2': 16, 'timeperiod3': 32} with F1 score: 0.736441880124659  
Best parameters for MFI: {'timeperiod': 18} with F1 score: 0.7326790240722898  
Best parameters for PLUS_DI: {'timeperiod': 18} with F1 score: 0.739688439906883  
Best parameters for MINUS_DI: {'timeperiod': 18} with F1 score: 0.7383638486016278  
Best parameters for DX: {'timeperiod': 20} with F1 score: 0.7360881463205837  
Best parameters for CCI: {'timeperiod': 10} with F1 score: 0.7386309001098245  
Best parameters for WILLR: {'timeperiod': 10} with F1 score: 0.7386309001098245  
Best parameters for RSI: {'timeperiod': 14} with F1 score: 0.7377789174245764  
Best parameters for CMO: {'timeperiod': 20} with F1 score: 0.7309119335829963  
Best parameters for PPO: {'fastperiod': 5, 'slowperiod': 35, 'matype': 0} with F1 score: 0.7365992093783829  
Best parameters for MACDEXT: {'fastperiod': 10, 'slowperiod': 26, 'signalperiod': 10, 'fastmatype': 1, 'slowmatype': 1, 'signalmatype': 1} with F1 score: 0.5217865534872489  (tested separately)  
Best parameters for PLUS_DM: {'timeperiod': 10} with F1 score: 0.3224892008639309 (tested separately)  
Best parameters for MINUS_DM: {'timeperiod': 10} with F1 score: 0.3224892008639309 (tested separately)  

#### **Statistical indicator parameters**

Best parameters for BETA: {'timeperiod': 5} with F1 score: 0.7348966121276552  
Best parameters for CORREL: {'timeperiod': 10} with F1 score: 0.7335560640952139  
Best parameters for STDDEV: {'timeperiod': 5, 'nbdev': 1} with F1 score: 0.7366880241433851  
Best parameters for VAR: {'timeperiod': 5, 'nbdev': 1} with F1 score: 0.7385906675399289  
Best parameters for LINEARREG: {'timeperiod': 10} with F1 score: 0.7400407430203393  
Best parameters for LINEARREG_ANGLE: {'timeperiod': 25} with F1 score: 0.7381201571326026  
Best parameters for LINEARREG_SLOPE: {'timeperiod': 25} with F1 score: 0.7380700177674429  
Best parameters for LINEARREG_INTERCEPT: {'timeperiod': 15} with F1 score: 0.7336533960098248  
Best parameters for TSF: {'timeperiod': 25} with F1 score: 0.7353658399775631  

#### **Volume, volatility, and Bollinger Bands parameters**

Best parameters for ADOSC: {'fastperiod': 5, 'slowperiod': 10} with F1 score: 0.7370774953486415  
Best parameters for NATR: {'timeperiod': 16} with F1 score: 0.7428662130052404  
Best parameters for ATR: {'timeperiod': 12} with F1 score: 0.7376645618833119  
Best parameters for BBANDS: {'timeperiod': 10, 'nbdevup': 2.5, 'nbdevdn': 2.5, 'matype': 0} with F1 score: 0.7266960312717823  

Next I will try these parameters and compare the results. With the original settings, these were the metrics:

Precision: 0.7553889409559512  
F1 score: 0.7128012381162945  
Recall: 0.6747593135203014  
Accuracy: 0.7411835026897788  

In [43]:
df = pd.read_csv("../Data/ADA/ADA_USDT_45m.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df.set_index("datetime", inplace=True)

# 4 candlesticks = 45m * 4 = 3h
prediction_candles = 4

# set labels as nan to begin with
df["price_direction"] = np.nan

for i in range(len(df) - prediction_candles):
    current_price = df.iloc[i]['close']
    future_price = df.iloc[i + prediction_candles]['close']
    
    # Label as 1 if the price goes up, 0 if it goes down
    df.at[df.index[i], 'price_direction'] = 1 if future_price > current_price else 0

df.dropna(inplace=True)

df["price_direction"] = df["price_direction"].astype(int)

df["price_direction"].value_counts()

df.sample(5)

Unnamed: 0_level_0,open,high,low,close,volume,price_direction
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-11-29 16:30:00,0.3827,0.3846,0.3804,0.3837,552014.05,1
2023-07-29 04:30:00,0.3118,0.3135,0.311,0.3118,738043.58,0
2022-06-06 19:30:00,0.598,0.606,0.594,0.599,116061.58,1
2022-02-03 00:45:00,1.033,1.035,1.029,1.031,27022.12,0
2023-03-19 07:30:00,0.3389,0.3395,0.3378,0.338,56921.96,1


In [53]:
# Overlap studies
df['SAREXT'] = talib.SAREXT(df['high'], df['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df['SAR'] = talib.SAR(df['high'], df['low'], acceleration=0.1, maximum=0.5)
df['KAMA'] = talib.KAMA(df['close'], timeperiod=15)
df['SMA'] = talib.SMA(df['close'], timeperiod=10)
df['MA'] = talib.MA(df['close'], timeperiod=10)
df['TRIMA'] = talib.TRIMA(df['close'], timeperiod=45)
df['EMA'] = talib.EMA(df['close'], timeperiod=40)
_, df['FAMA'] = talib.MAMA(df['close'], fastlimit=0.01, slowlimit=0.01)
df['T3'] = talib.T3(df['close'], timeperiod=3, vfactor=0.5)
df['WMA'] = talib.WMA(df['close'], timeperiod=15)
df['DEMA'] = talib.DEMA(df['close'], timeperiod=30)
df['MAMA'], _ = talib.MAMA(df['close'], fastlimit=0.01, slowlimit=0.01)
df['TEMA'] = talib.TEMA(df['close'], timeperiod=35)
df['MIDPRICE'] = talib.MIDPRICE(df['high'], df['low'], timeperiod=20)
df['MIDPOINT'] = talib.MIDPOINT(df['close'], timeperiod=20)

# Momentum indicators
df['TRIX'] = talib.TRIX(df['close'], timeperiod=20)
df['ADX'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=16)
df['ADXR'] = talib.ADXR(df['high'], df['low'], df['close'], timeperiod=10)
df['ULTOSC'] = talib.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=8, timeperiod2=16, timeperiod3=32)
df['MFI'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=18)
_, _, df['MACDHISTEXT'] = talib.MACDEXT(df['close'], fastperiod=10, fastmatype=1, slowperiod=26,
                                        slowmatype=1, signalperiod=10, signalmatype=1)
df['PLUS_DI'] = talib.PLUS_DI(df['high'], df['low'], df['close'], timeperiod=18)
df['MINUS_DI'] = talib.MINUS_DI(df['high'], df['low'], df['close'], timeperiod=18)
_, df['STOCH_SLOWD'] = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df['DX'] = talib.DX(df['high'], df['low'], df['close'], timeperiod=20)
df['PLUS_DM'] = talib.PLUS_DM(df['high'], df['low'], timeperiod=10)
df['CCI'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=10)
df['WILLR'] = talib.WILLR(df['high'], df['low'], df['close'], timeperiod=10)
df['RSI'] = talib.RSI(df['close'], timeperiod=14)
df['CMO'] = talib.CMO(df['close'], timeperiod=20)
df['MINUS_DM'] = talib.MINUS_DM(df['high'], df['low'], timeperiod=10)
df['PPO'] = talib.PPO(df['close'], fastperiod=5, slowperiod=35, matype=0)
df['BOP'] = talib.BOP(df['open'], df['high'], df['low'], df['close'])

# Statistical functions
df['BETA'] = talib.BETA(df['high'], df['low'], timeperiod=5)
df['CORREL'] = talib.CORREL(df['high'], df['low'], timeperiod=10)
df['STDDEV'] = talib.STDDEV(df['close'], timeperiod=5, nbdev=1)
df['VAR'] = talib.VAR(df['close'], timeperiod=5, nbdev=1)
df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['close'], timeperiod=25)
df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['close'], timeperiod=25)
df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['close'], timeperiod=15)
df['TSF'] = talib.TSF(df['close'], timeperiod=25)
df['LINEARREG'] = talib.LINEARREG(df['close'], timeperiod=14)

# Volume indicators
df['AD'] = talib.AD(df['high'], df['low'], df['close'], df['volume'])
df['OBV'] = talib.OBV(df['close'], df['volume'])
df['ADOSC'] = talib.ADOSC(df['high'], df['low'], df['close'], df['volume'], fastperiod=5, slowperiod=10)

# Volatility indicators
df['NATR'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=16)
df['ATR'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=12)
df['TRANGE'] = talib.TRANGE(df['high'], df['low'], df['close'])

# Price transform
df['TYPPRICE'] = talib.TYPPRICE(df['high'], df['low'], df['close'])
df['WCLPRICE'] = talib.WCLPRICE(df['high'], df['low'], df['close'])
df['AVGPRICE'] = talib.AVGPRICE(df['open'], df['high'], df['low'], df['close'])
df['MEDPRICE'] = talib.MEDPRICE(df['high'], df['low'])

# Cycle indicators
df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['close'])
_, df['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df['close'])
df['HT_DCPHASE'] = talib.HT_DCPHASE(df['close'])
df['HT_SINE'], _ = talib.HT_SINE(df['close'])
_, df['HT_LEADSINE'] = talib.HT_SINE(df['close'])
df['HT_PHASOR_INPHASE'], _ = talib.HT_PHASOR(df['close'])
df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['close'])

# Bollinger Bands
df['BBANDS_LOWER'], df['BBANDS_MIDDLE'], df['BBANDS_UPPER'] = talib.BBANDS(df['close'], timeperiod=10, nbdevup=2.5, nbdevdn=2.5, matype=0)

df.dropna(inplace=True)

X = df.drop(columns=["price_direction"]) 
y = df['price_direction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Precision: 0.7548786292241789
F1 score: 0.7204178968884851
Recall: 0.6889661164205039
Accuracy: 0.7473835419659347


In [47]:
# As before, I put the model in a separate cell because for some reason, eventhough I use a random state, the datasets are different each time. 
# But not when I run the train_test_split and model in separate cells.
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation metrics
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Precision:", precision)
print("F1 score:", f1)
print("Recall:", recall)
print("Accuracy:", accuracy)

Precision: 0.7754060324825987
F1 score: 0.7311310435353314
Recall: 0.6916390728476821
Accuracy: 0.7540032025620497


### Result

So, from these metrics originally:

Precision: 0,7482962289868241
F1 score: 0,7207877461706783
Recall: 0,6952300548754748
Accuracy: 0,7456140350877193

To these:

Precision: 0,7754060324825987
F1 score: 0,7311310435353314
Recall: 0,6916390728476821
Accuracy: 0,7540032025620497

Approximate changes:

Precision: + 0.027
F1 score: + 0.010
Recall: - 0.004
Accuracy: + 0.008

So the changes aren't very impressive, but any improvement helps. This is probably overfitted, but still likely to perform better on a real dataset than the random numbers I put in originally.

Next I will test this model on a more recent dataset.

In [65]:
df = pd.read_csv("../Data/ADA/ADA_USDT_45m_Recent.csv")
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)

# Check datetime to ensure there's no overlap. Training set's last entry was 2023-12-18 00:00:00
df.head(5)

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-12-19 00:00:00,0.602,0.6081,0.5964,0.6075,547327.13
2023-12-19 00:45:00,0.6075,0.6081,0.5976,0.6035,445002.55
2023-12-19 01:30:00,0.6035,0.6165,0.6016,0.6138,671110.53
2023-12-19 02:15:00,0.6138,0.6138,0.6065,0.6098,504098.5
2023-12-19 03:00:00,0.6098,0.6137,0.608,0.6119,352368.51


In [66]:
# 4 candlesticks = 45m * 4 = 3h
prediction_candles = 4

# set labels as nan to begin with
df["price_direction"] = np.nan

for i in range(len(df) - prediction_candles):
    current_price = df.iloc[i]['close']
    future_price = df.iloc[i + prediction_candles]['close']
    
    # Label as 1 if the price goes up, 0 if it goes down
    df.at[df.index[i], 'price_direction'] = 1 if future_price > current_price else 0

df.dropna(inplace=True)

df["price_direction"] = df["price_direction"].astype(int)

df["price_direction"].value_counts()

price_direction
0    1691
1    1535
Name: count, dtype: int64

In [71]:
# Overlap studies
df['SAREXT'] = talib.SAREXT(df['high'], df['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df['SAR'] = talib.SAR(df['high'], df['low'], acceleration=0.1, maximum=0.5)
df['KAMA'] = talib.KAMA(df['close'], timeperiod=15)
df['SMA'] = talib.SMA(df['close'], timeperiod=10)
df['MA'] = talib.MA(df['close'], timeperiod=10)
df['TRIMA'] = talib.TRIMA(df['close'], timeperiod=45)
df['EMA'] = talib.EMA(df['close'], timeperiod=40)
_, df['FAMA'] = talib.MAMA(df['close'], fastlimit=0.01, slowlimit=0.01)
df['T3'] = talib.T3(df['close'], timeperiod=3, vfactor=0.5)
df['WMA'] = talib.WMA(df['close'], timeperiod=15)
df['DEMA'] = talib.DEMA(df['close'], timeperiod=30)
df['MAMA'], _ = talib.MAMA(df['close'], fastlimit=0.01, slowlimit=0.01)
df['TEMA'] = talib.TEMA(df['close'], timeperiod=35)
df['MIDPRICE'] = talib.MIDPRICE(df['high'], df['low'], timeperiod=20)
df['MIDPOINT'] = talib.MIDPOINT(df['close'], timeperiod=20)

# Momentum indicators
df['TRIX'] = talib.TRIX(df['close'], timeperiod=20)
df['ADX'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=16)
df['ADXR'] = talib.ADXR(df['high'], df['low'], df['close'], timeperiod=10)
df['ULTOSC'] = talib.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=8, timeperiod2=16, timeperiod3=32)
df['MFI'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=18)
_, _, df['MACDHISTEXT'] = talib.MACDEXT(df['close'], fastperiod=10, fastmatype=1, slowperiod=26,
                                        slowmatype=1, signalperiod=10, signalmatype=1)
df['PLUS_DI'] = talib.PLUS_DI(df['high'], df['low'], df['close'], timeperiod=18)
df['MINUS_DI'] = talib.MINUS_DI(df['high'], df['low'], df['close'], timeperiod=18)
_, df['STOCH_SLOWD'] = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df['DX'] = talib.DX(df['high'], df['low'], df['close'], timeperiod=20)
df['PLUS_DM'] = talib.PLUS_DM(df['high'], df['low'], timeperiod=10)
df['CCI'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=10)
df['WILLR'] = talib.WILLR(df['high'], df['low'], df['close'], timeperiod=10)
df['RSI'] = talib.RSI(df['close'], timeperiod=14)
df['CMO'] = talib.CMO(df['close'], timeperiod=20)
df['MINUS_DM'] = talib.MINUS_DM(df['high'], df['low'], timeperiod=10)
df['PPO'] = talib.PPO(df['close'], fastperiod=5, slowperiod=35, matype=0)
df['BOP'] = talib.BOP(df['open'], df['high'], df['low'], df['close'])

# Statistical functions
df['BETA'] = talib.BETA(df['high'], df['low'], timeperiod=5)
df['CORREL'] = talib.CORREL(df['high'], df['low'], timeperiod=10)
df['STDDEV'] = talib.STDDEV(df['close'], timeperiod=5, nbdev=1)
df['VAR'] = talib.VAR(df['close'], timeperiod=5, nbdev=1)
df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['close'], timeperiod=25)
df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['close'], timeperiod=25)
df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['close'], timeperiod=15)
df['TSF'] = talib.TSF(df['close'], timeperiod=25)
df['LINEARREG'] = talib.LINEARREG(df['close'], timeperiod=14)

# Volume indicators
df['AD'] = talib.AD(df['high'], df['low'], df['close'], df['volume'])
df['OBV'] = talib.OBV(df['close'], df['volume'])
df['ADOSC'] = talib.ADOSC(df['high'], df['low'], df['close'], df['volume'], fastperiod=5, slowperiod=10)

# Volatility indicators
df['NATR'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=16)
df['ATR'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=12)
df['TRANGE'] = talib.TRANGE(df['high'], df['low'], df['close'])

# Price transform
df['TYPPRICE'] = talib.TYPPRICE(df['high'], df['low'], df['close'])
df['WCLPRICE'] = talib.WCLPRICE(df['high'], df['low'], df['close'])
df['AVGPRICE'] = talib.AVGPRICE(df['open'], df['high'], df['low'], df['close'])
df['MEDPRICE'] = talib.MEDPRICE(df['high'], df['low'])

# Cycle indicators
df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['close'])
_, df['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df['close'])
df['HT_DCPHASE'] = talib.HT_DCPHASE(df['close'])
df['HT_SINE'], _ = talib.HT_SINE(df['close'])
_, df['HT_LEADSINE'] = talib.HT_SINE(df['close'])
df['HT_PHASOR_INPHASE'], _ = talib.HT_PHASOR(df['close'])
df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['close'])

# Bollinger Bands
df['BBANDS_LOWER'], df['BBANDS_MIDDLE'], df['BBANDS_UPPER'] = talib.BBANDS(df['close'], timeperiod=10, nbdevup=2.5, nbdevdn=2.5, matype=0)

df.dropna(inplace=True)

X = df.drop(columns=["price_direction"]) 
y = df['price_direction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation metrics
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Precision:", precision)
print("F1 score:", f1)
print("Recall:", recall)
print("Accuracy:", accuracy)

Precision: 0.782051282051282
F1 score: 0.7484662576687117
Recall: 0.7176470588235294
Accuracy: 0.7738970588235294


# Final results

**Evaluation metrics:**

Precision: 0.782051282051282  
F1 score: 0.7484662576687117  
Recall: 0.7176470588235294  
Accuracy: 0.7738970588235294  

**Conclusion:**

The model performed better than I anticipated and I'm happy with the result.  
The model predicted the price direction with more than 78% precision (predicted 78% of the cases where the price *increased*) which I think is excellent.  
Price direction by itself might not be so reliable, but combined with other models it could be quite useful and reliable I think.