# Feature selection for signal classification
Here I will test which features are best to train the signals_classification_model with.
The goal with the signals_classification_model is to correctly identify buy signals.

In [165]:
import pandas as pd
import talib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [166]:
df = pd.read_csv("../Data/ADA_USDT_Signals.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df.set_index("datetime", inplace=True)

df.head(5)

Unnamed: 0_level_0,open,high,low,close,volume,buy_signal,sell_signal
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-22 00:00:00,2.141,2.194,2.133,2.185,10918.3,0,0
2021-10-22 04:00:00,2.185,2.202,2.185,2.198,26091.8,0,0
2021-10-22 08:00:00,2.198,2.208,2.182,2.192,24092.3,0,0
2021-10-22 12:00:00,2.192,2.201,2.147,2.151,30423.4,0,0
2021-10-22 16:00:00,2.151,2.183,2.142,2.165,17709.4,0,0


In [167]:
# Technical indicators - used ChatGPT to add all of them to the dataset. Later, I will optimize to see which ones are best.

# Overlap Studies
df['DEMA'] = talib.DEMA(df['close'], timeperiod=30)
df['EMA'] = talib.EMA(df['close'], timeperiod=30)
df['KAMA'] = talib.KAMA(df['close'], timeperiod=30)
df['MA'] = talib.MA(df['close'], timeperiod=30)
df['MAMA'], df['FAMA'] = talib.MAMA(df['close'], fastlimit=0.5, slowlimit=0.05)
df['MIDPOINT'] = talib.MIDPOINT(df['close'], timeperiod=14)
df['MIDPRICE'] = talib.MIDPRICE(df['high'], df['low'], timeperiod=14)
df['SAR'] = talib.SAR(df['high'], df['low'], acceleration=0.02, maximum=0.2)
df['SAREXT'] = talib.SAREXT(df['high'], df['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df['SMA'] = talib.SMA(df['close'], timeperiod=30)
df['T3'] = talib.T3(df['close'], timeperiod=5, vfactor=0.7)
df['TEMA'] = talib.TEMA(df['close'], timeperiod=30)
df['TRIMA'] = talib.TRIMA(df['close'], timeperiod=30)
df['WMA'] = talib.WMA(df['close'], timeperiod=30)

# Momentum Indicators
df['ADX'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=14)
df['ADXR'] = talib.ADXR(df['high'], df['low'], df['close'], timeperiod=14)
df['APO'] = talib.APO(df['close'], fastperiod=12, slowperiod=26, matype=0)
df['AROON_DOWN'], df['AROON_UP'] = talib.AROON(df['high'], df['low'], timeperiod=14)
df['AROONOSC'] = talib.AROONOSC(df['high'], df['low'], timeperiod=14)
df['BOP'] = talib.BOP(df['open'], df['high'], df['low'], df['close'])
df['CCI'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=14)
df['CMO'] = talib.CMO(df['close'], timeperiod=14)
df['DX'] = talib.DX(df['high'], df['low'], df['close'], timeperiod=14)
df['MACD'], df['MACDSIGNAL'], df['MACDHIST'] = talib.MACD(df['close'], fastperiod=12, slowperiod=26, signalperiod=9)
df['MACDEXT'], df['MACDSIGNALEXT'], df['MACDHISTEXT'] = talib.MACDEXT(df['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df['MACDFIX'], df['MACDSIGNALFIX'], df['MACDHISTFIX'] = talib.MACDFIX(df['close'], signalperiod=9)
df['MFI'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=14)
df['MINUS_DI'] = talib.MINUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['MINUS_DM'] = talib.MINUS_DM(df['high'], df['low'], timeperiod=14)
df['MOM'] = talib.MOM(df['close'], timeperiod=10)
df['PLUS_DI'] = talib.PLUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['PLUS_DM'] = talib.PLUS_DM(df['high'], df['low'], timeperiod=14)
df['PPO'] = talib.PPO(df['close'], fastperiod=12, slowperiod=26, matype=0)
df['ROC'] = talib.ROC(df['close'], timeperiod=10)
df['ROCP'] = talib.ROCP(df['close'], timeperiod=10)
df['ROCR'] = talib.ROCR(df['close'], timeperiod=10)
df['ROCR100'] = talib.ROCR100(df['close'], timeperiod=10)
df['RSI'] = talib.RSI(df['close'], timeperiod=14)
df['STOCH'], df['STOCH_SLOWD'] = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df['STOCHF'], df['STOCHF_FASTD'] = talib.STOCHF(df['high'], df['low'], df['close'], fastk_period=5, fastd_period=3, fastd_matype=0)
df['STOCHRSI'], df['STOCHRSI_FASTD'] = talib.STOCHRSI(df['close'], timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
df['TRIX'] = talib.TRIX(df['close'], timeperiod=30)
df['ULTOSC'] = talib.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=7, timeperiod2=14, timeperiod3=28)
df['WILLR'] = talib.WILLR(df['high'], df['low'], df['close'], timeperiod=14)

# Volume Indicators
df['AD'] = talib.AD(df['high'], df['low'], df['close'], df['volume'])
df['ADOSC'] = talib.ADOSC(df['high'], df['low'], df['close'], df['volume'], fastperiod=3, slowperiod=10)
df['OBV'] = talib.OBV(df['close'], df['volume'])

# Volatility Indicators
df['ATR'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=14)
df['NATR'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=14)
df['TRANGE'] = talib.TRANGE(df['high'], df['low'], df['close'])

# Price Transform
df['AVGPRICE'] = talib.AVGPRICE(df['open'], df['high'], df['low'], df['close'])
df['MEDPRICE'] = talib.MEDPRICE(df['high'], df['low'])
df['TYPPRICE'] = talib.TYPPRICE(df['high'], df['low'], df['close'])
df['WCLPRICE'] = talib.WCLPRICE(df['high'], df['low'], df['close'])

# Cycle Indicators
df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['close'])
df['HT_DCPHASE'] = talib.HT_DCPHASE(df['close'])
df['HT_PHASOR_INPHASE'], df['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df['close'])
df['HT_SINE'], df['HT_LEADSINE'] = talib.HT_SINE(df['close'])
df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['close'])

# Statistical Functions
df['BETA'] = talib.BETA(df['high'], df['low'], timeperiod=5)
df['CORREL'] = talib.CORREL(df['high'], df['low'], timeperiod=30)
df['LINEARREG'] = talib.LINEARREG(df['close'], timeperiod=14)  # Linear Regression
df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['close'], timeperiod=14)
df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['close'], timeperiod=14)
df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['close'], timeperiod=14)
df['STDDEV'] = talib.STDDEV(df['close'], timeperiod=5, nbdev=1)
df['TSF'] = talib.TSF(df['close'], timeperiod=14)
df['VAR'] = talib.VAR(df['close'], timeperiod=5, nbdev=1)

# Bands
df['BBANDS_UPPER'], df['BBANDS_MIDDLE'], df['BBANDS_LOWER'] = talib.BBANDS(df['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)

df.dropna(inplace=True)

In [168]:
X = df.drop(columns=['buy_signal', 'sell_signal']) 
y = df['buy_signal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

# feature importances
feature_importances = model.feature_importances_
features_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
features_df = features_df.sort_values(by='Importance', ascending=False)

Accuracy: 0.9083063646170443
Precision: 0.125
Recall: 0.012658227848101266
F1 score: 0.022988505747126436
ROC AUC: 0.5022017554334846
Confusion matrix:
 [[841   7]
 [ 78   1]]


#### Results:  
Accuracy: 0.9083063646170443  
Precision: 0.125  
Recall: 0.012658227848101266  
F1 score: 0.022988505747126436  
ROC AUC: 0.5022017554334846  
Confusion matrix:  
 [841   7]  
 [ 78   1]  

First, the accuracy of the model is high, but the precision is very low and recall is extremely low. Precision is very important in this case, as correctly identifying the buy signals is the priority.

Before trying to remove features I will experiment with the Random Forest classification threshold.

In [171]:
X = df.drop(columns=['buy_signal', 'sell_signal']) 
y = df['buy_signal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

probabilities = model.predict_proba(X_test)[:, 1]

threshold = 0.2

predictions = (probabilities >= threshold).astype(int)

# evaluation metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
conf_matrix = confusion_matrix(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

# feature importances
feature_importances = model.feature_importances_
features_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
features_df = features_df.sort_values(by='Importance', ascending=False)

Accuracy: 0.8425026968716289
Precision: 0.18691588785046728
Recall: 0.25316455696202533
F1 score: 0.21505376344086022
ROC AUC: 0.7231460472892286
Confusion matrix:
 [[761  87]
 [ 59  20]]


#### Results
Accuracy: 0.8425026968716289  
Precision: 0.18691588785046728  
Recall: 0.25316455696202533  
F1 score: 0.21505376344086022  
ROC AUC: 0.7231460472892286  
Confusion matrix:  
 [761  87]  
 [ 59  20]  

After some experimenting with the threshold, these were the best results I could get. However, the precision and recall are still far too low for the model to be useful.  

#### Conclusion
I will have to try another algorithm or change my approach. Another approach could be to instead of classifying buy signals, market condition would be better.