# Feature selection for signal classification
Here I will test which features are best to train the signals_classification_model with.
The goal with the signals_classification_model is to correctly identify buy signals.

In [20]:
import pandas as pd
import talib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [21]:
df = pd.read_csv("ADA_USDT_Signals.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df.set_index("datetime", inplace=True)

df.head(5)

Unnamed: 0_level_0,open,high,low,close,volume,buy_signal,sell_signal
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-22 00:00:00,2.141,2.194,2.133,2.185,10918.3,0,0
2021-10-22 04:00:00,2.185,2.202,2.185,2.198,26091.8,0,0
2021-10-22 08:00:00,2.198,2.208,2.182,2.192,24092.3,0,0
2021-10-22 12:00:00,2.192,2.201,2.147,2.151,30423.4,0,0
2021-10-22 16:00:00,2.151,2.183,2.142,2.165,17709.4,0,0


In [22]:
# Technical indicators - used ChatGPT to add all of them to the dataset. Later, I will optimize to see which ones are best.

# Overlap Studies
df['DEMA'] = talib.DEMA(df['close'], timeperiod=30)
df['EMA'] = talib.EMA(df['close'], timeperiod=30)
df['KAMA'] = talib.KAMA(df['close'], timeperiod=30)
df['MA'] = talib.MA(df['close'], timeperiod=30)
df['MAMA'], df['FAMA'] = talib.MAMA(df['close'], fastlimit=0.5, slowlimit=0.05)
df['MIDPOINT'] = talib.MIDPOINT(df['close'], timeperiod=14)
df['MIDPRICE'] = talib.MIDPRICE(df['high'], df['low'], timeperiod=14)
df['SAR'] = talib.SAR(df['high'], df['low'], acceleration=0.02, maximum=0.2)
df['SAREXT'] = talib.SAREXT(df['high'], df['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df['SMA'] = talib.SMA(df['close'], timeperiod=30)
df['T3'] = talib.T3(df['close'], timeperiod=5, vfactor=0.7)
df['TEMA'] = talib.TEMA(df['close'], timeperiod=30)
df['TRIMA'] = talib.TRIMA(df['close'], timeperiod=30)
df['WMA'] = talib.WMA(df['close'], timeperiod=30)

# Momentum Indicators
df['ADX'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=14)
df['ADXR'] = talib.ADXR(df['high'], df['low'], df['close'], timeperiod=14)
df['APO'] = talib.APO(df['close'], fastperiod=12, slowperiod=26, matype=0)
df['AROON_DOWN'], df['AROON_UP'] = talib.AROON(df['high'], df['low'], timeperiod=14)
df['AROONOSC'] = talib.AROONOSC(df['high'], df['low'], timeperiod=14)
df['BOP'] = talib.BOP(df['open'], df['high'], df['low'], df['close'])
df['CCI'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=14)
df['CMO'] = talib.CMO(df['close'], timeperiod=14)
df['DX'] = talib.DX(df['high'], df['low'], df['close'], timeperiod=14)
df['MACD'], df['MACDSIGNAL'], df['MACDHIST'] = talib.MACD(df['close'], fastperiod=12, slowperiod=26, signalperiod=9)
df['MACDEXT'], df['MACDSIGNALEXT'], df['MACDHISTEXT'] = talib.MACDEXT(df['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df['MACDFIX'], df['MACDSIGNALFIX'], df['MACDHISTFIX'] = talib.MACDFIX(df['close'], signalperiod=9)
df['MFI'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=14)
df['MINUS_DI'] = talib.MINUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['MINUS_DM'] = talib.MINUS_DM(df['high'], df['low'], timeperiod=14)
df['MOM'] = talib.MOM(df['close'], timeperiod=10)
df['PLUS_DI'] = talib.PLUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['PLUS_DM'] = talib.PLUS_DM(df['high'], df['low'], timeperiod=14)
df['PPO'] = talib.PPO(df['close'], fastperiod=12, slowperiod=26, matype=0)
df['ROC'] = talib.ROC(df['close'], timeperiod=10)
df['ROCP'] = talib.ROCP(df['close'], timeperiod=10)
df['ROCR'] = talib.ROCR(df['close'], timeperiod=10)
df['ROCR100'] = talib.ROCR100(df['close'], timeperiod=10)
df['RSI'] = talib.RSI(df['close'], timeperiod=14)
df['STOCH'], df['STOCH_SLOWD'] = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df['STOCHF'], df['STOCHF_FASTD'] = talib.STOCHF(df['high'], df['low'], df['close'], fastk_period=5, fastd_period=3, fastd_matype=0)
df['STOCHRSI'], df['STOCHRSI_FASTD'] = talib.STOCHRSI(df['close'], timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
df['TRIX'] = talib.TRIX(df['close'], timeperiod=30)
df['ULTOSC'] = talib.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=7, timeperiod2=14, timeperiod3=28)
df['WILLR'] = talib.WILLR(df['high'], df['low'], df['close'], timeperiod=14)

# Volume Indicators
df['AD'] = talib.AD(df['high'], df['low'], df['close'], df['volume'])
df['ADOSC'] = talib.ADOSC(df['high'], df['low'], df['close'], df['volume'], fastperiod=3, slowperiod=10)
df['OBV'] = talib.OBV(df['close'], df['volume'])

# Volatility Indicators
df['ATR'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=14)
df['NATR'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=14)
df['TRANGE'] = talib.TRANGE(df['high'], df['low'], df['close'])

# Price Transform
df['AVGPRICE'] = talib.AVGPRICE(df['open'], df['high'], df['low'], df['close'])
df['MEDPRICE'] = talib.MEDPRICE(df['high'], df['low'])
df['TYPPRICE'] = talib.TYPPRICE(df['high'], df['low'], df['close'])
df['WCLPRICE'] = talib.WCLPRICE(df['high'], df['low'], df['close'])

# Cycle Indicators
df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['close'])
df['HT_DCPHASE'] = talib.HT_DCPHASE(df['close'])
df['HT_PHASOR_INPHASE'], df['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df['close'])
df['HT_SINE'], df['HT_LEADSINE'] = talib.HT_SINE(df['close'])
df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['close'])

# Statistical Functions
df['BETA'] = talib.BETA(df['high'], df['low'], timeperiod=5)
df['CORREL'] = talib.CORREL(df['high'], df['low'], timeperiod=30)
df['LINEARREG'] = talib.LINEARREG(df['close'], timeperiod=14)  # Linear Regression
df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['close'], timeperiod=14)
df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['close'], timeperiod=14)
df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['close'], timeperiod=14)
df['STDDEV'] = talib.STDDEV(df['close'], timeperiod=5, nbdev=1)
df['TSF'] = talib.TSF(df['close'], timeperiod=14)
df['VAR'] = talib.VAR(df['close'], timeperiod=5, nbdev=1)

# Bands
df['BBANDS_UPPER'], df['BBANDS_MIDDLE'], df['BBANDS_LOWER'] = talib.BBANDS(df['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)

df.dropna(inplace=True)

In [23]:
X = df.drop(columns=['buy_signal', 'sell_signal']) 
y = df['buy_signal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

# feature importances
feature_importances = model.feature_importances_
features_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
features_df = features_df.sort_values(by='Importance', ascending=False)

Accuracy: 0.9083063646170443
Precision: 0.125
Recall: 0.012658227848101266
F1 score: 0.022988505747126436
ROC AUC: 0.5022017554334846
Confusion matrix:
 [[841   7]
 [ 78   1]]


#### Results:  
Accuracy: 0.9083063646170443  
Precision: 0.125  
Recall: 0.012658227848101266  
F1 score: 0.022988505747126436  
ROC AUC: 0.5022017554334846  
Confusion matrix:  
 [841   7]  
 [ 78   1]  

First, the accuracy of the model is high, but the precision is very low and recall is extremely low. Precision is very important in this case, as correctly identifying the buy signals is the priority.

Before trying to remove features I will experiment with the Random Forest classification threshold.

In [24]:
X = df.drop(columns=['buy_signal', 'sell_signal']) 
y = df['buy_signal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

probabilities = model.predict_proba(X_test)[:, 1]

threshold = 0.2

predictions = (probabilities >= threshold).astype(int)

# evaluation metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
conf_matrix = confusion_matrix(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

# feature importances
feature_importances = model.feature_importances_
features_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
features_df = features_df.sort_values(by='Importance', ascending=False)

Accuracy: 0.8425026968716289
Precision: 0.18691588785046728
Recall: 0.25316455696202533
F1 score: 0.21505376344086022
ROC AUC: 0.7231460472892286
Confusion matrix:
 [[761  87]
 [ 59  20]]


#### Results
Accuracy: 0.8425026968716289  
Precision: 0.18691588785046728  
Recall: 0.25316455696202533  
F1 score: 0.21505376344086022  
ROC AUC: 0.7231460472892286  
Confusion matrix:  
 [761  87]  
 [ 59  20]  

After some experimenting with the threshold, these were the best results I could get. However, the precision and recall are still far too low for the model to be useful.  

#### Conclusion
I will have to try another algorithm or change my approach. Another approach could be to instead of classifying buy signals, try to classify market condition (bearish, bullish, sideways).

First, I will try a different way of generating the buy signals. The way I created the buy feature included a stop/loss and didn't allow for buy signals to be generated if a position had already been entered. My hypothesis here is that this makes it more difficult for the algorithm to detect patterns, as sometimes where the conditions are met for a buy signal to be generated are skipped, since I applied the logic of a trading strategy.

In [25]:
# New dataset where the buy signals are generated without restrictions
df = pd.read_csv("ADA_USDT_Signals2.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df.set_index("datetime", inplace=True)

# Technical indicators - used ChatGPT to add all of them to the dataset. Later, I will optimize to see which ones are best.

# Overlap Studies
df['DEMA'] = talib.DEMA(df['close'], timeperiod=30)
df['EMA'] = talib.EMA(df['close'], timeperiod=30)
df['KAMA'] = talib.KAMA(df['close'], timeperiod=30)
df['MA'] = talib.MA(df['close'], timeperiod=30)
df['MAMA'], df['FAMA'] = talib.MAMA(df['close'], fastlimit=0.5, slowlimit=0.05)
df['MIDPOINT'] = talib.MIDPOINT(df['close'], timeperiod=14)
df['MIDPRICE'] = talib.MIDPRICE(df['high'], df['low'], timeperiod=14)
df['SAR'] = talib.SAR(df['high'], df['low'], acceleration=0.02, maximum=0.2)
df['SAREXT'] = talib.SAREXT(df['high'], df['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df['SMA'] = talib.SMA(df['close'], timeperiod=30)
df['T3'] = talib.T3(df['close'], timeperiod=5, vfactor=0.7)
df['TEMA'] = talib.TEMA(df['close'], timeperiod=30)
df['TRIMA'] = talib.TRIMA(df['close'], timeperiod=30)
df['WMA'] = talib.WMA(df['close'], timeperiod=30)

# Momentum Indicators
df['ADX'] = talib.ADX(df['high'], df['low'], df['close'], timeperiod=14)
df['ADXR'] = talib.ADXR(df['high'], df['low'], df['close'], timeperiod=14)
df['APO'] = talib.APO(df['close'], fastperiod=12, slowperiod=26, matype=0)
df['AROON_DOWN'], df['AROON_UP'] = talib.AROON(df['high'], df['low'], timeperiod=14)
df['AROONOSC'] = talib.AROONOSC(df['high'], df['low'], timeperiod=14)
df['BOP'] = talib.BOP(df['open'], df['high'], df['low'], df['close'])
df['CCI'] = talib.CCI(df['high'], df['low'], df['close'], timeperiod=14)
df['CMO'] = talib.CMO(df['close'], timeperiod=14)
df['DX'] = talib.DX(df['high'], df['low'], df['close'], timeperiod=14)
df['MACD'], df['MACDSIGNAL'], df['MACDHIST'] = talib.MACD(df['close'], fastperiod=12, slowperiod=26, signalperiod=9)
df['MACDEXT'], df['MACDSIGNALEXT'], df['MACDHISTEXT'] = talib.MACDEXT(df['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df['MACDFIX'], df['MACDSIGNALFIX'], df['MACDHISTFIX'] = talib.MACDFIX(df['close'], signalperiod=9)
df['MFI'] = talib.MFI(df['high'], df['low'], df['close'], df['volume'], timeperiod=14)
df['MINUS_DI'] = talib.MINUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['MINUS_DM'] = talib.MINUS_DM(df['high'], df['low'], timeperiod=14)
df['MOM'] = talib.MOM(df['close'], timeperiod=10)
df['PLUS_DI'] = talib.PLUS_DI(df['high'], df['low'], df['close'], timeperiod=14)
df['PLUS_DM'] = talib.PLUS_DM(df['high'], df['low'], timeperiod=14)
df['PPO'] = talib.PPO(df['close'], fastperiod=12, slowperiod=26, matype=0)
df['ROC'] = talib.ROC(df['close'], timeperiod=10)
df['ROCP'] = talib.ROCP(df['close'], timeperiod=10)
df['ROCR'] = talib.ROCR(df['close'], timeperiod=10)
df['ROCR100'] = talib.ROCR100(df['close'], timeperiod=10)
df['RSI'] = talib.RSI(df['close'], timeperiod=14)
df['STOCH'], df['STOCH_SLOWD'] = talib.STOCH(df['high'], df['low'], df['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df['STOCHF'], df['STOCHF_FASTD'] = talib.STOCHF(df['high'], df['low'], df['close'], fastk_period=5, fastd_period=3, fastd_matype=0)
df['STOCHRSI'], df['STOCHRSI_FASTD'] = talib.STOCHRSI(df['close'], timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
df['TRIX'] = talib.TRIX(df['close'], timeperiod=30)
df['ULTOSC'] = talib.ULTOSC(df['high'], df['low'], df['close'], timeperiod1=7, timeperiod2=14, timeperiod3=28)
df['WILLR'] = talib.WILLR(df['high'], df['low'], df['close'], timeperiod=14)

# Volume Indicators
df['AD'] = talib.AD(df['high'], df['low'], df['close'], df['volume'])
df['ADOSC'] = talib.ADOSC(df['high'], df['low'], df['close'], df['volume'], fastperiod=3, slowperiod=10)
df['OBV'] = talib.OBV(df['close'], df['volume'])

# Volatility Indicators
df['ATR'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=14)
df['NATR'] = talib.NATR(df['high'], df['low'], df['close'], timeperiod=14)
df['TRANGE'] = talib.TRANGE(df['high'], df['low'], df['close'])

# Price Transform
df['AVGPRICE'] = talib.AVGPRICE(df['open'], df['high'], df['low'], df['close'])
df['MEDPRICE'] = talib.MEDPRICE(df['high'], df['low'])
df['TYPPRICE'] = talib.TYPPRICE(df['high'], df['low'], df['close'])
df['WCLPRICE'] = talib.WCLPRICE(df['high'], df['low'], df['close'])

# Cycle Indicators
df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['close'])
df['HT_DCPHASE'] = talib.HT_DCPHASE(df['close'])
df['HT_PHASOR_INPHASE'], df['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df['close'])
df['HT_SINE'], df['HT_LEADSINE'] = talib.HT_SINE(df['close'])
df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['close'])

# Statistical Functions
df['BETA'] = talib.BETA(df['high'], df['low'], timeperiod=5)
df['CORREL'] = talib.CORREL(df['high'], df['low'], timeperiod=30)
df['LINEARREG'] = talib.LINEARREG(df['close'], timeperiod=14)  # Linear Regression
df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['close'], timeperiod=14)
df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['close'], timeperiod=14)
df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['close'], timeperiod=14)
df['STDDEV'] = talib.STDDEV(df['close'], timeperiod=5, nbdev=1)
df['TSF'] = talib.TSF(df['close'], timeperiod=14)
df['VAR'] = talib.VAR(df['close'], timeperiod=5, nbdev=1)

# Bands
df['BBANDS_UPPER'], df['BBANDS_MIDDLE'], df['BBANDS_LOWER'] = talib.BBANDS(df['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)

df.dropna(inplace=True)

In [26]:
X = df.drop(columns=['buy_signal']) 
y = df['buy_signal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

Accuracy: 0.8295577130528586
Precision: 0.8455882352941176
Recall: 0.6647398843930635
F1 score: 0.7443365695792881
ROC AUC: 0.7962253638832788
Confusion matrix:
 [[539  42]
 [116 230]]


### Results
Accuracy: 0.8295577130528586  
Precision: 0.8455882352941176  
Recall: 0.6647398843930635  
F1 score: 0.7443365695792881  
ROC AUC: 0.7962253638832788  
Confusion matrix:  
 [539  42]  
 [116 230]  

Ah, there we go. Much better. The precision is quite high, and a trading strategy with 85% good entries is excellent. Still, I will try other algorithms as well before I try optimizing features.

In [55]:
# Logistic Regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg_model = LogisticRegression(random_state=42, max_iter=1000)
logreg_model.fit(X_train_scaled, y_train)

y_pred = logreg_model.predict(X_test_scaled)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.6353829557713053
Precision: 0.5338983050847458
Recall: 0.18208092485549132
F1 Score: 0.27155172413793105
Confusion Matrix:
[[526  55]
 [283  63]]


### Results
Accuracy: 0.6353829557713053  
Precision: 0.5338983050847458  
Recall: 0.18208092485549132  
F1 Score: 0.27155172413793105  
Confusion Matrix:  
 [526  55]  
 [283  63]  

Clearly Logistic Regression performed far worse than Random Forest. Next I will try SVM.

In [28]:
# SVM
from sklearn.svm import SVC

# try linear kernel first
svm_model = SVC(kernel="linear", probability=True, random_state=42)

svm_model.fit(X_train_scaled, y_train)

y_pred = svm_model.predict(X_test_scaled)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.6299892125134844
Precision: 0.5405405405405406
Recall: 0.057803468208092484
F1 Score: 0.10443864229765012
Confusion Matrix:
[[564  17]
 [326  20]]


In [29]:
# SVM
# try non-linear kernel next, I'll go with RBF
svm_model = SVC(kernel="rbf", probability=True, random_state=42)
svm_model.fit(X_train_scaled, y_train)

y_pred = svm_model.predict(X_test_scaled)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.697950377562028
Precision: 0.7619047619047619
Recall: 0.2774566473988439
F1 Score: 0.4067796610169492
Confusion Matrix:
[[551  30]
 [250  96]]


### Results
Accuracy: 0.697950377562028  
Precision: 0.7619047619047619  
Recall: 0.2774566473988439  
F1 Score: 0.4067796610169492  
Confusion Matrix:  
[551  30]  
 [250  96]  


SVM performed better with a non-linear kernal than a linear one, but the results are still worse than Random Forest. Next I'll try k-NN.

In [30]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=1)

knn_model.fit(X_train_scaled, y_train)

y_pred = knn_model.predict(X_test_scaled)

# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.819848975188781
Precision: 0.7720364741641338
Recall: 0.7341040462427746
F1 Score: 0.7525925925925926
Confusion Matrix:
[[506  75]
 [ 92 254]]


### Results
Accuracy: 0.819848975188781  
Precision: 0.7720364741641338  
Recall: 0.7341040462427746  
F1 Score: 0.7525925925925926  
Confusion Matrix:  
[506  75]  
 [ 92 254]  

The k-NN model performed best with n_neighbors=1, but this is likely because of how the buy signals are generated. Sometimes buy signals are generated consecutively, so this score may be deceptive. These results were slightly better than the Random Forest algorithm, but nevertheless I feel more confident in the Random Forest algorithm.

### Conclusion
After trying Random Forest, Logistic Regression, SVM and k-NN, it's either Random Forest or k-NN. I will test them both on the more recent ADAUSDT dataset to see which performs better there.

In [49]:
df_recent = pd.read_csv("ADA_USDT_recent_Signals2.csv")
df_recent["datetime"] = pd.to_datetime(df_recent["datetime"])
df_recent.set_index("datetime", inplace=True)

# Overlap Studies
df_recent['DEMA'] = talib.DEMA(df_recent['close'], timeperiod=30)
df_recent['EMA'] = talib.EMA(df_recent['close'], timeperiod=30)
df_recent['KAMA'] = talib.KAMA(df_recent['close'], timeperiod=30)
df_recent['MA'] = talib.MA(df_recent['close'], timeperiod=30)
df_recent['MAMA'], df_recent['FAMA'] = talib.MAMA(df_recent['close'], fastlimit=0.5, slowlimit=0.05)
df_recent['MIDPOINT'] = talib.MIDPOINT(df_recent['close'], timeperiod=14)
df_recent['MIDPRICE'] = talib.MIDPRICE(df_recent['high'], df_recent['low'], timeperiod=14)
df_recent['SAR'] = talib.SAR(df_recent['high'], df_recent['low'], acceleration=0.02, maximum=0.2)
df_recent['SAREXT'] = talib.SAREXT(df_recent['high'], df_recent['low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                            accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                            accelerationshort=0.02, accelerationmaxshort=0.2)
df_recent['SMA'] = talib.SMA(df_recent['close'], timeperiod=30)
df_recent['T3'] = talib.T3(df_recent['close'], timeperiod=5, vfactor=0.7)
df_recent['TEMA'] = talib.TEMA(df_recent['close'], timeperiod=30)
df_recent['TRIMA'] = talib.TRIMA(df_recent['close'], timeperiod=30)
df_recent['WMA'] = talib.WMA(df_recent['close'], timeperiod=30)

# Momentum Indicators
df_recent['ADX'] = talib.ADX(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['ADXR'] = talib.ADXR(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['APO'] = talib.APO(df_recent['close'], fastperiod=12, slowperiod=26, matype=0)
df_recent['AROON_DOWN'], df_recent['AROON_UP'] = talib.AROON(df_recent['high'], df_recent['low'], timeperiod=14)
df_recent['AROONOSC'] = talib.AROONOSC(df_recent['high'], df_recent['low'], timeperiod=14)
df_recent['BOP'] = talib.BOP(df_recent['open'], df_recent['high'], df_recent['low'], df_recent['close'])
df_recent['CCI'] = talib.CCI(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['CMO'] = talib.CMO(df_recent['close'], timeperiod=14)
df_recent['DX'] = talib.DX(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['MACD'], df_recent['MACDSIGNAL'], df_recent['MACDHIST'] = talib.MACD(df_recent['close'], fastperiod=12, slowperiod=26, signalperiod=9)
df_recent['MACDEXT'], df_recent['MACDSIGNALEXT'], df_recent['MACDHISTEXT'] = talib.MACDEXT(df_recent['close'], fastperiod=12, fastmatype=0, slowperiod=26,
                                                                      slowmatype=0, signalperiod=9, signalmatype=0)
df_recent['MACDFIX'], df_recent['MACDSIGNALFIX'], df_recent['MACDHISTFIX'] = talib.MACDFIX(df_recent['close'], signalperiod=9)
df_recent['MFI'] = talib.MFI(df_recent['high'], df_recent['low'], df_recent['close'], df_recent['volume'], timeperiod=14)
df_recent['MINUS_DI'] = talib.MINUS_DI(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['MINUS_DM'] = talib.MINUS_DM(df_recent['high'], df_recent['low'], timeperiod=14)
df_recent['MOM'] = talib.MOM(df_recent['close'], timeperiod=10)
df_recent['PLUS_DI'] = talib.PLUS_DI(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['PLUS_DM'] = talib.PLUS_DM(df_recent['high'], df_recent['low'], timeperiod=14)
df_recent['PPO'] = talib.PPO(df_recent['close'], fastperiod=12, slowperiod=26, matype=0)
df_recent['ROC'] = talib.ROC(df_recent['close'], timeperiod=10)
df_recent['ROCP'] = talib.ROCP(df_recent['close'], timeperiod=10)
df_recent['ROCR'] = talib.ROCR(df_recent['close'], timeperiod=10)
df_recent['ROCR100'] = talib.ROCR100(df_recent['close'], timeperiod=10)
df_recent['RSI'] = talib.RSI(df_recent['close'], timeperiod=14)
df_recent['STOCH'], df_recent['STOCH_SLOWD'] = talib.STOCH(df_recent['high'], df_recent['low'], df_recent['close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
df_recent['STOCHF'], df_recent['STOCHF_FASTD'] = talib.STOCHF(df_recent['high'], df_recent['low'], df_recent['close'], fastk_period=5, fastd_period=3, fastd_matype=0)
df_recent['STOCHRSI'], df_recent['STOCHRSI_FASTD'] = talib.STOCHRSI(df_recent['close'], timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
df_recent['TRIX'] = talib.TRIX(df_recent['close'], timeperiod=30)
df_recent['ULTOSC'] = talib.ULTOSC(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod1=7, timeperiod2=14, timeperiod3=28)
df_recent['WILLR'] = talib.WILLR(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)

# Volume Indicators
df_recent['AD'] = talib.AD(df_recent['high'], df_recent['low'], df_recent['close'], df_recent['volume'])
df_recent['ADOSC'] = talib.ADOSC(df_recent['high'], df_recent['low'], df_recent['close'], df_recent['volume'], fastperiod=3, slowperiod=10)
df_recent['OBV'] = talib.OBV(df_recent['close'], df_recent['volume'])

# Volatility Indicators
df_recent['ATR'] = talib.ATR(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['NATR'] = talib.NATR(df_recent['high'], df_recent['low'], df_recent['close'], timeperiod=14)
df_recent['TRANGE'] = talib.TRANGE(df_recent['high'], df_recent['low'], df_recent['close'])

# Price Transform
df_recent['AVGPRICE'] = talib.AVGPRICE(df_recent['open'], df_recent['high'], df_recent['low'], df_recent['close'])
df_recent['MEDPRICE'] = talib.MEDPRICE(df_recent['high'], df_recent['low'])
df_recent['TYPPRICE'] = talib.TYPPRICE(df_recent['high'], df_recent['low'], df_recent['close'])
df_recent['WCLPRICE'] = talib.WCLPRICE(df_recent['high'], df_recent['low'], df_recent['close'])

# Cycle Indicators
df_recent['HT_DCPERIOD'] = talib.HT_DCPERIOD(df_recent['close'])
df_recent['HT_DCPHASE'] = talib.HT_DCPHASE(df_recent['close'])
df_recent['HT_PHASOR_INPHASE'], df_recent['HT_PHASOR_QUADRATURE'] = talib.HT_PHASOR(df_recent['close'])
df_recent['HT_SINE'], df_recent['HT_LEADSINE'] = talib.HT_SINE(df_recent['close'])
df_recent['HT_TRENDMODE'] = talib.HT_TRENDMODE(df_recent['close'])

# Statistical Functions
df_recent['BETA'] = talib.BETA(df_recent['high'], df_recent['low'], timeperiod=5)
df_recent['CORREL'] = talib.CORREL(df_recent['high'], df_recent['low'], timeperiod=30)
df_recent['LINEARREG'] = talib.LINEARREG(df_recent['close'], timeperiod=14)  # Linear Regression
df_recent['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df_recent['close'], timeperiod=14)
df_recent['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df_recent['close'], timeperiod=14)
df_recent['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df_recent['close'], timeperiod=14)
df_recent['STDDEV'] = talib.STDDEV(df_recent['close'], timeperiod=5, nbdev=1)
df_recent['TSF'] = talib.TSF(df_recent['close'], timeperiod=14)
df_recent['VAR'] = talib.VAR(df_recent['close'], timeperiod=5, nbdev=1)

# Bands
df_recent['BBANDS_UPPER'], df_recent['BBANDS_MIDDLE'], df_recent['BBANDS_LOWER'] = talib.BBANDS(df_recent['close'], timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)

df_recent.dropna(inplace=True)

In [50]:
# Random Forest
X_recent = df_recent.drop(columns=["buy_signal"])
y_recent_true = df_recent["buy_signal"]

y_pred = rf_model.predict(X_recent)

# evaluation metrics
accuracy = accuracy_score(y_recent_true, y_pred)
precision = precision_score(y_recent_true, y_pred)
recall = recall_score(y_recent_true, y_pred)
f1 = f1_score(y_recent_true, y_pred)
conf_matrix = confusion_matrix(y_recent_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"Confusion matrix:\n {conf_matrix}")

Accuracy: 0.6025878003696857
Precision: 0.45569620253164556
Recall: 0.17307692307692307
F1 score: 0.2508710801393728
ROC AUC: 0.5219738969738971
Confusion matrix:
 [[290  43]
 [172  36]]


In [53]:
# k-NN
X_recent_scaled = scaler.fit_transform(X_recent)

y_pred = knn_model.predict(X_recent_scaled)

# evaluation metrics
accuracy = accuracy_score(y_recent_true, y_pred)
precision = precision_score(y_recent_true, y_pred)
recall = recall_score(y_recent_true, y_pred)
f1 = f1_score(y_recent_true, y_pred)
conf_matrix = confusion_matrix(y_recent_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"Confusion matrix:\n {conf_matrix}")

Accuracy: 0.5785582255083179
Precision: 0.4484536082474227
Recall: 0.4182692307692308
F1 score: 0.43283582089552236
ROC AUC: 0.5484739547239548
Confusion matrix:
 [[226 107]
 [121  87]]


In [56]:
# SVM
y_pred = svm_model.predict(X_recent_scaled)

# evaluation metrics
accuracy = accuracy_score(y_recent_true, y_pred)
precision = precision_score(y_recent_true, y_pred)
recall = recall_score(y_recent_true, y_pred)
f1 = f1_score(y_recent_true, y_pred)
conf_matrix = confusion_matrix(y_recent_true, y_pred)
roc_auc = roc_auc_score(y_recent_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

Accuracy: 0.6136783733826248
Precision: 0.48717948717948717
Recall: 0.09134615384615384
F1 score: 0.15384615384615385
ROC AUC: 0.5156430468930469
Confusion matrix:
 [[313  20]
 [189  19]]


In [57]:
# Logarithmic Regression
y_pred = logreg_model.predict(X_recent_scaled)

# evaluation metrics
accuracy = accuracy_score(y_recent_true, y_pred)
precision = precision_score(y_recent_true, y_pred)
recall = recall_score(y_recent_true, y_pred)
f1 = f1_score(y_recent_true, y_pred)
conf_matrix = confusion_matrix(y_recent_true, y_pred)
roc_auc = roc_auc_score(y_recent_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix:\n {conf_matrix}")

Accuracy: 0.6173752310536045
Precision: 0.5050505050505051
Recall: 0.2403846153846154
F1 score: 0.3257328990228013
ROC AUC: 0.5466187341187341
Confusion matrix:
 [[284  49]
 [158  50]]


### Result
Well, turns out, the models didn't work particularly well on the recent data using any of the previously used algorithms. I will have to change my approach.  
Still, good practise and learning opportunity for a student such as myself!