In [1]:
from data_storage import create_connection
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


In [None]:
#this notebook trains each machine learning model for each cryptocurrency and is evaluated in the end by 
#weighted F1 score, the macro F1 score, the macro recall and the macro precision

In [2]:
connection = create_connection("../database/crypto_billionairs.db")
scaler = MinMaxScaler()

In [3]:
def apply_random_forest(X_train, X_test, y_train, y_test, table, db_connection):
    
    clf = RandomForestClassifier(criterion="entropy", min_samples_split= 0.01, min_samples_leaf= 0.005, max_depth=10, class_weight="balanced_subsample")
    clf.fit(X_train, y_train)
    
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = clf.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"random_forest_{table}", db_connection, if_exists="replace")
    return "random forest", f1score, f1score_macro, recall, precision

In [4]:
def apply_knn(X_train, X_test, y_train, y_test, table, db_connection):
    
    neigh = KNeighborsClassifier(weights="uniform", n_neighbors=3, algorithm="ball_tree")
    neigh.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = neigh.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"knn_{table}", db_connection, if_exists="replace")
    
    return "k-nearest neighbour", f1score, f1score_macro, recall, precision

In [5]:
def apply_support_vector_machine(X_train, X_test, y_train, y_test, table, db_connection):
    
    svc = SVC(kernel="poly", degree=4, C=1)
    svc.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = svc.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"svc_{table}", db_connection, if_exists="replace")
    
    return "support vector classifier", f1score, f1score_macro, precision, recall

In [6]:
def apply_mlp(X_train, X_test, y_train, y_test, table, db_connection):
    
    mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation="tanh", solver="lbfgs", learning_rate="constant", learning_rate_init=2e-5, tol=1e-5)
    mlp.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = mlp.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"mlp_{table}", db_connection, if_exists="replace")
    
    return "mlp classifier", f1score, f1score_macro, precision, recall 

In [7]:
def apply_logistic_regression(X_train, X_test, y_train, y_test, table, db_connection):
    
    lg = LogisticRegression(solver="liblinear", penalty="l1", C=1)
    lg.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    
    y_pred = lg.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"logistic_regression_{table}", db_connection, if_exists="replace")
    
    return "logistic regression", f1score, f1score_macro, recall, precision
    

In [8]:
def apply_ml_algorithms(db_connection):
    
    df_ml = pd.DataFrame(columns = range(6))
    df_ml.columns = ["table_name", "model", "f1-score weighted", "f1-score macro", "recall macro", "precision macro"]
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name and not 'equity_curve' in name and '_pooling' not in name]
    
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
        
        y = df_temp["buy_indicator"] + df_temp["short_indicator"]
        y = y.fillna(0)
        y = y.astype(str)
        
        X = df_temp.drop(["return", "buy_indicator", "short_indicator", "close_buy_indicator", "close_short_indicator", "time", "index", "market_cap"], axis=1)
        
        X_train = X.iloc[:-365]
        X_train[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_train[["open", "close", "high", "low", "volume"]])
        
        X_test = X.iloc[-365:]
        
        y_train = y.iloc[:-365]
        y_test = y.iloc[-365:]
        
        string, score, f1score_macro, recall, precision  = apply_random_forest(X_train, X_test, y_train, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_knn(X_train, X_test, y_train, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_support_vector_machine(X_train, X_test, y_train, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_mlp(X_train, X_test, y_train, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_logistic_regression(X_train, X_test, y_train, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
    return df_ml
        

In [9]:
%%capture
df_ml = apply_ml_algorithms(connection)

In [10]:
df_ml

Unnamed: 0,table_name,model,f1-score weighted,f1-score macro,recall macro,precision macro
0,ADA_1min_complete_1day_preprocessed_1day_features,random forest,0.617352,0.409826,0.422421,0.425032
1,ADA_1min_complete_1day_preprocessed_1day_features,k-nearest neighbour,0.633140,0.332671,0.359540,0.340431
2,ADA_1min_complete_1day_preprocessed_1day_features,support vector classifier,0.036822,0.084530,0.048402,0.333333
3,ADA_1min_complete_1day_preprocessed_1day_features,mlp classifier,0.614428,0.281035,0.242922,0.333333
4,ADA_1min_complete_1day_preprocessed_1day_features,logistic regression,0.628637,0.324293,0.353547,0.341202
5,BCH_1min_complete_1day_preprocessed_1day_features,random forest,0.613981,0.349614,0.349751,0.350334
6,BCH_1min_complete_1day_preprocessed_1day_features,k-nearest neighbour,0.634718,0.283912,0.330882,0.248619
7,BCH_1min_complete_1day_preprocessed_1day_features,support vector classifier,0.031778,0.078905,0.044749,0.333333
8,BCH_1min_complete_1day_preprocessed_1day_features,mlp classifier,0.636408,0.284668,0.248402,0.333333
9,BCH_1min_complete_1day_preprocessed_1day_features,logistic regression,0.629131,0.343504,0.370098,0.320738


In [12]:
df_ml.groupby(df_ml["model"]).mean()

Unnamed: 0_level_0,f1-score weighted,f1-score macro,recall macro,precision macro
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
k-nearest neighbour,0.54874,0.274738,0.333255,0.287409
logistic regression,0.597128,0.313604,0.353131,0.381633
mlp classifier,0.637193,0.288008,0.257263,0.33406
random forest,0.599273,0.345774,0.361947,0.353656
support vector classifier,0.052684,0.082999,0.050292,0.333284
