In [1]:
from data_storage import create_connection
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


In [None]:
#this notebook trains each machine learning model for each cryptocurrency and is evaluated in the end by 
#weighted F1 score, the macro F1 score, the macro recall and the macro precision
#however the models are trained on an oversampled dataset

In [2]:
connection = create_connection("../database/crypto_billionairs.db")
scaler = MinMaxScaler()

In [3]:
def apply_random_forest(X_train, X_test, y_train, y_test, table, db_connection):
    
    clf = RandomForestClassifier(criterion="entropy", min_samples_split= 0.01, min_samples_leaf= 0.005, max_depth=10, class_weight="balanced_subsample")
    clf.fit(X_train, y_train)
    #print(X_test.head())
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = clf.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"{table}_rf", db_connection, if_exists="replace")
    return "random forest", f1score, f1score_macro, recall, precision

In [4]:
def apply_knn(X_train, X_test, y_train, y_test, table, db_connection):
    
    neigh = KNeighborsClassifier(weights="uniform", n_neighbors=3, algorithm="ball_tree")
    neigh.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = neigh.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"{table}_knn", db_connection, if_exists="replace")
    
    return "k-nearest neighbour", f1score, f1score_macro, recall, precision

In [5]:
def apply_support_vector_machine(X_train, X_test, y_train, y_test, table, db_connection):
    
    svc = SVC(kernel="poly", degree=4, C=1)
    svc.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = svc.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"{table}_svc", db_connection, if_exists="replace")
    
    return "support vector classifier", f1score, f1score_macro, precision, recall

In [6]:
def apply_mlp(X_train, X_test, y_train, y_test, table, db_connection):
    
    mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation="tanh", solver="lbfgs", learning_rate="constant", learning_rate_init=2e-5, tol=1e-5)
    mlp.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = mlp.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"{table}_mlp", db_connection, if_exists="replace")
    
    return "mlp classifier", f1score, f1score_macro, precision, recall 

In [7]:
def apply_logistic_regression(X_train, X_test, y_train, y_test, table, db_connection):
    
    lg = LogisticRegression(solver="liblinear", penalty="l1", C=1)
    lg.fit(X_train, y_train)
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    
    y_pred = lg.predict(X_test_normalized)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    df.to_sql(f"{table}_lr", db_connection, if_exists="replace")
    
    return "logistic regression", f1score, f1score_macro, recall, precision
    

In [8]:
def apply_ml_algorithms(db_connection):
    
    df_ml = pd.DataFrame(columns = range(6))
    df_ml.columns = ["table_name", "model", "f1-score weighted", "f1-score macro", "recall macro", "precision macro"]
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name and not 'equity_curve' in name and '_pooling' not in name]
    ros = RandomOverSampler(random_state=0)
    print(filtered_table_names)
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
    
        y = df_temp["buy_indicator"] + df_temp["short_indicator"]
        y = y.fillna(0)
        y = y.astype(str)
        
        X = df_temp.drop(["return", "buy_indicator", "short_indicator", "close_buy_indicator", "close_short_indicator", "time", "index"], axis=1)
        
        X_train = X.iloc[:-365]
        print(table)
        X_train[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_train[["open", "close", "high", "low", "volume"]])
        
        X_test = X.iloc[-365:]
        
        y_train = y.iloc[:-365]
        y_test = y.iloc[-365:]
        
        X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
        
        string, score, f1score_macro, recall, precision  = apply_random_forest(X_resampled, X_test, y_resampled, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_knn(X_resampled, X_test, y_resampled, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_support_vector_machine(X_resampled, X_test, y_resampled, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_mlp(X_resampled, X_test, y_resampled, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        string, score, f1score_macro, recall, precision = apply_logistic_regression(X_resampled, X_test, y_resampled, y_test, table[:4] ,db_connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
    return df_ml
        

In [9]:
%%capture
df_ml = apply_ml_algorithms(connection)

In [14]:
df_ml

Unnamed: 0,table_name,model,f1-score weighted,f1-score macro,recall macro,precision macro
0,ADA_1min_complete_1day_preprocessed_1day_features,random forest,0.434613,0.306404,0.369069,0.394230
1,ADA_1min_complete_1day_preprocessed_1day_features,k-nearest neighbour,0.180962,0.150563,0.366187,0.283912
2,ADA_1min_complete_1day_preprocessed_1day_features,support vector classifier,0.195157,0.176035,0.314289,0.308376
3,ADA_1min_complete_1day_preprocessed_1day_features,mlp classifier,0.036822,0.084530,0.048402,0.333333
4,ADA_1min_complete_1day_preprocessed_1day_features,logistic regression,0.606267,0.355632,0.353440,0.383783
...,...,...,...,...,...,...
120,TRX_1min_complete_1day_preprocessed_1day_features,random forest,0.670199,0.436679,0.454135,0.429374
121,TRX_1min_complete_1day_preprocessed_1day_features,k-nearest neighbour,0.677228,0.291152,0.333333,0.258447
122,TRX_1min_complete_1day_preprocessed_1day_features,support vector classifier,0.021687,0.064356,0.035813,0.317073
123,TRX_1min_complete_1day_preprocessed_1day_features,mlp classifier,0.405566,0.208565,0.261103,0.252693


In [15]:
df_ml.groupby(df_ml["model"]).mean()

Unnamed: 0_level_0,f1-score weighted,f1-score macro,recall macro,precision macro
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
k-nearest neighbour,0.495017,0.279261,0.330234,0.337764
logistic regression,0.345094,0.242474,0.354005,0.342169
mlp classifier,0.308326,0.190413,0.25118,0.331732
random forest,0.564501,0.343584,0.371608,0.367727
support vector classifier,0.305066,0.21084,0.284053,0.342252
