In [1]:
from data_storage import create_connection
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
import pickle
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


In [2]:
connection = create_connection("../database/crypto_billionairs.db")
scaler = MinMaxScaler()

In [3]:
def random_forest(X_train, y_train):
    
    clf = RandomForestClassifier(criterion="entropy", min_samples_split= 0.01, min_samples_leaf= 0.005, max_depth=10, class_weight="balanced_subsample")
    print("training random forest!")
    clf.fit(X_train, y_train)
   
    return "random_forest", clf

In [4]:
def knn(X_train, y_train):
    
    neigh = KNeighborsClassifier(weights="uniform", n_neighbors=5, algorithm="ball_tree")
    print("training knn!")
    neigh.fit(X_train, y_train)
    
    return "knn", neigh

In [5]:
def support_vector_machine(X_train, y_train):
    
    svc = SVC(kernel="poly", degree=4, C=1)
    print("training svc!")
    svc.fit(X_train, y_train)
    
    return "support_vector_classifier", svc

In [6]:
def mlp(X_train, y_train):
    
    mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation="tanh", solver="lbfgs", learning_rate="constant", learning_rate_init=2e-5, tol=1e-5)
    print("training mlp!")
    mlp.fit(X_train, y_train)
    
    return "mlp_classifier", mlp

In [7]:
def logistic_regression(X_train, y_train):
    global lg
    lg = LogisticRegression(solver="liblinear", penalty="l1", C=1)
    print("training lr!")
    lg.fit(X_train, y_train)
    
    return "logistic_regression", lg
    

In [8]:
def model_ensemble(X_train, y_train):
    
    print("training ensemble!")
    global rf_new
    
    level0 = list()
    level0.append(('lg', LogisticRegression(solver="liblinear", penalty="l1", C=1)))
    level0.append(('knn', KNeighborsClassifier(weights="uniform", n_neighbors=5, algorithm="ball_tree")))
    level0.append(('rf', RandomForestClassifier(criterion="entropy", min_samples_split= 0.01, min_samples_leaf= 0.005, max_depth=10, class_weight="balanced_subsample")))
    level0.append(('mlp', MLPClassifier(hidden_layer_sizes=(10, 10), activation="tanh", solver="lbfgs", learning_rate="constant", learning_rate_init=2e-5, tol=1e-5)))
    #mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation="tanh", solver="lbfgs", learning_rate="constant", learning_rate_init=2e-5, tol=1e-5)
    #mlp.fit(ensemble_dataset, y_train)
    
    level1 = RandomForestClassifier(criterion="entropy")
    rf_new = StackingClassifier(estimators=level0, final_estimator=level1)
    rf_new.fit(X_train, y_train)
    
    print("finished training ensemble!")
    return "ensemble", rf_new
    
    

In [9]:
def evaluation(X_test, y_test, model, model_name, table, db_connection):
    
    X_test_normalized = X_test.copy()
    X_test_normalized[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test_normalized[["open", "close", "high", "low", "volume"]])
    y_pred = model.predict(X_test_normalized)
    y_pred_proba = model.predict_proba(X_test_normalized)
    
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    
    df.to_sql(f"{table}_{model_name}_pooling", db_connection, if_exists="replace")
    
    return f"{model_name}_pooling", f1score, f1score_macro, recall, precision, y_pred_proba[:, 0], y_pred_proba[:, 1], y_pred_proba[:, 2]

In [10]:
def evaluation_ensemble(X_test, y_test, model, model_name, table, db_connection):
    
    y_pred = model.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    y_pred = y_pred.tolist()
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    
    df.to_sql(f"{table}_{model_name}_pooling", db_connection, if_exists="replace")
    
    return f"{model_name}_pooling", f1score, f1score_macro, recall, precision

In [11]:
def apply_ml_algorithms_pooling(db_connection):
    
    df_temp = pd.read_sql_query(f"select * from cryptocurrency_pooling_dataset", db_connection)
    df_temp = shuffle(df_temp, random_state=42069)
    
    y_train = df_temp["buy_indicator"] + df_temp["short_indicator"]
    y_train = y_train.fillna(0)
    y_train = y_train.astype(str)
        
    X_train = df_temp.drop(["return", "buy_indicator", "short_indicator","close_buy_indicator", "close_short_indicator", "time", "index", "level_0", "market_cap"], axis=1)
    
    X_train[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_train[["open", "close", "high", "low", "volume"]])
    
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    table_names_list = table_names['name'].tolist()
    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name and 'equity_curve' not in name and '_pooling' not in name and "_threshold_ensemble" not in name]
    print(filtered_table_names)
    
    # rf_name, rf_model = random_forest(X_resampled, y_resampled)
    # knn_name, knn_model = knn(X_resampled, y_resampled)
    #svc_name, svc_model = support_vector_machine(X_resampled, y_resampled)
    # mlp_name, mlp_model = mlp(X_resampled, y_resampled)
    # lr_name, lr_model = logistic_regression(X_resampled, y_resampled)
    
    ensemble_name, ensemble_model = model_ensemble(X_resampled, y_resampled)
    #creating the evaluation metric
    df_ml = pd.DataFrame(columns = range(6))
    df_ml.columns = ["table_name", "model", "f1-score weighted", "f1-score macro", "recall macro", "precision macro"]
    
    
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
        y = df_temp["buy_indicator"] + df_temp["short_indicator"]
        y = y.fillna(0)
        y = y.astype(str)
        X = df_temp.drop(["return", "buy_indicator", "short_indicator","close_buy_indicator", "close_short_indicator", "time", "index", "level_0", "market_cap"], axis=1)

        X_test = X.iloc[-365:]
        y_test = y.iloc[-365:]
        
        # string, score, f1score_macro, recall, precision, rf_pred_class1, rf_pred_class0, rf_pred_class_negative_1 = evaluation(X_test, y_test, rf_model, rf_name, table, connection)
        # df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        # string, score, f1score_macro, recall, precision, knn_pred_class1, knn_pred_class0, knn_pred_class_negative_1 = evaluation(X_test, y_test, knn_model, knn_name, table, connection)
        # df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        # string, score, f1score_macro, recall, precision = evaluation(X_test, y_test, svc_model, svc_name, table, connection)
        # df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        # string, score, f1score_macro, recall, precision, mlp_pred_class1, mlp_pred_class0, mlp_pred_class_negative_1 = evaluation(X_test, y_test, mlp_model, mlp_name, table, connection)
        # df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        # string, score, f1score_macro, recall, precision, lr_pred_class1, lr_pred_class0, lr_pred_class_negative_1 = evaluation(X_test, y_test, lr_model, lr_name, table, connection)
        # df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
        # ensemble_dataset = pd.DataFrame(columns = range(12))
        # ensemble_dataset.columns = ["rf_1", "rf_0", "rf_negative_1", "knn_1", "knn_0", "knn_negative_1", "mlp_1", "mlp_0", "mlp_negative_1", "lr_1", "lr_0", "lr_negative_1"]
        # ensemble_dataset["rf_1"] = rf_pred_class1
        # ensemble_dataset["rf_0"] = rf_pred_class0
        # ensemble_dataset["rf_negative_1"] = rf_pred_class_negative_1
        
        # ensemble_dataset["knn_1"] = knn_pred_class1
        # ensemble_dataset["knn_0"] = knn_pred_class0
        # ensemble_dataset["knn_negative_1"] = knn_pred_class_negative_1
        
        # ensemble_dataset["mlp_1"] = mlp_pred_class1
        # ensemble_dataset["mlp_0"] = mlp_pred_class0
        # ensemble_dataset["mlp_negative_1"] = mlp_pred_class_negative_1
        
        # ensemble_dataset["lr_1"] = lr_pred_class1
        # ensemble_dataset["lr_0"] = lr_pred_class0
        # ensemble_dataset["lr_negative_1"] = lr_pred_class_negative_1
        
        string, score, f1score_macro, recall, precision = evaluation_ensemble(X_test, y_test, ensemble_model, ensemble_name, table, connection)
        df_ml.loc[len(df_ml)] = [table, string, score, f1score_macro, recall, precision]
        
    return df_ml
    

In [12]:
#%%capture
df_ml = apply_ml_algorithms_pooling(connection)

['ADA_1min_complete_1day_preprocessed_1day_features', 'BAT_1min_complete_1day_preprocessed_1day_features', 'BCH_1min_complete_1day_preprocessed_1day_features', 'BNT_1min_complete_1day_preprocessed_1day_features', 'BSV_1min_complete_1day_preprocessed_1day_features', 'BTC_1min_complete_1day_preprocessed_1day_features', 'BTG_1min_complete_1day_preprocessed_1day_features', 'DASH_1min_complete_1day_preprocessed_1day_features', 'DOGE_1min_complete_1day_preprocessed_1day_features', 'EOS_1min_complete_1day_preprocessed_1day_features', 'ETC_1min_complete_1day_preprocessed_1day_features', 'ETH_1min_complete_1day_preprocessed_1day_features', 'FUN_1min_complete_1day_preprocessed_1day_features', 'ICX_1min_complete_1day_preprocessed_1day_features', 'KNC_1min_complete_1day_preprocessed_1day_features', 'LINK_1min_complete_1day_preprocessed_1day_features', 'LRC_1min_complete_1day_preprocessed_1day_features', 'LSK_1min_complete_1day_preprocessed_1day_features', 'LTC_1min_complete_1day_preprocessed_1day_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
df_ml

Unnamed: 0,table_name,model,f1-score weighted,f1-score macro,recall macro,precision macro
0,ADA_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.3877,0.247893,0.376531,0.302389
1,BAT_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.659106,0.368184,0.365869,0.384705
2,BCH_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.63277,0.316726,0.336429,0.355627
3,BNT_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.652184,0.368524,0.368258,0.397416
4,BSV_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.642905,0.362376,0.383796,0.362228
5,BTC_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.616685,0.371658,0.371334,0.372062
6,BTG_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.689821,0.351762,0.368865,0.342454
7,DASH_1min_complete_1day_preprocessed_1day_feat...,ensemble_pooling,0.672002,0.373118,0.386384,0.421681
8,DOGE_1min_complete_1day_preprocessed_1day_feat...,ensemble_pooling,0.66603,0.289406,0.333333,0.255708
9,EOS_1min_complete_1day_preprocessed_1day_features,ensemble_pooling,0.545704,0.367037,0.401066,0.430664


In [112]:
def alternative_argmax_evaluation(model, X_train, table, db_connection):
    
    class_probabilities = model.predict_proba(X_train)
    thresholds_long = []
    for i in np.arange(0, 1, 0.02):
        thresholds_long.append(i)
        
    i = 0
    for threshold in thresholds_long:
        class1 = class_probabilities[:, 0].copy()

        class1[class1 > threshold] = 1
        class1[class1 < threshold] = 0
        
        class1_str = [str(x) for x in class1.tolist()]
        
        df = pd.concat([X_train])
        df["buy_short_indicator"] = class1_str
        df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
        df.to_sql(f"no_{i}_threshold_ensemble_long_{table}", db_connection, if_exists="replace")
        
        i += 1
        
    
    thresholds_short = []
    for i in np.arange(0, 1, 0.02):
        thresholds_short.append(i)
    
    k = 0
    for threshold_short in thresholds_short:
        print(k)
        class3 = class_probabilities[:, 2].copy()

        class3[class3 < threshold_short] = 0
        class3[class3 > threshold_short] = -1
        class3_str = [str(x) for x in class3.tolist()]
        
        df = pd.concat([X_train])
        df["buy_short_indicator"] = class3_str
        df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
        df.to_sql(f"no_{k}_threshold_ensemble_short_{table}", db_connection, if_exists="replace")
        
        k += 1

In [113]:
def execute_alternative_argmax_evaluation(db_connection):

    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name and 'equity_curve' not in name and '_pooling' not in name and "ensemble" not in name]
    print(filtered_table_names)
    for table in filtered_table_names:
        df_temp = pd.read_sql_query(f"select * from {table}", connection)
        
        y = df_temp["buy_indicator"] + df_temp["short_indicator"]
        y = y.fillna(0)
        y = y.astype(str)
        
        X = df_temp.drop(["return", "buy_indicator", "short_indicator", "close_buy_indicator", "close_short_indicator", "time", "index", "level_0", "market_cap"], axis=1) 
        X_train = X.iloc[:-365]

        X_train[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_train[["open", "close", "high", "low", "volume"]])

        
        alternative_argmax_evaluation(rf_new, X_train, table, db_connection)

In [114]:
%%capture
execute_alternative_argmax_evaluation(connection)

In [96]:
def best_argmax_evaluation(model, X_test, y_test, table, db_connection):
    
    class_probabilities = model.predict_proba(X_test)
    
    class1 = class_probabilities[:, 0].copy()
    class3 = class_probabilities[:, 2].copy()
    
    class1[class1 >= 0.34] = 1
    class1[class1 < 0.34] = 0

    class3[class3 < 0.42] = 0
    class3[class3 >= 0.42] = -1

    
    y_pred = class3 + class1
    
    y_pred1 = y_pred.tolist()
    y_pred = [str(x) for x in y_pred1]
    
    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    
    df = pd.concat([X_test])
    df["buy_short_indicator"] = y_pred
    df['close_buy_short_indicator'] = df["buy_short_indicator"].shift(1).fillna(0.0)
    
    df.to_sql(f"{table}_ensemble_pooling_final", db_connection, if_exists="replace")
    
    return f1score, f1score_macro, recall, precision

In [97]:
def execute_alternative_argmax_evaluation(db_connection):

    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name and 'equity_curve' not in name and '_pooling' not in name and "ensemble" not in name]
    print(filtered_table_names)
    for table in filtered_table_names:
        df_temp = pd.read_sql_query(f"select * from {table}", connection)
        
        y = df_temp["buy_indicator"] + df_temp["short_indicator"]
        y = y.fillna(0)
        y = y.astype(str)
        
        X = df_temp.drop(["return", "buy_indicator", "short_indicator","close_buy_indicator", "close_short_indicator", "time", "index", "level_0", "market_cap"], axis=1)
        
        
        X_test = X.iloc[-365:]
        X_test[["open", "close", "high", "low", "volume"]] = scaler.fit_transform(X_test[["open", "close", "high", "low", "volume"]])
        y_test = y.iloc[-365:]

        
        f1score, f1score_macro, recall, precision = best_argmax_evaluation(rf_new, X_test, y_test, table, db_connection)

In [98]:
%%capture
execute_alternative_argmax_evaluation(connection)