In [23]:
from data_storage import create_connection
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
import pickle
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


In [24]:
# Shared database handle used by the cells below to read feature tables and to
# write prediction tables. Presumably a SQLite connection (later cells query
# sqlite_master) -- confirm against data_storage.create_connection.
connection = create_connection("../database/crypto_billionairs.db")
# Module-level scaler shared by the functions below. Each of them calls
# fit_transform on it, so it is re-fitted on every use.
scaler = MinMaxScaler()

In [25]:
def random_forest(X_train, y_train):
    """Fit the random-forest classifier on the supplied training data.

    The fitted estimator is also stored in the module-level name ``clf`` so
    that other cells can reach it without going through the return value.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        tuple: ``("random_forest", fitted_estimator)``.
    """
    global clf

    forest_settings = {
        "criterion": "entropy",
        "min_samples_split": 0.01,
        "min_samples_leaf": 0.005,
        "max_depth": 10,
        "class_weight": "balanced_subsample",
    }
    clf = RandomForestClassifier(**forest_settings)
    print("training random forest!")
    clf.fit(X_train, y_train)

    return "random_forest", clf

In [26]:
def knn(X_train, y_train):
    """Fit a 5-nearest-neighbours classifier on the supplied training data.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        tuple: ``("knn", fitted_estimator)``.
    """
    neighbours_model = KNeighborsClassifier(
        n_neighbors=5,
        weights="uniform",
        algorithm="ball_tree",
    )
    print("training knn!")
    neighbours_model.fit(X_train, y_train)

    return "knn", neighbours_model

In [27]:
def support_vector_machine(X_train, y_train):
    """Fit a degree-4 polynomial-kernel support vector classifier.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        tuple: ``("support_vector_classifier", fitted_estimator)``.
    """
    svc_settings = {"kernel": "poly", "degree": 4, "C": 1}
    classifier = SVC(**svc_settings)
    print("training svc!")
    classifier.fit(X_train, y_train)

    return "support_vector_classifier", classifier

In [28]:
def mlp(X_train, y_train):
    """Fit a two-hidden-layer (10, 10) tanh multilayer perceptron.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        tuple: ``("mlp_classifier", fitted_estimator)``.
    """
    # NOTE(review): learning_rate / learning_rate_init look like SGD/Adam-style
    # settings; check whether they have any effect with solver="lbfgs".
    network_settings = {
        "hidden_layer_sizes": (10, 10),
        "activation": "tanh",
        "solver": "lbfgs",
        "learning_rate": "constant",
        "learning_rate_init": 2e-5,
        "tol": 1e-5,
    }
    network = MLPClassifier(**network_settings)
    print("training mlp!")
    network.fit(X_train, y_train)

    return "mlp_classifier", network

In [29]:
def logistic_regression(X_train, y_train):
    """Fit an L1-penalised logistic regression (liblinear solver).

    The fitted estimator is also stored in the module-level name ``lg``.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        tuple: ``("logistic_regression", fitted_estimator)``.
    """
    global lg

    regression_settings = {"solver": "liblinear", "penalty": "l1", "C": 1}
    lg = LogisticRegression(**regression_settings)
    print("training lr!")
    lg.fit(X_train, y_train)

    return "logistic_regression", lg
    

In [30]:
def model_ensemble(X_train, y_train):
    """Fit a stacking ensemble of four base learners with a random-forest meta-learner.

    The base learners (logistic regression, k-NN, random forest, MLP) are
    built with the same hyper-parameters as the stand-alone training functions
    in this notebook. The fitted stack is also stored in the module-level name
    ``rf_new`` so that later cells can use it directly.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.

    Returns:
        tuple: ``("ensemble", fitted_stacking_classifier)``.
    """
    global rf_new

    print("training ensemble!")

    base_learners = [
        ("lg", LogisticRegression(solver="liblinear", penalty="l1", C=1)),
        ("knn", KNeighborsClassifier(weights="uniform", n_neighbors=5, algorithm="ball_tree")),
        (
            "rf",
            RandomForestClassifier(
                criterion="entropy",
                min_samples_split=0.01,
                min_samples_leaf=0.005,
                max_depth=10,
                class_weight="balanced_subsample",
            ),
        ),
        (
            "mlp",
            MLPClassifier(
                hidden_layer_sizes=(10, 10),
                activation="tanh",
                solver="lbfgs",
                learning_rate="constant",
                learning_rate_init=2e-5,
                tol=1e-5,
            ),
        ),
    ]
    meta_learner = RandomForestClassifier(criterion="entropy")

    rf_new = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)
    rf_new.fit(X_train, y_train)

    print("finished training ensemble!")
    return "ensemble", rf_new
    
    

In [31]:
def evaluation(X_test, y_test, model, model_name, table, db_connection):
    """Score ``model`` on one test set and store its predictions as a table.

    The price/volume columns of a copy of ``X_test`` are rescaled with the
    module-level ``scaler`` before prediction. The unscaled features plus the
    predicted signal are written to ``{table}_{model_name}_pooling``, replacing
    any existing table of that name.

    Args:
        X_test: Feature frame containing at least the columns
            ``open``, ``close``, ``high``, ``low`` and ``volume``.
        y_test: True labels aligned with ``X_test``.
        model: Fitted estimator exposing ``predict``.
        model_name: Short model label used in the table name and the result.
        table: Name of the source feature table.
        db_connection: Connection the prediction table is written to.

    Returns:
        tuple: ``(f"{model_name}_pooling", weighted F1, macro F1,
        macro recall, macro precision)``.
    """
    price_cols = ["open", "close", "high", "low", "volume"]

    # NOTE(review): the shared scaler is re-fitted on the test rows here rather
    # than reusing the fit from training -- confirm this is intended.
    scaled_features = X_test.copy()
    scaled_features[price_cols] = scaler.fit_transform(scaled_features[price_cols])
    predictions = model.predict(scaled_features)

    weighted_f1 = f1_score(y_test, predictions, average="weighted")
    macro_f1 = f1_score(y_test, predictions, average="macro")
    macro_recall = recall_score(y_test, predictions, average="macro")
    macro_precision = precision_score(y_test, predictions, average="macro")

    result_label = f"{model_name}_pooling"

    # The stored frame keeps the original (unscaled) feature values.
    signals = pd.concat([X_test])
    signals["buy_short_indicator"] = predictions.tolist()
    # The close signal is the previous row's signal; the first row has none.
    signals["close_buy_short_indicator"] = signals["buy_short_indicator"].shift(1).fillna(0.0)
    signals.to_sql(f"{table}_{result_label}", db_connection, if_exists="replace")

    return result_label, weighted_f1, macro_f1, macro_recall, macro_precision

In [35]:
def apply_ml_algorithms_pooling(db_connection):
    """Train every model on the pooled dataset and score each on every feature table.

    Steps:
      1. Read ``cryptocurrency_pooling_dataset``, shuffle it, build string
         labels from ``buy_indicator + short_indicator`` and rescale the
         price/volume columns with the module-level ``scaler``.
      2. Balance the classes with random over-sampling.
      3. Fit the six models (random forest, k-NN, SVC, MLP, logistic
         regression, stacking ensemble).
      4. For each ``*_1day_features`` table (excluding derived tables), take the
         last 365 rows as the test set and call ``evaluation`` for every model,
         which also writes a predictions table.

    Args:
        db_connection: Connection used for every read and write in this
            function, including the prediction tables written by
            ``evaluation``.

    Returns:
        pandas.DataFrame: One row per (table, model) with the columns
        ``table_name``, ``model``, ``f1-score weighted``, ``f1-score macro``,
        ``recall macro`` and ``precision macro``.
    """
    dropped_cols = [
        "return", "buy_indicator", "short_indicator", "close_buy_indicator",
        "close_short_indicator", "time", "index", "level_0", "market_cap",
    ]
    price_cols = ["open", "close", "high", "low", "volume"]
    result_columns = [
        "table_name", "model", "f1-score weighted", "f1-score macro",
        "recall macro", "precision macro",
    ]

    pooled = pd.read_sql_query("select * from cryptocurrency_pooling_dataset", db_connection)
    pooled = shuffle(pooled, random_state=42069)

    y_train = (pooled["buy_indicator"] + pooled["short_indicator"]).fillna(0).astype(str)
    X_train = pooled.drop(dropped_cols, axis=1)
    X_train[price_cols] = scaler.fit_transform(X_train[price_cols])

    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    excluded_markers = ("trades", "equity_curve", "_pooling", "_threshold_ensemble")
    filtered_table_names = [
        name
        for name in table_names["name"].tolist()
        if "_1day_features" in name and not any(marker in name for marker in excluded_markers)
    ]
    print(filtered_table_names)

    # Training order is kept as-is: none of the estimators is seeded, so the
    # order in which they are fitted may influence the results.
    rf_name, rf_model = random_forest(X_resampled, y_resampled)
    knn_name, knn_model = knn(X_resampled, y_resampled)
    svc_name, svc_model = support_vector_machine(X_resampled, y_resampled)
    mlp_name, mlp_model = mlp(X_resampled, y_resampled)
    lr_name, lr_model = logistic_regression(X_resampled, y_resampled)
    ensemble_name, ensemble_model = model_ensemble(X_resampled, y_resampled)

    # Evaluation order determines the row order of the returned frame.
    models_in_report_order = [
        (rf_name, rf_model),
        (knn_name, knn_model),
        (mlp_name, mlp_model),
        (svc_name, svc_model),
        (lr_name, lr_model),
        (ensemble_name, ensemble_model),
    ]

    rows = []
    for table in filtered_table_names:
        table_frame = pd.read_sql_query(f"select * from {table}", db_connection)
        y = (table_frame["buy_indicator"] + table_frame["short_indicator"]).fillna(0).astype(str)
        X = table_frame.drop(dropped_cols, axis=1)

        # The most recent 365 rows of each table form its test set.
        X_test = X.iloc[-365:]
        y_test = y.iloc[-365:]

        for model_name, model in models_in_report_order:
            # Write through the connection we were given, not the notebook-level
            # global, so the function works with any connection.
            label, score, f1score_macro, recall, precision = evaluation(
                X_test, y_test, model, model_name, table, db_connection
            )
            rows.append([table, label, score, f1score_macro, recall, precision])

    return pd.DataFrame(rows, columns=result_columns)
    

In [37]:
#%%capture
# Train all models on the pooled dataset and collect one metrics row per
# (feature table, model) pair.
df_ml = apply_ml_algorithms_pooling(connection)

['ADA_1min_complete_1day_preprocessed_1day_features', 'BCH_1min_complete_1day_preprocessed_1day_features', 'BNT_1min_complete_1day_preprocessed_1day_features', 'BSV_1min_complete_1day_preprocessed_1day_features', 'BTC_1min_complete_1day_preprocessed_1day_features', 'BTG_1min_complete_1day_preprocessed_1day_features', 'DASH_1min_complete_1day_preprocessed_1day_features', 'DOGE_1min_complete_1day_preprocessed_1day_features', 'EOS_1min_complete_1day_preprocessed_1day_features', 'ETC_1min_complete_1day_preprocessed_1day_features', 'ETH_1min_complete_1day_preprocessed_1day_features', 'FUN_1min_complete_1day_preprocessed_1day_features', 'ICX_1min_complete_1day_preprocessed_1day_features', 'KNC_1min_complete_1day_preprocessed_1day_features', 'LINK_1min_complete_1day_preprocessed_1day_features', 'LRC_1min_complete_1day_preprocessed_1day_features', 'LTC_1min_complete_1day_preprocessed_1day_features', 'MKR_1min_complete_1day_preprocessed_1day_features', 'NEO_1min_complete_1day_preprocessed_1day_



training ensemble!


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


finished training ensemble!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [68]:
# Show the per-table, per-model metrics collected above.
df_ml

Unnamed: 0,table_name,model,f1-score weighted,f1-score macro,recall macro,precision macro
0,ADA_1min_complete_1day_preprocessed_1day_features,random_forest_pooling,0.599927,0.410078,0.433363,0.414063
1,ADA_1min_complete_1day_preprocessed_1day_features,knn_pooling,0.086336,0.103538,0.347118,0.288254
2,ADA_1min_complete_1day_preprocessed_1day_features,mlp_classifier_pooling,0.263205,0.185256,0.357524,0.298873
3,ADA_1min_complete_1day_preprocessed_1day_features,support_vector_classifier_pooling,0.237486,0.208149,0.310242,0.342288
4,ADA_1min_complete_1day_preprocessed_1day_features,logistic_regression_pooling,0.600359,0.330929,0.342752,0.339018
...,...,...,...,...,...,...
145,TRX_1min_complete_1day_preprocessed_1day_features,knn_pooling,0.062413,0.080278,0.302106,0.191848
146,TRX_1min_complete_1day_preprocessed_1day_features,mlp_classifier_pooling,0.675747,0.332433,0.351604,0.322083
147,TRX_1min_complete_1day_preprocessed_1day_features,support_vector_classifier_pooling,0.101578,0.097678,0.311529,0.240526
148,TRX_1min_complete_1day_preprocessed_1day_features,logistic_regression_pooling,0.671547,0.352146,0.352553,0.372066


In [67]:
# Average every metric per model across all evaluated tables. The string
# column table_name is dropped explicitly: pandas >= 2.0 raises instead of
# silently discarding columns that cannot be averaged.
df_ml.drop(columns="table_name").groupby("model").mean()

Unnamed: 0_level_0,f1-score weighted,f1-score macro,recall macro,precision macro
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ensemble_pooling,0.536287,0.31722,0.36483,0.380304
knn_pooling,0.396891,0.264533,0.349259,0.338722
logistic_regression_pooling,0.597038,0.354307,0.373753,0.375307
mlp_classifier_pooling,0.287362,0.182996,0.337384,0.219905
random_forest_pooling,0.563813,0.371933,0.403736,0.382023
support_vector_classifier_pooling,0.065469,0.089324,0.331538,0.070509


In [42]:
def _threshold_labels(probabilities, threshold, active_value):
    """Return one string label per row: ``active_value`` where the probability is strictly above ``threshold``, else ``0.0``."""
    return [str(value) for value in np.where(probabilities > threshold, active_value, 0.0).tolist()]


def alternative_argmax_evaluation(model, X_train, table, db_connection):
    """Sweep decision thresholds over the model's class probabilities.

    For each threshold in ``0.00, 0.02, ..., 0.98`` two tables are written
    (replacing existing ones):

    * ``no_{i}_threshold_ensemble_long_{table[:4]}`` -- signal ``"1.0"`` where
      the probability in column 0 is strictly above the threshold, else ``"0.0"``.
    * ``no_{i}_threshold_ensemble_short_{table[:4]}`` -- signal ``"-1.0"`` where
      the probability in column 2 is strictly above the threshold, else ``"0.0"``.

    Each table holds the unscaled features, ``buy_short_indicator`` and
    ``close_buy_short_indicator`` (the previous row's signal).

    A probability exactly equal to the threshold yields ``"0.0"``. Previously
    such a value matched neither comparison and its raw probability was stored
    as the label.

    Args:
        model: Fitted estimator exposing ``predict_proba`` with at least three
            probability columns.
        X_train: Feature frame containing at least the columns
            ``open``, ``close``, ``high``, ``low`` and ``volume``.
        table: Source table name; only its first four characters are used in
            the output table names.
        db_connection: Connection the threshold tables are written to.
    """
    price_cols = ["open", "close", "high", "low", "volume"]

    # NOTE(review): the shared scaler is re-fitted on these rows rather than
    # reusing the fit from training -- confirm this is intended.
    X_train_normalized = X_train.copy()
    X_train_normalized[price_cols] = scaler.fit_transform(X_train_normalized[price_cols])

    class_probabilities = model.predict_proba(X_train_normalized)
    thresholds = np.arange(0, 1, 0.02)

    # NOTE(review): column 0 is used as the long class and column 2 as the
    # short class. scikit-learn orders predict_proba columns like
    # model.classes_ (sorted labels); with string labels such as
    # "-1.0" / "0.0" / "1.0" column 0 would be the short class -- TODO confirm.
    long_probabilities = class_probabilities[:, 0]
    short_probabilities = class_probabilities[:, 2]

    for position, threshold in enumerate(thresholds):
        signals = X_train.copy()
        signals["buy_short_indicator"] = _threshold_labels(long_probabilities, threshold, 1.0)
        # Fill with the string label so the column stays uniformly typed; in a
        # SQLite TEXT column this stores the same "0.0" as the float did.
        signals["close_buy_short_indicator"] = signals["buy_short_indicator"].shift(1).fillna("0.0")
        signals.to_sql(f"no_{position}_threshold_ensemble_long_{table[:4]}", db_connection, if_exists="replace")

    for position, threshold in enumerate(thresholds):
        print(position)
        signals = X_train.copy()
        signals["buy_short_indicator"] = _threshold_labels(short_probabilities, threshold, -1.0)
        signals["close_buy_short_indicator"] = signals["buy_short_indicator"].shift(1).fillna("0.0")
        signals.to_sql(f"no_{position}_threshold_ensemble_short_{table[:4]}", db_connection, if_exists="replace")

In [43]:
def execute_alternative_argmax_evaluation(db_connection, model=None):
    """Run the threshold sweep on the training part of every feature table.

    Every table whose name contains ``_1day_features`` (and none of ``trades``,
    ``equity_curve``, ``_pooling``, ``ensemble``) is loaded, the label and
    bookkeeping columns are dropped, and all rows except the last 365 are
    passed to ``alternative_argmax_evaluation``.

    NOTE: a later cell of this notebook defines another function with this
    same name, which replaces this one once that cell has been executed.

    Args:
        db_connection: Connection used both to read the feature tables and to
            write the threshold tables.
        model: Fitted estimator to sweep. Defaults to the module-level ``clf``
            (set by ``random_forest``).
    """
    dropped_cols = [
        "return", "buy_indicator", "short_indicator", "close_buy_indicator",
        "close_short_indicator", "time", "index", "level_0", "market_cap",
    ]

    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    excluded_markers = ("trades", "equity_curve", "_pooling", "ensemble")
    filtered_table_names = [
        name
        for name in table_names["name"].tolist()
        if "_1day_features" in name and not any(marker in name for marker in excluded_markers)
    ]
    print(filtered_table_names)

    for table in filtered_table_names:
        # Read through the connection we were given, not the notebook-level global.
        table_frame = pd.read_sql_query(f"select * from {table}", db_connection)

        X = table_frame.drop(dropped_cols, axis=1)
        # Everything before the final 365 rows is the training part.
        X_train = X.iloc[:-365]

        # Resolve the default lazily so the global is only needed when used.
        sweep_model = clf if model is None else model
        alternative_argmax_evaluation(sweep_model, X_train, table, db_connection)

In [45]:
%%capture
# Run the threshold sweep for every feature table; output is captured because
# the sweep prints a counter for each threshold.
execute_alternative_argmax_evaluation(connection)

In [64]:
def best_argmax_evaluation(model, X_test, y_test, table, db_connection, long_threshold=0.18, short_threshold=0.02):
    """Turn class probabilities into signals with fixed thresholds and score them.

    A row gets ``+1`` when the probability in column 0 is at least
    ``long_threshold`` and ``-1`` when the probability in column 2 is at least
    ``short_threshold``; the two are summed, so a row triggering both ends up
    at ``0``. The signals are stored as strings (``"1.0"``, ``"0.0"``,
    ``"-1.0"``) together with the unscaled features in
    ``ensemble_pooling_final_{table[:5]}``, replacing any existing table.

    Args:
        model: Fitted estimator exposing ``predict_proba`` with at least three
            probability columns.
        X_test: Feature frame containing at least the columns
            ``open``, ``close``, ``high``, ``low`` and ``volume``.
        y_test: True labels aligned with ``X_test``.
        table: Source table name; only its first five characters are used in
            the output table name.
        db_connection: Connection the result table is written to.
        long_threshold: Minimum column-0 probability for a ``+1`` signal.
        short_threshold: Minimum column-2 probability for a ``-1`` signal.

    Returns:
        tuple: ``(weighted F1, macro F1, macro recall, macro precision)``.
    """
    price_cols = ["open", "close", "high", "low", "volume"]

    # NOTE(review): the shared scaler is re-fitted on the test rows rather than
    # reusing the fit from training -- confirm this is intended.
    X_test_normalized = X_test.copy()
    X_test_normalized[price_cols] = scaler.fit_transform(X_test_normalized[price_cols])

    class_probabilities = model.predict_proba(X_test_normalized)

    # NOTE(review): column 0 is used as the long class and column 2 as the
    # short class. scikit-learn orders predict_proba columns like
    # model.classes_ (sorted labels); with string labels such as
    # "-1.0" / "0.0" / "1.0" column 0 would be the short class -- TODO confirm.
    long_signal = np.where(class_probabilities[:, 0] >= long_threshold, 1.0, 0.0)
    short_signal = np.where(class_probabilities[:, 2] >= short_threshold, -1.0, 0.0)

    y_pred = [str(value) for value in (short_signal + long_signal).tolist()]
    print(y_pred)

    f1score = f1_score(y_test, y_pred, average="weighted")
    f1score_macro = f1_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")

    signals = X_test.copy()
    signals["buy_short_indicator"] = y_pred
    # Fill with the string label so the column stays uniformly typed; in a
    # SQLite TEXT column this stores the same "0.0" as the float did.
    signals["close_buy_short_indicator"] = signals["buy_short_indicator"].shift(1).fillna("0.0")

    signals.to_sql(f"ensemble_pooling_final_{table[:5]}", db_connection, if_exists="replace")

    return f1score, f1score_macro, recall, precision

In [65]:
def execute_alternative_argmax_evaluation(db_connection, model=None):
    """Apply the fixed-threshold evaluation to the test part of every feature table.

    Every table whose name contains ``_1day_features`` (and none of ``trades``,
    ``equity_curve``, ``_pooling``, ``ensemble``) is loaded; its last 365 rows
    are passed to ``best_argmax_evaluation``, which writes the resulting signal
    table. The metrics returned by ``best_argmax_evaluation`` are not collected.

    NOTE: an earlier cell of this notebook defines a different function under
    this same name; executing this cell replaces it.

    Args:
        db_connection: Connection used both to read the feature tables and to
            write the result tables.
        model: Fitted estimator to evaluate. Defaults to the module-level
            ``rf_new`` (set by ``model_ensemble``).
    """
    dropped_cols = [
        "return", "buy_indicator", "short_indicator", "close_buy_indicator",
        "close_short_indicator", "time", "index", "level_0", "market_cap",
    ]

    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    excluded_markers = ("trades", "equity_curve", "_pooling", "ensemble")
    filtered_table_names = [
        name
        for name in table_names["name"].tolist()
        if "_1day_features" in name and not any(marker in name for marker in excluded_markers)
    ]
    print(filtered_table_names)

    for table in filtered_table_names:
        # Read through the connection we were given, not the notebook-level global.
        table_frame = pd.read_sql_query(f"select * from {table}", db_connection)

        y = (table_frame["buy_indicator"] + table_frame["short_indicator"]).fillna(0).astype(str)
        X = table_frame.drop(dropped_cols, axis=1)

        # The most recent 365 rows of each table form its test set.
        X_test = X.iloc[-365:]
        y_test = y.iloc[-365:]

        # Resolve the default lazily so the global is only needed when used.
        evaluated_model = rf_new if model is None else model
        best_argmax_evaluation(evaluated_model, X_test, y_test, table, db_connection)

In [66]:
%%capture
# Run the fixed-threshold evaluation for every feature table; output is
# captured because the evaluation prints the full prediction list per table.
execute_alternative_argmax_evaluation(connection)