In [26]:
from data_storage import create_connection
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [27]:
connection = create_connection("../database/crypto_billionairs.db")

In [28]:
def apply_random_forest(X_train, X_test, y_train, y_test):
    
    clf = RandomForestClassifier(criterion="entropy", min_samples_split= 0.01, min_samples_leaf= 0.005, max_depth=10, class_weight="balanced_subsample")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    
    return "random forest", f1score

In [29]:
def apply_knn(X_train, X_test, y_train, y_test):
    
    neigh = KNeighborsClassifier(weights="uniform", n_neighbors=5, algorithm="ball_tree")
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    
    return "k-nearest neighbour", f1score

In [30]:
def apply_support_vector_machine(X_train, X_test, y_train, y_test):
    
    svc = SVC(kernel="poly", degree=4, C=1)
    svc.fit(X_train, y_train)
    
    y_pred = svc.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    
    return "support vector classifier", f1score

In [31]:
def apply_mlp(X_train, X_test, y_train, y_test):
    
    mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation="tanh", solver="lbfgs", learning_rate="constant", learning_rate_init=2e-5, tol=1e-5)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    
    return "mlp classifier", f1score

In [32]:
def apply_logistic_regression(X_train, X_test, y_train, y_test):
    
    lg = LogisticRegression(solver="liblinear", penalty="l1", C=1)
    lg.fit(X_train, y_train)
    y_pred = lg.predict(X_test)
    f1score = f1_score(y_test, y_pred, average="weighted")
    
    return "logistic regression", f1score
    

In [33]:
def apply_ml_algorithms(db_connection):
    
    df_ml = pd.DataFrame(columns = range(3))
    df_ml.columns = ["table_name", "model", "f1-score"]
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name]
    
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
        
        y = df_temp["buy_indicator"] + df_temp["short_indicator"]
        y = y.fillna(0)
        y = y.astype(str)
        
        X = df_temp.drop(["return", "buy_indicator", "close_buy_indicator", "short_indicator", "close_short_indicator", "time", "index"], axis=1)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42069)
        
        string, score = apply_random_forest(X_train, X_test, y_train, y_test)
        df_ml.loc[len(df_ml)] = [table, string, score]
        
        string, score = apply_knn(X_train, X_test, y_train, y_test)
        df_ml.loc[len(df_ml)] = [table, string, score]
        
        string, score = apply_support_vector_machine(X_train, X_test, y_train, y_test)
        df_ml.loc[len(df_ml)] = [table, string, score]
        
        string, score = apply_mlp(X_train, X_test, y_train, y_test)
        df_ml.loc[len(df_ml)] = [table, string, score]
        
        string, score = apply_logistic_regression(X_train, X_test, y_train, y_test)
        df_ml.loc[len(df_ml)] = [table, string, score]
        
    return df_ml
        

In [34]:
df_ml = apply_ml_algorithms(connection)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_r

In [35]:
df_ml.sort_values(by=["f1-score"])

Unnamed: 0,table_name,model,f1-score
10,ETH_1min_complete_1day_preproce_1day_features,random forest,0.597373
12,ETH_1min_complete_1day_preproce_1day_features,support vector classifier,0.614897
13,ETH_1min_complete_1day_preproce_1day_features,mlp classifier,0.614897
0,BTC_1min_complete_1day_preproce_1day_features,random forest,0.614938
11,ETH_1min_complete_1day_preproce_1day_features,k-nearest neighbour,0.615166
14,ETH_1min_complete_1day_preproce_1day_features,logistic regression,0.6288
2,BTC_1min_complete_1day_preproce_1day_features,support vector classifier,0.637745
3,BTC_1min_complete_1day_preproce_1day_features,mlp classifier,0.637745
4,BTC_1min_complete_1day_preproce_1day_features,logistic regression,0.643391
15,LTC_1min_complete_1day_preproce_1day_features,random forest,0.647962
