In [59]:
from data_storage import create_connection
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from stockstats import StockDataFrame
import os
import ta
from pyti.chande_momentum_oscillator import chande_momentum_oscillator
from pyti.accumulation_distribution import accumulation_distribution
from pyti.average_true_range_percent import average_true_range_percent


In [60]:
connection = create_connection("../database/crypto_billionairs.db")

In [61]:
# function to load data into database
def load_data_into_database(path, db_connection, header_list):
    
    for file in os.listdir(path):
            
            if file[-3:] == 'txt':
                file_name = str(file).replace("-", "_")
                df = pd.read_csv(f'./{path}/{file}', names = header_list)
                
                df.to_sql(f'{file_name[:-4]}_complete_raw', con=db_connection, if_exists="replace", index=False)

In [62]:
headers = ["time", "open", "high", "low", "close", "volume"]
load_data_into_database("../database/", connection, headers)

In [63]:
def preprocessing_data_1min(db_connection):
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_complete_raw" in name and 'trades' not in name]
    
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
        
        df_temp["time"] = pd.to_datetime(df_temp['time'])
        df_temp = df_temp.set_index('time')
        
        df_temp = df_temp.resample('1T').mean()
        
        df_temp["volume"] = df_temp["volume"].fillna(0)
        df_temp["close"] = df_temp["close"].fillna(method="ffill")
        df_temp = df_temp.fillna(axis=1, method="backfill")
        #df_temp["time"] = df_temp.index
    
        df_temp.to_sql(f"{table[:-4]}_1min_preprocessed", db_connection, if_exists="replace")
        
        

In [64]:
#preprocessing_data_1min(connection)

In [65]:
def preprocessing_data_1day(db_connection):
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_complete_raw" in name and 'trades' not in name]
    print(filtered_table_names)
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
        
        df_temp["time"] = pd.to_datetime(df_temp['time'])
        df_temp = df_temp.set_index('time')
        
        df_temp = df_temp.resample('1D').mean()
        
        df_temp["volume"] = df_temp["volume"].fillna(0)
        df_temp["close"] = df_temp["close"].fillna(method="ffill")
        df_temp = df_temp.fillna(axis=1, method="backfill")
      
    
        df_temp.to_sql(f"{table[:-4]}_1day_preprocessed", db_connection, if_exists="replace")

In [66]:
preprocessing_data_1day(connection)

['ADA_1min_complete_raw', 'BCH_1min_complete_raw', 'BTC_1min_complete_raw', 'DOGE_1min_complete_raw', 'ETH_1min_complete_raw', 'LINK_1min_complete_raw', 'LTC_1min_complete_raw', 'TRX_1min_complete_raw']


In [67]:
# def momentum2(df):
#     df["return"] = df["open"] / df["close"] - 1
#     return df["return"]

def momentum(df, lag):
    return df.pct_change(periods=lag)

In [68]:
def create_target_variable(df_target):
    
    df_target['return'] = momentum(df_target["close"], 1)
        
    df_target["mean_return"] = df_target["return"].rolling(50).mean()
        
    df_target["std_deviation"] = df_target["return"].rolling(50).std()
        
    df_target["buy_indicator"] = 0
    df_target.loc[df_target["return"] > df_target["mean_return"] + 1 * df_target["std_deviation"], 'buy_indicator'] = 1
    df_target["buy_indicator"] = df_target["buy_indicator"].shift(-1)
    df_target["close_buy_indicator"] = df_target["buy_indicator"].shift(1)
        
        
    df_target["short_indicator"] = 0
    df_target.loc[df_target["return"] < df_target["mean_return"] - 1 * df_target["std_deviation"], 'short_indicator'] = -1
    df_target["short_indicator"] = df_target["short_indicator"].shift(-1)
    df_target["close_short_indicator"] = df_target["short_indicator"].shift(1).fillna(0)
    
    return df_target["buy_indicator"], df_target["short_indicator"], df_target["close_buy_indicator"], df_target["close_short_indicator"]
        

In [69]:

def create_features(db_connection):
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_preprocessed" in name and 'trades' not in name and "_features" not in name]
    for table in filtered_table_names:
        
        df_temp = pd.read_sql_query(f"select * from {table}", db_connection)
        
        df_ti = pd.DataFrame()
        df_ti["time"] = df_temp["time"]
        df_ti["open"] = df_temp["open"]
        df_ti["close"] = df_temp["close"]
        df_ti["high"] = df_temp["high"]
        df_ti["low"] = df_temp["low"]
        df_ti["volume"] = df_temp["volume"]
    
        stock = StockDataFrame.retype(df_ti)
        
        df_temp["sma5-20"] = stock.get("close_5_sma") - stock.get("close_20_sma")
        df_temp["sma8-15"] = stock.get("close_8_sma") - stock.get("close_15_sma")
        df_temp["sma20-50"] = stock.get("close_20_sma") - stock.get("close_50_sma")
        df_temp["ema5-20"] = stock.get("close_5_ema") - stock.get("close_20_ema")
        df_temp["sma8-15"] = stock.get("close_8_ema") - stock.get("close_15_ema")
        df_temp["sma20-50"] = stock.get("close_20_ema") - stock.get("close_50_ema")
        df_temp["macd"] = stock.get("macd")
        df_temp["ao14"] = ta.trend.AroonIndicator(df_temp["close"], window = 14, fillna=True).aroon_indicator() 
        df_temp["adx14"] = ta.trend.ADXIndicator(df_temp["high"], df_temp["low"], df_temp["close"], 14).adx()
        df_temp["wd14"] = stock.get("pdi_14") - stock.get("mdi_14")
        
        df_temp["ppo12-26"] = stock.get("ppo") #default is 14
        df_temp["rsi14"] = stock.get("rsi_14")
        df_temp["mfi14"] = stock.get("mfi_14")
        df_temp["tsi"] = ta.momentum.TSIIndicator(df_temp["close"], fillna=True).tsi()
        df_temp["so14"] = stock.get("kdjk_14")
        df_temp["cmo14"] = chande_momentum_oscillator(df_temp["close"], 14)
        df_temp["atrp14"] = average_true_range_percent(df_temp["close"], 14)
        
        df_temp["pvo12-26"] = ta.momentum.PercentageVolumeOscillator(df_temp["volume"], fillna=True).pvo()
        df_temp["adl"] = accumulation_distribution(df_temp["close"], df_temp["high"], df_temp["low"], df_temp["volume"])
        df_temp["obv"] = ta.volume.OnBalanceVolumeIndicator(df_temp["close"], df_temp["volume"]).on_balance_volume()
        df_temp["fi13"] = ta.volume.ForceIndexIndicator(df_temp["close"], df_temp["volume"], 13, fillna= True).force_index()
        df_temp["fi50"] = ta.volume.ForceIndexIndicator(df_temp["close"], df_temp["volume"], 50, fillna=True).force_index()
        
        df_temp = df_temp.fillna(method = "backfill")
        df_temp = df_temp.fillna(method ="ffill")
        
        df_temp["buy_indicator"], df_temp["short_indicator"], df_temp["close_buy_indicator"], df_temp["close_short_indicator"] = create_target_variable(df_temp)
        df_temp = df_temp.drop(['mean_return', 'std_deviation'], axis=1)
        df_temp.to_sql(f"{table}_1day_features", db_connection, if_exists="replace")
    

In [70]:
create_features(connection)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[idx] - low_data[idx]) -
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  (((close_data[

In [71]:
def create_pooling_dataset(db_connection):
    
    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", db_connection)
    
    table_names_list = table_names['name'].tolist()

    filtered_table_names = [name for name in table_names_list if "_1day_features" in name and 'trades' not in name and 'equity_curve' not in name]
    print(filtered_table_names)
    union_all_sql_list = []
    
    for table in filtered_table_names[:-1]:
        
        union_new_table = f"SELECT * FROM {table } where time < '2021-04-01 00:00:00' UNION ALL"
        union_all_sql_list.append(union_new_table)
    
    union_all_sql_list.append(f"SELECT * from {filtered_table_names[-1]} where time < '2021-04-01 00:00:00'")

    union_all_sql = ' '.join(union_all_sql_list)
    
    df = pd.read_sql_query(union_all_sql, db_connection)
    print(len(df)) #11973 #7720

    df.to_sql("cryptocurrency_pooling_dataset", db_connection, if_exists="replace")

In [72]:
create_pooling_dataset(connection)

['ADA_1min_complete_1day_preprocessed_1day_features', 'BCH_1min_complete_1day_preprocessed_1day_features', 'BTC_1min_complete_1day_preprocessed_1day_features', 'BTC_1min_complete_1day_preprocessed_1day_features_knn_pooling', 'BTC_1min_complete_1day_preprocessed_1day_features_logistic_regression_pooling', 'BTC_1min_complete_1day_preprocessed_1day_features_mlp_classifier_pooling', 'BTC_1min_complete_1day_preprocessed_1day_features_random_forest_pooling', 'DOGE_1min_complete_1day_preprocessed_1day_features', 'DOGE_1min_complete_1day_preprocessed_1day_features_knn_pooling', 'DOGE_1min_complete_1day_preprocessed_1day_features_logistic_regression_pooling', 'DOGE_1min_complete_1day_preprocessed_1day_features_mlp_classifier_pooling', 'DOGE_1min_complete_1day_preprocessed_1day_features_random_forest_pooling', 'ETH_1min_complete_1day_preprocessed_1day_features', 'ETH_1min_complete_1day_preprocessed_1day_features_knn_pooling', 'ETH_1min_complete_1day_preprocessed_1day_features_logistic_regression

DatabaseError: Execution failed on sql 'SELECT * FROM ADA_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM BCH_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM BTC_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM BTC_1min_complete_1day_preprocessed_1day_features_knn_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM BTC_1min_complete_1day_preprocessed_1day_features_logistic_regression_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM BTC_1min_complete_1day_preprocessed_1day_features_mlp_classifier_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM BTC_1min_complete_1day_preprocessed_1day_features_random_forest_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM DOGE_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM DOGE_1min_complete_1day_preprocessed_1day_features_knn_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM DOGE_1min_complete_1day_preprocessed_1day_features_logistic_regression_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM DOGE_1min_complete_1day_preprocessed_1day_features_mlp_classifier_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM DOGE_1min_complete_1day_preprocessed_1day_features_random_forest_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM ETH_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM ETH_1min_complete_1day_preprocessed_1day_features_knn_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM ETH_1min_complete_1day_preprocessed_1day_features_logistic_regression_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM ETH_1min_complete_1day_preprocessed_1day_features_mlp_classifier_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM ETH_1min_complete_1day_preprocessed_1day_features_random_forest_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM LINK_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM LTC_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM LTC_1min_complete_1day_preprocessed_1day_features_knn_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM LTC_1min_complete_1day_preprocessed_1day_features_logistic_regression_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM LTC_1min_complete_1day_preprocessed_1day_features_mlp_classifier_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * FROM LTC_1min_complete_1day_preprocessed_1day_features_random_forest_pooling where time < '2021-04-01 00:00:00' UNION ALL SELECT * from TRX_1min_complete_1day_preprocessed_1day_features where time < '2021-04-01 00:00:00'': no such column: time