In [None]:
import pandas as pd

import pandas_ta as ta
import numpy as np
import os
import pickle

from utils import get_strat_df, initial_columns

# Lift pandas' column-count limit so wide frames are displayed with every column
pd.options.display.max_columns = None

In [None]:
# Load the pickled mapping whose keys are parent strategies and whose values are
# the collections of column names attributed to each of them.
with open("dict_added_columns.pickle", "rb") as pickle_file:
    dict_added_columns = pickle.load(pickle_file)

# Invert it into {column name -> parent strategy}. If a column is listed under
# several parents, the parent encountered last wins.
dict_col_parent_strat = {
    child_col: parent
    for parent, children in dict_added_columns.items()
    for child_col in children
}

In [None]:
# Aggregation window in minutes (6 hours). This single value drives both the data
# that is loaded here and the name of the inertia file written at the end of the
# notebook, so the two can never get out of sync.
agg_time = 60 * 6

# NOTE(review): presumably get_strat_df expects the window in minutes (it was
# previously called with the literal 360 == 60 * 6) -- confirm against utils.
df_strats = get_strat_df(agg_time)

In [None]:
# Remove the "Timestamp" column in place when it exists; errors="ignore" makes this
# a no-op when the column is absent.
df_strats.drop(columns=["Timestamp"], errors="ignore", inplace=True)

In [None]:
# Make sure every added (non-initial) column has a parent strategy identified.
# The cells below look columns up with dict_col_parent_strat[col], so a column
# without an entry would otherwise only surface there as a KeyError.
list_no_parent = [
    col
    for col in df_strats.columns
    if col not in initial_columns and col not in dict_col_parent_strat
]

for col in list_no_parent:
    print(f"Column {col} has no parent identified")

assert len(list_no_parent) == 0, f"Columns without a parent strategy: {list_no_parent}"

In [None]:
# Count how many child columns each parent strategy owns. The cells below subtract
# one every time a child column is dropped; a strategy whose counter reaches zero
# has lost all of its children and is treated as problematic.
dict_strat_strikes = {}

for parent in dict_col_parent_strat.values():
    dict_strat_strikes[parent] = dict_strat_strikes.get(parent, 0) + 1

In [None]:
# Drop every column that has at least one NaN within the last 1000 rows. Some of
# these strategies are problematic, probably because they use future data.

tail_has_nan = df_strats.tail(1000).isna().any()
cols_to_delete = tail_has_nan[tail_has_nan].index

for col in cols_to_delete:
    # Index label of the last NaN anywhere in the column (reported below)
    nan_mask = df_strats[col].isna()
    last_nan_idx = nan_mask[nan_mask].index[-1]

    # One child fewer for the owning strategy, then remove the column itself
    dict_strat_strikes[dict_col_parent_strat[col]] -= 1
    del df_strats[col]
    print(f"Column {col} | Last NaN index: {last_nan_idx}")

In [None]:
# Drop columns that (almost) never change.
n_rows = df_strats.shape[0]
quant_last_10_pct = int(n_rows / 10)  # row count of the last 10%; reused further down

# Pre-filter: only columns with fewer than 10 distinct row-to-row differences in
# the last 10% of the rows are examined in detail.
tail_diff_nunique = df_strats.tail(quant_last_10_pct).diff().nunique()
cols_check_no_var = tail_diff_nunique[tail_diff_nunique < 10]

dict_variation = {}
for col in cols_check_no_var.index:
    # A row counts as "no variation" when it equals the previous row or is NaN
    flat_or_missing = df_strats[col].diff().eq(0) | df_strats[col].isna()
    variation = 1 - flat_or_missing.sum() / n_rows
    dict_variation[col] = variation

    # Fewer than 1% of the rows carry a change: discard the column
    if variation < 0.01:
        dict_strat_strikes[dict_col_parent_strat[col]] -= 1
        del df_strats[col]
        print(f"Column {col} | Variation: {variation}")


In [None]:
# Disabled code, kept as a bare string so it is never executed.
# It would decrement a parent's strike counter for every child column that is no
# longer in df_strats. The two drop cells above already decrement the counter when
# they delete a column, so running this as well would count each removal twice.
"""for col_child, parent_strat in dict_col_parent_strat.items():
    if col_child not in df_strats.columns:
        dict_strat_strikes[parent_strat] -= 1
        print(f"Column {col_child} has been removed")
"""

In [None]:
# Columns that survived the filters above and are not part of the initial data
list_cols_predict = [
    name for name in df_strats.columns if name not in initial_columns
]

# Strategies that still have at least one surviving child column
list_strats = [
    strat_name
    for strat_name, remaining_children in dict_strat_strikes.items()
    if remaining_children > 0
]

In [None]:
# Persist both lists with pickle; each one goes to a file named after the variable
# (no file extension).
for file_name, payload in (
    ("list_cols_predict", list_cols_predict),
    ("list_strats", list_strats),
):
    with open(file_name, "wb") as out_file:
        pickle.dump(payload, out_file)

In [None]:
# The following code is used to identify the strategies that are not adding value to the prediction.
# Some features have a very long inertia and are not useful for short-term predictions.
#
# Method: recompute the kept strategies on a frame whose first `quant_last_10_pct`
# rows were cut off and compare the result with the values already present in
# df_strats for the same rows. For each column, the "inertia" is the last row
# position (counted from the cut) at which the two values still differ by more than
# the 1% relative tolerance, or 0 when they never differ.

quant_half = int(df_strats.shape[0] / 2)  # not used in this cell
# NOTE(review): despite its name this holds 5% of the row count; not used in this cell.
quant_1pct = int(5 * df_strats.shape[0] / 100)

# Take an explicit copy: the strategy run below is expected to add its columns to
# this frame in place, and a slice of df_strats must not be mutated (it would also
# trigger pandas' SettingWithCopyWarning).
df_strats_copy = df_strats.iloc[quant_last_10_pct:][initial_columns].copy()
strategies = ta.Strategy("strats", [{"kind": strat} for strat in list_strats])
df_strats_copy.ta.strategy(strategies)
df_strats_copy = df_strats_copy[list_cols_predict]
base_line_df = df_strats.iloc[quant_last_10_pct:][list_cols_predict]

# Symmetric relative comparison. `<=` (not `<`) so that exactly equal values,
# including 0 vs 0 where the tolerance itself is 0, count as matching. Comparisons
# involving NaN evaluate to False and are therefore treated as mismatches.
abs_diff = (df_strats_copy - base_line_df).abs()
tolerance = 1e-2 * (df_strats_copy.abs() + base_line_df.abs())
aux_df = (abs_diff <= tolerance).reset_index(drop=True)

dict_col_inertia = {}
for col in aux_df.columns:
    mismatch = ~aux_df[col]
    # Position of the last mismatching row; 0 when the column matches everywhere
    dict_col_inertia[col] = mismatch[mismatch].index[-1] if mismatch.any() else 0

print(dict_col_inertia)

with open(f"dict_col_inertia_{agg_time}_min", "wb") as f:
    pickle.dump(dict_col_inertia, f)