In [1]:
import os
import warnings

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

import models.categorical
import utils

warnings.filterwarnings("ignore")

In [2]:
# Future identifiers exposed by the project's utils module; each one names a
# tickerData/<name>.txt file that later cells read.
futuresList = utils.futuresList

# Random-forest hyperparameter search space, in the list-of-dicts form that
# ParameterGrid expands into individual parameter sets (3 * 3 * 3 * 2 = 54).
rfParams = [
    dict(
        n_estimators=[20, 60, 100],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 5],
        # max_features is currently not searched (candidates were ['sqrt', 'log2'])
    )
]

In [3]:
# Load and clean the price history of the first future only; this single
# instrument is used to try out the grid search before running every future.
future = futuresList[0]

raw = pd.read_csv(f"tickerData/{future}.txt", parse_dates=["DATE"])
# Replace the header names from the file with a fixed set of ten column names.
raw.columns = ['DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL', 'OI', 'P', 'R', 'RINFO']

df = (
    raw.set_index("DATE")
    # keep only rows where both volume and close are non-zero
    .loc[lambda frame: frame["VOL"].ne(0) & frame["CLOSE"].ne(0)]
    # then drop any row that still has a missing value
    .dropna()
)

# Features, categorical target and cost weights are all derived from the cleaned
# frame by project helpers in utils.
close, volume = df.CLOSE, df.VOL
X_df = utils.generate_X_df([close, volume], ["perc", "perc"])
y_df = utils.generate_y_cat(close)
cost_df = utils.perc_change(close, shift=0)

In [4]:
# Hyperparameter search for the single future loaded in the previous cell:
# every combination in rfParams is scored with walk-forward validation and the
# aggregate metrics are stored next to the parameters that produced them.
SEED = 42          # fixed random_state so repeated runs build the same forests
MAX_WINDOWS = 114  # value passed to walk_forward as max_windows
METRIC_COLUMNS = ["accuracy_SMA", "opp_cost_SMA", "accuracy_EMA", "opp_cost_EMA"]

parameter_grid = list(ParameterGrid(rfParams))

# prepare collated results
win_results_collated = []  # per-window results, one entry per parameter set
agg_records = []           # aggregate metrics, one dict per parameter set

# run walk forward validation
for param_set in parameter_grid:
    # random_state is listed first so a grid that sets it explicitly still wins.
    model = RandomForestClassifier(**{"random_state": SEED, **param_set})
    win_results, agg_results = models.categorical.walk_forward(
        model=model, X=X_df, y=y_df, cost_weight=cost_df, rolling=True, max_windows=MAX_WINDOWS
    )
    win_results_collated.append(win_results)
    agg_records.append({
        "accuracy_SMA": agg_results.loc["SMA", "accuracy"],
        "opp_cost_SMA": agg_results.loc["SMA", "opp_cost"],
        "accuracy_EMA": agg_results.loc["EMA", "accuracy"],
        "opp_cost_EMA": agg_results.loc["EMA", "opp_cost"],
    })

# Build the metrics frame once from the records instead of filling a
# pre-allocated (object-dtype) frame one cell at a time; row i still
# corresponds to parameter_grid[i].
agg_results_collated = pd.DataFrame(agg_records, columns=METRIC_COLUMNS)

# save parameters
parameter_df = pd.DataFrame.from_records(parameter_grid)
combined_df = pd.concat([parameter_df, agg_results_collated], axis=1)

In [5]:
# win_results is reassigned on every iteration of the grid-search loop, so what
# is displayed here is the per-window result of the last parameter set only.
win_results

Unnamed: 0,train_start,train_end,validation_start,validation_end,accuracy,opp_cost
0,1990-04-01,1992-04-01,1992-04-01,1992-07-01,0.56338,0.92759
1,1990-07-01,1992-07-01,1992-07-01,1992-10-01,0.619651,0.831496
2,1990-10-01,1992-10-01,1992-10-01,1993-01-01,0.638849,0.82623
3,1991-01-01,1993-01-01,1993-01-01,1993-04-01,0.672823,0.818174
4,1991-04-01,1993-04-01,1993-04-01,1993-07-01,0.719854,0.834702
...,...,...,...,...,...,...
110,2017-10-01,2019-10-01,2019-10-01,2020-01-01,0.967393,0.961166
111,2018-01-01,2020-01-01,2020-01-01,2020-04-01,0.969123,0.89512
112,2018-04-01,2020-04-01,2020-04-01,2020-07-01,0.967407,1.043213
113,2018-07-01,2020-07-01,2020-07-01,2020-10-01,0.968587,1.136023


In [6]:
# Parameters and aggregate walk-forward metrics side by side, one row per
# parameter set, for the first future in futuresList.
# NOTE(review): the rfParams grid defined in this notebook expands to
# 3 * 3 * 3 * 2 = 54 combinations over four parameters; a saved output with
# fewer rows/columns presumably predates the current grid — re-run to confirm.
combined_df

Unnamed: 0,max_depth,accuracy_SMA,opp_cost_SMA,accuracy_EMA,opp_cost_EMA
0,,0.907555,1.345153,0.967377,1.163621


In [7]:
# retrieve parameter grid
parameter_grid = list(ParameterGrid(rfParams))

for future in futuresList:
    # load data
    df = pd.read_csv(f"tickerData/{future}.txt", parse_dates = ["DATE"])
    df.columns = ['DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL', 'OI', 'P', 'R', 'RINFO']
    df = df.set_index("DATE")
    df = df[(df.VOL != 0) & (df.CLOSE != 0)]
    df = df.dropna(axis=0)
    
    # load X and y
    X_df = utils.generate_X_df([df.CLOSE, df.VOL], ["perc", "perc"])
    y_df = utils.generate_y_cat(df.CLOSE)
    cost_df = utils.perc_change(df.CLOSE, shift=0)
    
    # prepare collated results
    agg_results_collated = pd.DataFrame(index=list(range(len(parameter_grid))), 
                                        columns=["accuracy_SMA", "opp_cost_SMA", "accuracy_EMA", "opp_cost_EMA"])
    win_results_collated = []

    # run walk forward validation 
    for i in range(len(parameter_grid)):
        param_set = parameter_grid[i]
        model = RandomForestClassifier(**param_set)
        win_results, agg_results = models.categorical.walk_forward(
            model = model, X = X_df, y = y_df, cost_weight = cost_df, rolling = True, max_windows = 114
        )
        win_results_collated.append(win_results)
        agg_results_collated.loc[i, "accuracy_SMA"] = agg_results.loc["SMA", "accuracy"]
        agg_results_collated.loc[i, "accuracy_EMA"] = agg_results.loc["EMA", "accuracy"]
        agg_results_collated.loc[i, "opp_cost_SMA"] = agg_results.loc["SMA", "opp_cost"]
        agg_results_collated.loc[i, "opp_cost_EMA"] = agg_results.loc["EMA", "opp_cost"]

    # save parameters
    parameter_df = pd.DataFrame.from_records(parameter_grid)
    combined_df = pd.concat([parameter_df, agg_results_collated], axis=1)
    combined_df.to_csv(f"model_metrics/categorical/rf/perc/{future}.txt", index=False)
    
    print(f'{future} done')

F_AD done
F_BO done
F_BP done
F_C done
F_CC done
F_CD done
F_CL done
F_CT done
F_DX done
F_EC done
F_ED done
F_ES done
F_FC done
F_FV done
F_GC done
F_HG done
F_HO done
F_JY done
F_KC done
F_LB done
F_LC done
F_LN done
F_MD done
F_MP done
F_NG done
F_NQ done
F_NR done
F_O done
F_OJ done
F_PA done
F_PL done
F_RB done
F_RU done
F_S done
F_SB done
F_SF done
F_SI done
F_SM done
F_TU done
F_TY done
F_US done
F_W done
F_XX done
F_YM done
F_AX done
F_CA done
F_DT done
F_UB done
F_UZ done
F_GS done
F_LX done
F_SS done
F_DL done
F_ZQ done
F_VX done
F_AE done
F_BG done
F_BC done
F_LU done
F_DM done
F_AH done
F_CF done
F_DZ done
F_FB done
F_FL done
F_FM done
F_FP done
F_FY done
F_GX done
F_HP done
F_LR done
F_LQ done
F_ND done
F_NY done
F_PQ done
F_RR done
F_RF done
F_RP done
F_RY done
F_SH done
F_SX done
F_TR done
F_EB done
F_VF done
F_VT done
F_VW done
F_GD done
F_F done


In [8]:
# combined_df is reassigned for every future in the loop above, so this shows
# the metrics of the last future processed only; the results for each future
# are in the files written under model_metrics/categorical/rf/perc/.
combined_df

Unnamed: 0,max_depth,accuracy_SMA,opp_cost_SMA,accuracy_EMA,opp_cost_EMA
0,,0.878002,0.046812,0.945818,0.036693


In [14]:
# win_results_collated is re-initialised for each future in the per-future
# loop, so this prints only the per-window frames of the last future processed
# (one frame per parameter set), as a plain-text list.
print(win_results_collated)

[   train_start  train_end validation_start validation_end  accuracy  opp_cost
0   2001-04-01 2003-04-01       2003-04-01     2003-07-01  0.576241  0.093312
1   2001-07-01 2003-07-01       2003-07-01     2003-10-01   0.60925  0.083923
2   2001-10-01 2003-10-01       2003-10-01     2004-01-01   0.62482  0.071985
3   2002-01-01 2004-01-01       2004-01-01     2004-04-01  0.685185  0.060949
4   2002-04-01 2004-04-01       2004-04-01     2004-07-01  0.690621  0.065377
..         ...        ...              ...            ...       ...       ...
66  2017-10-01 2019-10-01       2019-10-01     2020-01-01  0.945105  0.027386
67  2018-01-01 2020-01-01       2020-01-01     2020-04-01  0.948702  0.027375
68  2018-04-01 2020-04-01       2020-04-01     2020-07-01  0.948567  0.030045
69  2018-07-01 2020-07-01       2020-07-01     2020-10-01  0.947802  0.033923
70  2018-10-01 2020-10-01       2020-10-01     2021-01-01   0.94472  0.038914

[71 rows x 6 columns]]
