In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import talib
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier

import plotly.graph_objects as go
import plotly.express as px

from joblib import Parallel, delayed

import itertools

<h1>Read the data into a pandas dataframe</h1>
<h2>Convert the Date column to a datetime format</h2>
<h2>Set the Date as the index and convert entire dataframe to a datatype of float</h2>

In [3]:
# Load the prepared dataset, convert 'Date' to datetime64[ns], index the frame
# by it, and cast every remaining column to float32.
df = (
    pd.read_csv('ReadyToTrainData.csv')
    .assign(Date=lambda frame: frame['Date'].astype('datetime64[ns]'))
    .set_index('Date')
    .astype('float32')
)
# Downstream cells work on an independent copy so `df` stays as loaded.
data = df.copy()


<h2> Split the data into X and y </h2>

In [4]:
# Features are every column except 'target'; the label is kept as a
# single-column DataFrame (double brackets), not a Series.
X = data.drop(columns=['target'])
y = data[['target']]

<h2> Define our K-Fold splitter. In this instance we will use a Time Series Walk Forward methodology and start prior to the 2008 Recession </h2>

In [5]:
# Walk-forward cross-validation: 16 folds, each evaluated on a 252-row test
# window (presumably one trading year of daily rows -- confirm against the data).
splitter = TimeSeriesSplit(
    test_size=252,
    n_splits=16,
)

<h2> Start with a general XGBoost Regression model </h2>

In [7]:
# Baseline XGBoost regressor; keyword arguments are grouped by purpose.
depth = 7
model = XGBRegressor(
    # learning task and tree construction
    objective='reg:squarederror',
    booster='gbtree',
    tree_method='hist',
    # ensemble size and step size
    n_estimators=100,
    learning_rate=0.01,
    # tree complexity
    max_depth=depth,
    min_child_weight=2,
    # row / column sampling
    subsample=0.98,
    colsample_bytree=0.5,
    # reproducibility and feature-importance reporting
    random_state=5,
    importance_type='gain',
)

<h2> Here I create custom folds rather than use a Grid Search. <br>This method gives me the ability to analyze <br> each parameter grid individually </h2>

In [6]:
# Materialise the walk-forward folds once so that every candidate model is
# trained and scored on exactly the same splits.
#   train_tuples  -> [(X_train, y_train), ...]
#   test_tuples   -> [(X_test, y_test), ...]
#   splits_tuples -> [(X_train, y_train, X_test, y_test), ...]
train_tuples = []
test_tuples = []
splits_tuples = []
for train_index, test_index in splitter.split(X):
    # Slice each fold a single time and reuse the result. The three lists
    # therefore reference the same frames, so treat them as read-only.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    train_tuples.append((X_train, y_train))
    test_tuples.append((X_test, y_test))
    splits_tuples.append((X_train, y_train, X_test, y_test))

<h2> Here I iteratively create a list of models which will be passed into a function for training </h2>

In [16]:


# Search space: each key is an XGBRegressor keyword argument and each value
# lists the candidate settings to try for it.
defaults_new = {
    'n_estimators': [100, 150],
    'max_depth': [3, 4, 5, 6, 7],
    'num_parallel_tree': [1, 3, 5, 7],
    'subsample': [0.98],
    'colsample_bytree': [0.1, 0.3],
    'colsample_bynode': [0.1, 0.3],
    'learning_rate': [0.1, 0.3, 0.01],
    'random_state': [5],
    'n_jobs': [2],
}

# Cartesian product of the candidate lists. This is a one-shot iterator and is
# exhausted once `sub_dicts` has been built.
combinations = itertools.product(*defaults_new.values())

# Re-attach the parameter names to every combination of values.
sub_dicts = [dict(zip(defaults_new, values)) for values in combinations]

# One unfitted model per parameter combination, ready for the joblib workers.
hypersearch_list = [XGBRegressor(**grid_point) for grid_point in sub_dicts]

In [17]:
# Number of candidate models generated from the parameter grid.
len(hypersearch_list)

480

In [18]:
# Display the first candidate model built from the grid.
hypersearch_list[0]

<h2> Create a training function that will run the walk-forward training. <br> 
The function then scores the predictions collected from every fold and returns a tuple of these values </h2>

In [10]:
def training(input_model):
    """Walk-forward train and score one model over the prepared folds.

    For every ``(X_train, y_train, X_test, y_test)`` entry in the module-level
    ``splits_tuples`` the model is fit on the training window and used to
    predict the test window. The per-fold predictions are concatenated and
    scored with mean squared error.

    Parameters
    ----------
    input_model : estimator
        Object exposing ``fit(X, y)``, ``predict(X)`` and ``get_params()``
        (the notebook passes ``XGBRegressor`` instances). It is refit in
        place on every fold.

    Returns
    -------
    tuple
        ``(score_08, total_score, params)``: the MSE over the test rows
        selected by ``.loc['2008']``, the MSE over all test rows, and
        ``input_model.get_params()``.

    Notes
    -----
    The test folds must be indexed so that ``.loc['2008']`` selects rows,
    i.e. a DatetimeIndex that includes dates in 2008.
    """
    fold_predictions = []
    for X_train, y_train, X_test, y_test in splits_tuples:
        input_model.fit(X_train, y_train)
        # assign() returns a new frame, so the shared fold data held in
        # splits_tuples is never modified (the previous in-place
        # ``y_test['pred'] = ...`` wrote into those shared frames).
        fold_predictions.append(y_test.assign(pred=input_model.predict(X_test)))
    preds_df = pd.concat(fold_predictions)

    rows_2008 = preds_df.loc['2008']
    score_08 = mean_squared_error(rows_2008.target, rows_2008.pred)
    total_score = mean_squared_error(preds_df.target, preds_df.pred)

    return (score_08, total_score, input_model.get_params())

<h2> This function uses joblib's Parallel to train multiple models on multiple cores </h2>

In [20]:
def run_multi_core(models=None, n_jobs=24):
    """Train and score candidate models in parallel with joblib.

    Parameters
    ----------
    models : iterable of estimators, optional
        Models to pass to ``training``. Defaults to the module-level
        ``hypersearch_list``.
    n_jobs : int, default 24
        Number of joblib workers. Models that are themselves multi-threaded
        multiply the total thread count, so lower this on smaller machines.

    Returns
    -------
    list
        One ``training`` result tuple per model, in the order the models
        were supplied.
    """
    if models is None:
        models = hypersearch_list
    return Parallel(n_jobs=n_jobs)(
        delayed(training)(candidate) for candidate in models
    )

<h2> Creates a final dataframe based on the results obtained from the run_multi_core function </h2>

In [21]:
def create_df():
    """Run the parallel search and return its results as a sorted table.

    Returns
    -------
    pandas.DataFrame
        One row per result returned by ``run_multi_core`` with columns
        '08_score', 'total_score' and 'params', ordered by ascending
        '08_score'.
    """
    search_results = run_multi_core()
    column_names = ['08_score', 'total_score', 'params']
    return pd.DataFrame(search_results, columns=column_names).sort_values(
        by='08_score', ascending=True
    )

In [22]:
# Run the full search: create_df() calls run_multi_core(), which trains and
# scores the candidate models in parallel. This is the expensive cell.
results = create_df()

In [23]:
# Show the ten rows with the lowest '08_score' (create_df sorts ascending by it).
results.head(10)

Unnamed: 0,08_score,total_score,params
154,0.000664,0.000267,"{'objective': 'reg:squarederror', 'base_score'..."
151,0.000664,0.000267,"{'objective': 'reg:squarederror', 'base_score'..."
391,0.000668,0.000273,"{'objective': 'reg:squarederror', 'base_score'..."
394,0.000668,0.000273,"{'objective': 'reg:squarederror', 'base_score'..."
0,0.00067,0.00018,"{'objective': 'reg:squarederror', 'base_score'..."
3,0.00067,0.00018,"{'objective': 'reg:squarederror', 'base_score'..."
39,0.000675,0.000178,"{'objective': 'reg:squarederror', 'base_score'..."
36,0.000675,0.000178,"{'objective': 'reg:squarederror', 'base_score'..."
15,0.000678,0.000176,"{'objective': 'reg:squarederror', 'base_score'..."
12,0.000678,0.000176,"{'objective': 'reg:squarederror', 'base_score'..."
