In [4]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV

In [5]:
def evaluate_model(y_pred_train, y_pred_test, y_train, y_test):
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    print("MAE train: ", mae_train)
    print("MAE test: ", mae_test)
    print("R2 train: ", r2_train)
    print("R2 test: ", r2_test)
    print("MSE train: ", mse_train)
    print("MSE test: ", mse_test)

In [8]:
df = pd.read_csv('../../data/pre_train/gspc.csv', index_col='date')
df.index = pd.to_datetime(df.index)
df = df.interpolate(method='linear').fillna(method='ffill').fillna(method='bfill')

  df = df.interpolate(method='linear').fillna(method='ffill').fillna(method='bfill')


In [9]:
df.isna().sum()

close          0
low            0
open           0
sma_50         0
sma_200        0
sma_10         0
ema_10         0
upper_bb       0
middle_bb      0
obv            0
log_close      0
low_atr_pct    0
dtype: int64

In [10]:
X= df.drop(['close', 'log_close'], axis=1)
y = df['log_close']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=False)

In [12]:
xgb_model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

In [13]:
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

In [14]:
evaluate_model(y_pred_train, y_pred_test, y_train, y_test)

MAE train:  0.0037497550415127438
MAE test:  0.09127640621952646
R2 train:  0.9988813645178315
R2 test:  -0.22240998787311805
MSE train:  2.2777520305138596e-05
MSE test:  0.014443323376787963


In [15]:
feature_importances = xgb_model.feature_importances_
features = X.columns
importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)

print(importances_df)

       Feature  Importance
0          low    0.356837
2       sma_50    0.247563
5       ema_10    0.223288
1         open    0.082060
6     upper_bb    0.032470
3      sma_200    0.027866
4       sma_10    0.022644
8          obv    0.005255
9  low_atr_pct    0.001286
7    middle_bb    0.000729


In [420]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_lambda': [0.1, 1],
    'reg_alpha': [0.1, 1],
}


In [421]:
xgb_model = XGBRegressor(random_state=1)
tcsv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tcsv, n_jobs=-1, verbose=2)

In [422]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [423]:
feature_importances = grid_search.best_estimator_.feature_importances_
features = X.columns
importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

       Feature  Importance
4       sma_10    0.431617
0          low    0.240876
1         open    0.225869
5       ema_10    0.063141
6     upper_bb    0.027169
2       sma_50    0.006368
8          obv    0.002425
9  low_atr_pct    0.002157
3      sma_200    0.000350
7    middle_bb    0.000028


In [424]:
selected_features = importances_df[importances_df['Importance'] > 0.005]['Feature'].tolist()
X = X[selected_features]
X['log_close'] = df['log_close']
X['returns'] = df['log_close'].diff()
X = X.dropna()
y = X['log_close']
X = X.drop(['sma_10', 'log_close'], axis=1)

In [425]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=False)

In [426]:
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tcsv, n_jobs=-1, verbose=2)

In [429]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [430]:
feature_importances = grid_search.best_estimator_.feature_importances_
features = X.columns
importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

    Feature  Importance
0       low    0.373934
1      open    0.331347
2    ema_10    0.271099
3  upper_bb    0.022456
5   returns    0.000887
4    sma_50    0.000277


In [431]:
best_params = grid_search.best_params_
print(best_params)

{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}


In [432]:
y_pred_train = grid_search.best_estimator_.predict(X_train)
y_pred_test = grid_search.best_estimator_.predict(X_test)
evaluate_model(y_pred_train, y_pred_test, y_train, y_test)

MAE train:  0.0030098481380252083
MAE test:  0.09961134762671436
R2 train:  0.9989510091967483
R2 test:  -0.47555739797955243
MSE train:  2.0098726654796657e-05
MSE test:  0.01718128194736091


In [438]:
param_grid = {
    'n_estimators': [1000, 1200, 1500],
    'max_depth': [2],
    'learning_rate': [0.1, 0.15],
    'subsample': [0.8],
    'colsample_bytree': [0.9, 1.0, 1.1],
    'reg_lambda': [2, 3, 4],
    'reg_alpha': [0.03, 0.05],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tcsv, n_jobs=-1, verbose=2)

In [439]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\santi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\santi\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\santi\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1170, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\santi\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           

In [440]:
best_params = grid_search.best_params_
print(best_params)

{'colsample_bytree': 0.9, 'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 1500, 'reg_alpha': 0.03, 'reg_lambda': 4, 'subsample': 0.8}


In [441]:
feature_importances = grid_search.best_estimator_.feature_importances_
features = X.columns
importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

    Feature  Importance
0       low    0.639322
2    ema_10    0.238999
1      open    0.118142
4    sma_50    0.001370
5   returns    0.001318
3  upper_bb    0.000849


In [442]:
y_pred_train = grid_search.best_estimator_.predict(X_train)
y_pred_test = grid_search.best_estimator_.predict(X_test)
evaluate_model(y_pred_train, y_pred_test, y_train, y_test)

MAE train:  0.002515001224158433
MAE test:  0.08994210618080885
R2 train:  0.9994410536901946
R2 test:  -0.25186606675304857
MSE train:  1.070944479271099e-05
MSE test:  0.0145766365189651


In [476]:
param_grid = {
    'n_estimators': [200000, 500000, 1000000],
    'max_depth': [2],
    'learning_rate': [0.2],
    'subsample': [0.8],
    'colsample_bytree': [0.9],
    'reg_lambda': [4],
    'reg_alpha': [0.03],
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tcsv, n_jobs=-1, verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [None]:
best_params = grid_search.best_params_
print(best_params)

{'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 200000, 'reg_alpha': 0.03, 'reg_lambda': 4, 'subsample': 0.8}


In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
features = X.columns
importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

    Feature  Importance
0       low    0.750050
3  upper_bb    0.163618
1      open    0.044482
2    ema_10    0.039721
5   returns    0.001253
4    sma_50    0.000877


In [None]:
y_pred_train = grid_search.best_estimator_.predict(X_train)
y_pred_test = grid_search.best_estimator_.predict(X_test)
evaluate_model(y_pred_train, y_pred_test, y_train, y_test)

MAE train:  0.0012690195233938538
MAE test:  0.09000521429847219
R2 train:  0.9998632810178674
R2 test:  -0.25305318548354916
MSE train:  2.619543891030934e-06
MSE test:  0.014590459242257093
