In [6]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
from xgboost import plot_importance
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [7]:
df = pd.read_csv('../data/processed_data/btc.csv', index_col='date')
df.index = pd.to_datetime(df.index)

In [8]:
X = df.drop(columns=['log_returns'])
y = df['log_returns']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=1)

In [10]:
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
model.fit(X_train, y_train)

In [11]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [12]:
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f'Train: MAE: {mae_train}, RMSE: {rmse_train}, R2: {r2_train}')
print(f'Test: MAE: {mae_test}, RMSE: {rmse_test}, R2: {r2_test}')

Train: MAE: 0.00015647626145356226, RMSE: 0.0002346642041099819, R2: 0.9998837989470769
Test: MAE: 0.0003568394157973274, RMSE: 0.0005712748487439398, R2: 0.9993076514933065


In [13]:
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)
print(feature_importance)

returns                  0.771642
square_returns           0.189647
abs_returns              0.036486
vol_ratio30              0.000317
vol_sma14                0.000242
vol_sma7                 0.000234
rolling_volatility_14    0.000220
returns_3                0.000213
sma30                    0.000188
vol_ratio7               0.000170
vol_ratio14              0.000169
rolling_volatility_7     0.000164
returns_2                0.000161
returns_1                0.000147
vol_sma30                0.000000
cumulative_returns       0.000000
open                     0.000000
ema30                    0.000000
high                     0.000000
ema14                    0.000000
ema7                     0.000000
sma14                    0.000000
sma7                     0.000000
volatility               0.000000
volume                   0.000000
close                    0.000000
low                      0.000000
rolling_volatility_30    0.000000
dtype: float32


In [14]:
correlations = df.corr()['log_returns'].sort_values(ascending=False)
print(correlations)

log_returns              1.000000
returns                  0.999943
square_returns           0.152603
abs_returns              0.149540
vol_ratio30              0.083801
vol_ratio14              0.066551
rolling_volatility_7     0.038774
vol_ratio7               0.037608
returns_2                0.031292
close                    0.027316
cumulative_returns       0.027316
volume                   0.015529
high                     0.008238
low                      0.006698
rolling_volatility_14   -0.000587
vol_sma7                -0.003300
ema7                    -0.005455
vol_sma30               -0.008961
returns_3               -0.009450
sma7                    -0.010316
ema14                   -0.010583
sma14                   -0.012816
vol_sma14               -0.013504
ema30                   -0.013992
volatility              -0.014031
sma30                   -0.015310
open                    -0.017978
rolling_volatility_30   -0.018740
returns_1               -0.026981
Name: log_retu

In [15]:
columns_to_drop = correlations[(correlations < 0.02) | (correlations > 0.98)].index.tolist()
columns_to_drop.remove('log_returns')
columns_to_drop.remove('returns')

In [16]:
df = df.drop(columns=columns_to_drop)
print(f'Columns dropped: {columns_to_drop}.\n\nColumns left: {df.columns}')

Columns dropped: ['volume', 'high', 'low', 'rolling_volatility_14', 'vol_sma7', 'ema7', 'vol_sma30', 'returns_3', 'sma7', 'ema14', 'sma14', 'vol_sma14', 'ema30', 'volatility', 'sma30', 'open', 'rolling_volatility_30', 'returns_1'].

Columns left: Index(['close', 'returns', 'square_returns', 'abs_returns', 'returns_2',
       'vol_ratio7', 'vol_ratio14', 'vol_ratio30', 'cumulative_returns',
       'log_returns', 'rolling_volatility_7'],
      dtype='object')


In [17]:
X = df.drop(columns=['log_returns'])
y = df['log_returns']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=1)

In [19]:
param_grid = {
    'learning_rate': 0.03,
    'n_estimators': 100,
    'max_depth': 2,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'gamma': 0,
    'reg_lambda': 1,
    'reg_alpha': 0,
    'min_child_weight': 1
}

In [20]:
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **param_grid)
model.fit(X_train, y_train)

In [21]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [22]:
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f'Train: MAE: {mae_train}, RMSE: {rmse_train}, R2: {r2_train}')
print(f'Test: MAE: {mae_test}, RMSE: {rmse_test}, R2: {r2_test}')

Train: MAE: 0.003850587663810505, RMSE: 0.005434024347948789, R2: 0.9376897448864068
Test: MAE: 0.004079103535427307, RMSE: 0.005580242854676855, R2: 0.9339396680136166


In [23]:
param_grid = {
    'learning_rate': [0.01, 0.03],  
    'n_estimators': [100, 200],  
    'max_depth': [2, 3],  
    'subsample': [0.5, 0.7],  
    'colsample_bytree': [0.5, 0.7],  
    'gamma': [0, 0.1],  
    'reg_lambda': [0.1, 1],  
    'reg_alpha': [0.1, 1],  
    'min_child_weight': [1, 3]
}

In [24]:
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=1)

In [25]:
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1
)

In [26]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 512 candidates, totalling 1536 fits


KeyboardInterrupt: 

In [None]:
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best R2 in CV: {grid_search.best_score_}')

Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.03, 'max_depth': 2, 'min_child_weight': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 0.7}
Best R2 in CV: 0.9826739231039426


In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("Test R2:", r2_score(y_test, y_pred))

Test MAE: 0.0008665466781288853
Test RMSE: 0.001749950396277598
Test R2: 0.9935034014820587


In [None]:
y_pred_test = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f'Train: MAE: {mae_train}, RMSE: {rmse_train}, R2: {r2_train}')
print(f'Test: MAE: {mae_test}, RMSE: {rmse_test}, R2: {r2_test}')

Train: MAE: 0.0008708048067062333, RMSE: 0.0018419385855257295, R2: 0.9928407642986018
Test: MAE: 0.0008665466781288853, RMSE: 0.001749950396277598, R2: 0.9935034014820587


In [30]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

In [31]:
def objective_function(colsample_bytree, gamma, learning_rate, max_depth, 
                       min_child_weight, n_estimators, reg_alpha, reg_lambda, subsample):
    
    params = {
        'colsample_bytree': max(min(colsample_bytree, 1), 0),  
        'gamma': max(gamma, 0),  
        'learning_rate': max(learning_rate, 0.001),  
        'max_depth': int(max_depth),  
        'min_child_weight': max(min_child_weight, 1),  
        'n_estimators': int(n_estimators),  
        'reg_alpha': max(reg_alpha, 0),  
        'reg_lambda': max(reg_lambda, 0),  
        'subsample': max(min(subsample, 1), 0),  
        'objective': 'reg:squarederror'
    }

    model = xgb.XGBRegressor(**params)

    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='r2', n_jobs=-1)
    
    return np.mean(scores)

In [None]:
param_bounds = {
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'learning_rate': (0.01, 0.3),
    'max_depth': (2, 10),
    'min_child_weight': (1, 10),
    'n_estimators': (50, 500),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1),
    'subsample': (0.5, 1.0)
}

In [33]:
optimizer = BayesianOptimization(f=objective_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m-0.000332[39m | [39m0.6873   [39m | [39m4.754    [39m | [39m0.2223   [39m | [39m6.789    [39m | [39m2.404    [39m | [39m120.2    [39m | [39m0.05808  [39m | [39m0.8662   [39m | [39m0.8006   [39m |
| [35m2        [39m | [35m0.1954   [39m | [35m0.854    [39m | [35m0.1029   [39m | [35m0.2913   [39m | [35m8.66     [39m | [35m2.911    [39m | [35m131.8    [39m | [35m0.1834   [39m | [35m0.3042   [39m | [35m0.7624   [39m |
| [39m3        [39m | [39m-0.000470[39m | [39m0.716    [39m | [39m1.456    [39m | [39m0.1874   [39m | [39m3.116    [39m | [39m3.629    [39m | [39m214.9    [39m | [39m0.4561   [39m | [39m0.7852   [39m | [39m0.5998   [39m |


In [34]:
best_params = optimizer.max['params']
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])

In [35]:
print(f"Mejores parámetros: {best_params}")

Mejores parámetros: {'colsample_bytree': 0.8540362888980227, 'gamma': 0.10292247147901223, 'learning_rate': 0.29127385712697834, 'max_depth': 8, 'min_child_weight': 2.9110519961044856, 'n_estimators': 131, 'reg_alpha': 0.18340450985343382, 'reg_lambda': 0.3042422429595377, 'subsample': 0.762378215816119}


In [38]:
best_params_adjusted = {
    'colsample_bytree': 0.8540362888980227,
    'gamma': 0.10292247147901223,
    'learning_rate': 0.03,  # Ajuste manual
    'max_depth': 8,
    'min_child_weight': 2.9110519961044856,
    'n_estimators': 200,  # Ajuste manual
    'reg_alpha': 0.18340450985343382,
    'reg_lambda': 0.3042422429595377,
    'subsample': 0.762378215816119
}

In [39]:
model_adjusted = xgb.XGBRegressor(**best_params_adjusted, random_state=42)
model_adjusted.fit(X_train, y_train)


In [40]:
y_train_pred = model_adjusted.predict(X_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
r2_train = r2_score(y_train, y_train_pred)

In [41]:
y_test_pred = model_adjusted.predict(X_test)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
r2_test = r2_score(y_test, y_test_pred)

In [42]:
print(f"Train: MAE: {mae_train:.5f}, RMSE: {rmse_train:.5f}, R2: {r2_train:.5f}")
print(f"Test: MAE: {mae_test:.5f}, RMSE: {rmse_test:.5f}, R2: {r2_test:.5f}")

Train: MAE: 0.01293, RMSE: 0.01835, R2: 0.28968
Test: MAE: 0.01334, RMSE: 0.01814, R2: 0.30204


In [49]:
# Nuevos hiperparámetros ajustados
best_params_tweaked = {
    'colsample_bytree': 0.854,
    'gamma': 0.08,
    'learning_rate': 0.291,  # Volvemos al valor de BO
    'max_depth': 6,  # Reducimos para evitar sobreajuste
    'min_child_weight': 4,  # Aumentamos para mayor regularización
    'n_estimators': 155,  # Volvemos al valor de BO
    'reg_alpha': 0.183,
    'reg_lambda': 0.304,
    'subsample': 0.75
}

# Entrenar el modelo con los nuevos parámetros
model_tweaked = xgb.XGBRegressor(**best_params_tweaked, random_state=42)
model_tweaked.fit(X_train, y_train)

# Evaluación en train
y_train_pred = model_tweaked.predict(X_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
r2_train = r2_score(y_train, y_train_pred)

# Evaluación en test
y_test_pred = model_tweaked.predict(X_test)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
r2_test = r2_score(y_test, y_test_pred)

# Mostrar resultados
print(f"Train: MAE: {mae_train:.5f}, RMSE: {rmse_train:.5f}, R2: {r2_train:.5f}")
print(f"Test: MAE: {mae_test:.5f}, RMSE: {rmse_test:.5f}, R2: {r2_test:.5f}")


Train: MAE: 0.01061, RMSE: 0.01598, R2: 0.46127
Test: MAE: 0.01073, RMSE: 0.01561, R2: 0.48307
