In [1]:
from functions import get_DAX
from models.DAX import DAX_quantile_regression
from models.DAX import DAX_baseline
from functions.evaluation import evaluate_horizon
from tqdm import tqdm
from functions import evaluate_dax
from functions import reorder_quantiles

In [2]:
# Load data
daxdata = get_DAX.get()
daxdata

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,future_ret1,future_ret2,future_ret3,future_ret4,future_ret5,lag_ret1,lag_ret2,lag_ret3,lag_ret4,lag_ret5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1988-01-08 00:00:00+01:00,1026.689941,1026.689941,1026.689941,1026.689941,0,0.0,0.0,3.889849,3.953666,6.116953,5.221176,7.504712,1.197370,2.034796,3.024763,7.082493,2.116340
1988-01-11 00:00:00+01:00,987.520020,987.520020,987.520020,987.520020,0,0.0,0.0,0.063817,2.227104,1.331328,3.614864,-1.568363,-3.889849,-2.692479,-1.855053,-0.865086,3.192645
1988-01-12 00:00:00+01:00,986.890015,986.890015,986.890015,986.890015,0,0.0,0.0,2.163287,1.267511,3.551046,-1.632180,0.682238,-0.063817,-3.953666,-2.756296,-1.918870,-0.928903
1988-01-13 00:00:00+01:00,965.770020,965.770020,965.770020,965.770020,0,0.0,0.0,-0.895776,1.387760,-3.795466,-1.481049,0.553422,-2.163287,-2.227104,-6.116953,-4.919583,-4.082157
1988-01-14 00:00:00+01:00,974.460022,974.460022,974.460022,974.460022,0,0.0,0.0,2.283536,-2.899690,-0.585273,1.449198,2.622184,0.895776,-1.267511,-1.331328,-5.221176,-4.023806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-04 00:00:00+01:00,17743.439453,17756.390625,17684.730469,17716.169922,1220700,0.0,0.0,0.100352,-0.003054,-0.712507,-0.553550,-0.169755,-0.106628,0.214613,0.650952,0.905408,1.667338
2024-03-05 00:00:00+01:00,17682.779297,17746.570312,17643.109375,17698.400391,74178400,0.0,0.0,-0.103405,-0.812858,-0.653902,-0.270106,-1.495725,-0.100352,-0.206979,0.114262,0.550600,0.805057
2024-03-06 00:00:00+01:00,17685.919922,17745.160156,17682.509766,17716.710938,91487700,0.0,0.0,-0.709453,-0.550497,-0.166701,-1.392319,-1.371563,0.103405,0.003054,-0.103574,0.217667,0.654006
2024-03-07 00:00:00+01:00,17648.890625,17879.109375,17619.400391,17842.849609,86702800,0.0,0.0,0.158957,0.542752,-0.682866,-0.662110,-0.554366,0.709453,0.812858,0.712507,0.605879,0.927120


# Preprocess

In [3]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def preprocess_data_for_prediction(df):
    """
    Adjust lagged features based on DAX data structure and normalize.
    """
    df.index = pd.to_datetime(df.index)
    # Assuming 'Close' is the main feature for now; adjust as per your model's needs
    lag_features = ['Close']
    for feature in lag_features:
        for lag in range(1, 6):
            df[f'{feature}_lag{lag}'] = df[feature].shift(lag)
    df['Close_MA5'] = df['Close'].rolling(window=5).mean().shift(1)
    
    # Normalize features
    features = [f'{feat}_lag{j}' for feat in lag_features for j in range(1, 6)] + ['Close_MA5']
    scaler_X = StandardScaler()
    df[features] = scaler_X.fit_transform(df[features].fillna(0))  # Fill NaNs or handle them as per your requirement
    
    return df, scaler_X, features


### Custom loss function

In [5]:
def xgb_quantile_grad_hess(quantile, y_true, y_pred):
    error = y_true - y_pred
    grad = np.where(error > 0, -quantile, -(quantile - 1))
    hess = np.ones_like(y_pred)
    return grad, hess

# Train and predict

We now build 25 models, 5 for each quantile, and predict future returns for each day. We use the XGBRegressor model from the xgboost library, which allows us to define custom loss functions. We use the custom loss function defined earlier to train the models for different quantiles.

In [34]:
from xgboost import XGBRegressor

def train_and_predict(df, features, quantiles=[0.025, 0.25, 0.5, 0.75, 0.975]):
    """
    Train XGBRegressor models for different quantiles and predict future returns.
    """
    quantile_models = {}
    
    # Assuming future returns are your target variables
    for days in range(1, 6):
        X = df[features].iloc[:-days]  # Exclude the last 'days' rows for which we don't have future returns
        y = df[f'future_ret{days}'].shift(-days).dropna()  # Shift future returns to align with current features
        
        for quantile in quantiles:
            model = XGBRegressor(objective=lambda y_true, y_pred: xgb_quantile_grad_hess(quantile, y_true, y_pred),n_estimators=100, max_depth=3, learning_rate=0.1, verbosity=0)
            model.fit(X.iloc[:len(y)], y)  # Ensure X and y have the same length
            quantile_models[(days, quantile)] = model
            
    return quantile_models


In [65]:
def predict_for_date(df, forecast_date, features, scaler_X, quantile_models):
    """
    Prepare features for a specific forecast date and predict future returns using quantile models.
    """
    forecast_date = pd.to_datetime(forecast_date)
    # Convert to CET
    forecast_date = forecast_date.tz_localize('Europe/Berlin')
    if forecast_date in df.index:
        # Prepare and normalize features for the forecast date
        X_forecast_df = df.loc[[forecast_date], features]
        X_forecast_scaled = scaler_X.transform(X_forecast_df)

        forecast_predictions = {}
        for (days, quantile), model in quantile_models.items():
            # Ensure each day's dictionary is initialized
            forecast_predictions.setdefault(days, {})
            # Predict using the scaled features and store the result
            forecast_predictions[days][quantile] = model.predict(X_forecast_scaled)[0]
        
        return forecast_predictions
    else:
        raise ValueError(f"No data available for forecast_date: {forecast_date}")


### Usage

In [66]:
daxdata, scaler_X, features = preprocess_data_for_prediction(daxdata)

In [67]:
quantile_models = train_and_predict(daxdata, features)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 1, quantile: 0.025


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 1, quantile: 0.25


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 1, quantile: 0.5


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 1, quantile: 0.75


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 1, quantile: 0.975


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 2, quantile: 0.025


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 2, quantile: 0.25


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 2, quantile: 0.5


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 2, quantile: 0.75


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 2, quantile: 0.975


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 3, quantile: 0.025


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 3, quantile: 0.25


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 3, quantile: 0.5


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 3, quantile: 0.75


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 3, quantile: 0.975


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 4, quantile: 0.025


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 4, quantile: 0.25


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 4, quantile: 0.5


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 4, quantile: 0.75


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 4, quantile: 0.975


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 5, quantile: 0.025


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 5, quantile: 0.25


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 5, quantile: 0.5


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 5, quantile: 0.75


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Trained model for days: 5, quantile: 0.975


In [68]:
# Assuming the function predict_for_date is defined as shown in previous instructions
forecast_date = '2023-01-25'
forecast_predictions = predict_for_date(daxdata, forecast_date, features, scaler_X, quantile_models)

# Display forecast predictions
for day in forecast_predictions:
    print(f"Day {day} predictions:")
    for quantile, prediction in forecast_predictions[day].items():
        print(f"  Quantile {quantile}: {prediction}")


Day 1 predictions:
  Quantile 0.025: -1.8376399278640747
  Quantile 0.25: -0.8620426058769226
  Quantile 0.5: 0.05265770107507706
  Quantile 0.75: 0.5592954754829407
  Quantile 0.975: 1.4440668821334839
Day 2 predictions:
  Quantile 0.025: -1.8078489303588867
  Quantile 0.25: -0.6258193254470825
  Quantile 0.5: 0.6710105538368225
  Quantile 0.75: 0.835727334022522
  Quantile 0.975: 1.9566612243652344
Day 3 predictions:
  Quantile 0.025: -3.6241588592529297
  Quantile 0.25: -2.0009734630584717
  Quantile 0.5: -1.3308767080307007
  Quantile 0.75: 0.748283863067627
  Quantile 0.975: 1.9728672504425049
Day 4 predictions:
  Quantile 0.025: -3.598845958709717
  Quantile 0.25: -1.2413666248321533
  Quantile 0.5: 0.9619272351264954
  Quantile 0.75: 0.9181704521179199
  Quantile 0.975: 2.0661113262176514
Day 5 predictions:
  Quantile 0.025: -3.411705493927002
  Quantile 0.25: -2.094843864440918
  Quantile 0.5: -1.4798141717910767
  Quantile 0.75: 0.5925297141075134
  Quantile 0.975: 1.938577413

In [69]:
print(list(quantile_models.keys()))
print("Total models:", len(quantile_models))


[(1, 0.025), (1, 0.25), (1, 0.5), (1, 0.75), (1, 0.975), (2, 0.025), (2, 0.25), (2, 0.5), (2, 0.75), (2, 0.975), (3, 0.025), (3, 0.25), (3, 0.5), (3, 0.75), (3, 0.975), (4, 0.025), (4, 0.25), (4, 0.5), (4, 0.75), (4, 0.975), (5, 0.025), (5, 0.25), (5, 0.5), (5, 0.75), (5, 0.975)]
Total models: 25


In [71]:
forecast_predictions

{1: {0.025: -1.8376399,
  0.25: -0.8620426,
  0.5: 0.0526577,
  0.75: 0.5592955,
  0.975: 1.4440669},
 2: {0.025: -1.8078489,
  0.25: -0.6258193,
  0.5: 0.67101055,
  0.75: 0.83572733,
  0.975: 1.9566612},
 3: {0.025: -3.6241589,
  0.25: -2.0009735,
  0.5: -1.3308767,
  0.75: 0.74828386,
  0.975: 1.9728673},
 4: {0.025: -3.598846,
  0.25: -1.2413666,
  0.5: 0.96192724,
  0.75: 0.91817045,
  0.975: 2.0661113},
 5: {0.025: -3.4117055,
  0.25: -2.0948439,
  0.5: -1.4798142,
  0.75: 0.5925297,
  0.975: 1.9385774}}

In [73]:
#store forecast predictions in a dataframe
df_forecast = pd.DataFrame({
    "forecast_date": [forecast_date] * 5,
    "target": ["DAX"] * 5,
    "horizon": [f"{i} day" for i in range(1, 6)],
    "q0.025": [forecast_predictions[i][0.025] for i in range(1, 6)],
    "q0.25": [forecast_predictions[i][0.25] for i in range(1, 6)],
    "q0.5": [forecast_predictions[i][0.5] for i in range(1, 6)],
    "q0.75": [forecast_predictions[i][0.75] for i in range(1, 6)],
    "q0.975": [forecast_predictions[i][0.975] for i in range(1, 6)]
})

In [78]:
df_forecast= reorder_quantiles.reorder_quantiles(df_forecast)

In [79]:
df_forecast

Unnamed: 0,forecast_date,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2023-01-25,DAX,1 day,-1.83764,-0.862043,0.052658,0.559295,1.444067
1,2023-01-25,DAX,2 day,-1.807849,-0.625819,0.671011,0.835727,1.956661
2,2023-01-25,DAX,3 day,-3.624159,-2.000973,-1.330877,0.748284,1.972867
3,2023-01-25,DAX,4 day,-3.598846,-1.241367,0.91817,0.961927,2.066111
4,2023-01-25,DAX,5 day,-3.411705,-2.094844,-1.479814,0.59253,1.938577


Test outsourced functions 

In [85]:
from models.DAX import DAX_XGBoost

In [12]:
df_forecast = DAX_XGBoost.run_model(daxdata, '2024-03-04')

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_

In [13]:
df_forecast

Unnamed: 0,date_str,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2024-03-04,DAX,1 day,-1.83764,-0.862043,0.052658,0.559295,1.444067
1,2024-03-04,DAX,2 day,-1.807849,-0.625819,0.671011,0.835727,1.956661
2,2024-03-04,DAX,3 day,-3.624159,-2.000973,-1.330877,0.748284,1.972867
3,2024-03-04,DAX,4 day,-3.598846,-1.241367,0.91817,0.961927,2.066111
4,2024-03-04,DAX,5 day,-3.459329,-2.215512,-1.493221,0.59253,1.938577


# Evaluate

In [4]:
from models.DAX import DAX_XGBoost

In [5]:

baseline_model = {
    'name': 'Baseline Model',
    'function': DAX_baseline.DAX_baseline
}
quantile_model = {
    'name': 'XGBoost',
    'function': DAX_XGBoost.run_model
}


In [7]:
start_date = '2023-01-15'
end_date = '2023-01-31'

evaluation_baseline, evaluation_XGBoost= evaluate_dax.evaluate(baseline_model, quantile_model, daxdata, start_date, end_date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{feature}_lag{lag}'] = df[feature].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{feature}_lag{lag}'] = df[feature].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{feature}_lag{lag}'] = df[feature].shift(lag)
A value is trying to be set on a copy of a slice fro

ValueError: No data available for date_str: 2023-01-18 00:00:00+01:00