In [1]:
# Generic imports for most ML tasks
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import tree
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# Render floats in DataFrame/Series output with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: echo the value of every top-level expression
# in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# NOTE(review): display and HTML are not referenced in the cells visible in this notebook
from IPython.display import display, HTML

# Read, pre-process and visualize data

In [2]:
# Location of the multi-source index history CSV. The default is the original
# author's checkout; set the XGB_HYBRID_DATA environment variable to run the
# notebook against a copy of the file stored anywhere else.
DATA_PATH = os.environ.get(
    "XGB_HYBRID_DATA",
    "/home/manu/git/XGBoostHybrid/MultipleSources-2019-2025.csv",
)

# Load the raw frame and preview the first rows
data_set = pd.read_csv(DATA_PATH)
data_set.head()

Unnamed: 0,Date,SP500,Futures,Nikkei,FTSE,DAX
0,2025-04-02,5670.97,5512.0,35725.87,8608.48,22390.84
1,2025-04-01,5633.07,5674.5,35624.48,8634.8,22539.98
2,2025-03-31,5611.85,5653.25,35617.56,8582.81,22163.49
3,2025-03-28,5580.94,5623.0,37120.33,8658.85,22461.52
4,2025-03-27,5693.31,5739.25,37799.97,8666.12,22678.74


In [3]:
# Number of rows in the raw frame
len(data_set)
# Missing values per column
data_set.isna().sum()
# Column dtypes as parsed by read_csv
data_set.dtypes

1509

Date        0
SP500       0
Futures     0
Nikkei     98
FTSE       30
DAX        24
dtype: int64

Date        object
SP500      float64
Futures    float64
Nikkei     float64
FTSE       float64
DAX        float64
dtype: object

In [4]:
data_set.columns

# Convert 'Date' to integer Unix timestamps (whole seconds since 1970-01-01).
# The dtype guard makes this cell safe to re-run: once the column already holds
# integers, sending it through pd.to_datetime again would interpret those
# seconds as nanoseconds and collapse every date to the start of 1970.
if not pd.api.types.is_numeric_dtype(data_set['Date']):
    # Step 1: Ensure the 'Date' column is in datetime format
    parsed_dates = pd.to_datetime(data_set['Date'])

    # Step 2: Convert datetime to integer timestamp (in seconds).
    # Floor-dividing the offset from the epoch by a one-second Timedelta gives
    # the same answer whatever unit the datetime column is stored in; casting
    # to int64 and dividing by 10**9 is only correct for nanosecond storage.
    data_set['Date'] = (parsed_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

data_set.tail()

Index(['Date', 'SP500', 'Futures', 'Nikkei', 'FTSE', 'DAX'], dtype='object')

Unnamed: 0,Date,SP500,Futures,Nikkei,FTSE,DAX
1504,1554768000,2878.2,2882.5,21802.59,7425.57,11850.57
1505,1554681600,2895.77,2898.25,21761.65,7451.89,11963.4
1506,1554422400,2892.74,2896.0,21807.5,7446.87,12009.75
1507,1554336000,2879.39,2882.75,21724.95,7401.94,11988.01
1508,1554249600,2873.4,2879.75,21713.21,7418.28,11954.4


In [5]:
# Every column except the SP500 target is used as a feature
features = data_set.drop(columns=['SP500'])
target = data_set['SP500']

# Hold out 20% of the rows for testing (seeded for repeatability).
# NOTE(review): no shuffle=False is passed, so rows from the whole date range
# end up mixed across train and test even though the frame is date-ordered —
# confirm a random rather than chronological hold-out is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=50
)

# Show all four pieces of the split
X_train
X_test
y_train
y_test

Unnamed: 0,Date,Futures,Nikkei,FTSE,DAX
232,1714348800,5147.00,,8147.03,18118.32
1314,1578528000,3276.00,23739.87,7598.12,13495.06
741,1650326400,4459.25,26985.09,7601.28,14153.46
1231,1588896000,2928.50,20179.09,,10904.48
610,1666656000,3870.25,27250.28,7013.48,13052.96
...,...,...,...,...,...
70,1734480000,5872.25,39081.71,8199.11,20242.57
132,1726790400,5699.99,37723.91,8229.99,18720.01
1313,1578614400,3264.75,23850.57,7587.85,13483.31
109,1729641600,5837.75,38104.86,8258.64,19377.62


Unnamed: 0,Date,Futures,Nikkei,FTSE,DAX
288,1707264000,5015.25,36119.92,7628.75,16921.96
965,1622160000,4202.50,29149.41,7022.61,15519.98
1090,1606435200,3636.50,26644.71,6367.58,13335.68
191,1719446400,5546.00,39341.54,8179.68,18210.55
453,1686528000,4342.75,32434.00,7570.69,16097.87
...,...,...,...,...,...
539,1675728000,4175.75,27685.47,7864.71,15320.88
1235,1588550400,2825.25,,5753.78,10466.80
497,1681084800,4136.25,27633.66,,
296,1706227200,4916.25,35751.07,7635.09,16961.39


232    5,116.17
1314   3,274.70
741    4,462.21
1231   2,929.80
610    3,859.11
         ...   
70     5,872.16
132    5,702.55
1313   3,265.35
109    5,797.42
1504   2,878.20
Name: SP500, Length: 1207, dtype: float64

288    4,995.06
965    4,204.11
1090   3,638.35
191    5,482.87
453    4,338.93
         ...   
539    4,164.00
1235   2,842.74
497    4,109.11
296    4,890.97
1038   3,934.83
Name: SP500, Length: 302, dtype: float64

# Decision Tree


In [6]:
# Single regression tree with a fixed seed; fit() hands back the fitted
# estimator, so construction and training are chained
clf = DecisionTreeRegressor(random_state=50).fit(X_train, y_train)

In [7]:
# Feature names the tree was fitted on, followed by the fitted tree's
# importance score for each feature (read the two outputs position by position)
X_train.columns
clf.feature_importances_

Index(['Date', 'Futures', 'Nikkei', 'FTSE', 'DAX'], dtype='object')

array([7.60832339e-05, 9.99658237e-01, 1.81359688e-04, 6.22487369e-05,
       2.20711814e-05])

In [8]:
# Line up the decision tree's held-out predictions with the true SP500 values
test_output = pd.DataFrame(
    {'pred_spx': clf.predict(X_test)}, index=X_test.index
).merge(y_test, left_index=True, right_index=True)
test_output.head()

# Per-row absolute error, computed once and reused for both summaries below
abs_error = (test_output['pred_spx'] - test_output['SP500']).abs()
mean_absolute_error = abs_error.mean()
print('Mean absolute error is ', mean_absolute_error, sep='\n')

# MAE expressed as a fraction of the mean SP500 level in the test rows
mean_absolute_error / test_output['SP500'].mean()

Unnamed: 0,pred_spx,SP500
288,5000.62,4995.06
965,4195.99,4204.11
1090,3635.41,3638.35
191,5475.09,5482.87
453,4327.78,4338.93


Mean absolute error is 
9.033476821192066


np.float64(0.002149933263340384)

## Bagging Regressor ## 

In [9]:
# Bagging ensemble of 200 estimators (no base estimator is passed, so the
# library default is used), each drawing 1000 training samples; seeded
regr = BaggingRegressor(
    random_state=50, n_estimators=200, max_samples=1000
).fit(X_train, y_train)

In [10]:
# Line up the bagging ensemble's held-out predictions with the true SP500 values
test_output = pd.DataFrame(
    {'pred_spx': regr.predict(X_test)}, index=X_test.index
).merge(y_test, left_index=True, right_index=True)
test_output.head()

# Per-row absolute error, computed once and reused for both summaries below
abs_error = (test_output['pred_spx'] - test_output['SP500']).abs()
mean_absolute_error = abs_error.mean()
print('Mean absolute error is ', mean_absolute_error, sep='\n')

# MAE expressed as a fraction of the mean SP500 level in the test rows
mean_absolute_error / test_output['SP500'].mean()

Unnamed: 0,pred_spx,SP500
288,4981.59,4995.06
965,4198.88,4204.11
1090,3634.11,3638.35
191,5485.06,5482.87
453,4336.93,4338.93


Mean absolute error is 
8.152711092715036


np.float64(0.0019403143564295105)

## Random Forest Regressor 

In [11]:
# Random forest with leaves of at least 3 samples and "sqrt" feature
# sub-sampling at each split; seeded for repeatability
rf = RandomForestRegressor(
    random_state=50, min_samples_leaf=3, max_features="sqrt"
).fit(X_train, y_train)


In [12]:
# Feature names the forest was fitted on, followed by the fitted forest's
# importance score for each feature (read the two outputs position by position)
X_train.columns
rf.feature_importances_

Index(['Date', 'Futures', 'Nikkei', 'FTSE', 'DAX'], dtype='object')

array([0.29595133, 0.4259416 , 0.11283519, 0.01997037, 0.14530151])

In [13]:
# Line up the random forest's held-out predictions with the true SP500 values
test_output = pd.DataFrame(
    {'pred_spx': rf.predict(X_test)}, index=X_test.index
).merge(y_test, left_index=True, right_index=True)
test_output.head()

# Per-row absolute error, computed once and reused for both summaries below
abs_error = (test_output['pred_spx'] - test_output['SP500']).abs()
mean_absolute_error = abs_error.mean()
print('Mean absolute error is ', mean_absolute_error, sep='\n')

# MAE expressed as a fraction of the mean SP500 level in the test rows
mean_absolute_error / test_output['SP500'].mean()

Unnamed: 0,pred_spx,SP500
288,4965.6,4995.06
965,4202.48,4204.11
1090,3625.72,3638.35
191,5474.76,5482.87
453,4343.72,4338.93


Mean absolute error is 
14.605712320374524


np.float64(0.0034761042037200806)

## Gradient Boosting Regressor

In [14]:
# Histogram-based gradient boosting: depth-4 trees, leaves of at least 2 samples, seeded.
# Alternative kept for reference:
#   GradientBoostingRegressor(random_state=50, min_samples_leaf = 2, max_depth = 4)
gb = HistGradientBoostingRegressor(
    random_state=50, min_samples_leaf=2, max_depth=4
).fit(X_train, y_train)


In [16]:
# Feature names the gradient-boosting model was fitted on
X_train.columns
# NOTE(review): the importance lookup is left commented out here, unlike the
# tree and forest sections — presumably the histogram-based estimator does not
# provide this attribute; confirm before re-enabling
#gb.feature_importances_

Index(['Date', 'Futures', 'Nikkei', 'FTSE', 'DAX'], dtype='object')

In [17]:
# Line up the gradient-boosting model's held-out predictions with the true SP500 values
test_output = pd.DataFrame(
    {'pred_spx': gb.predict(X_test)}, index=X_test.index
).merge(y_test, left_index=True, right_index=True)
test_output.head()

# Per-row absolute error, computed once and reused for both summaries below
abs_error = (test_output['pred_spx'] - test_output['SP500']).abs()
mean_absolute_error = abs_error.mean()
print('Mean absolute error is ', mean_absolute_error, sep='\n')

# MAE expressed as a fraction of the mean SP500 level in the test rows
mean_absolute_error / test_output['SP500'].mean()

Unnamed: 0,pred_spx,SP500
288,4967.02,4995.06
965,4208.56,4204.11
1090,3628.26,3638.35
191,5503.11,5482.87
453,4333.92,4338.93


Mean absolute error is 
8.669035481129013


np.float64(0.002063197604961343)

## XGBoost Regressor 

In [18]:
# The native XGBoost API requires its own dataset class, DMatrix, which is
# highly optimized for memory and speed, so both halves of the split are
# wrapped here (features as data, SP500 as the label).

# Create regression matrices
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [19]:
# Booster settings for squared-error regression.
# Use "tree_method": "hist" instead of "exact" if you need speed.
params = {
    "objective": "reg:squarederror",
    "tree_method": "exact",
    "max_depth": 4,
    "learning_rate": 0.1,
}

In [20]:
# Number of boosting rounds
n = 100

# Train the booster on the training DMatrix using the params dict
model = xgb.train(params, dtrain_reg, num_boost_round=n)

In [21]:
# NOTE(review): mean_squared_error is imported here but is not used in any cell
# visible in this notebook
from sklearn.metrics import mean_squared_error
# Predict on the held-out DMatrix with the trained XGBoost booster
preds = model.predict(dtest_reg)


In [22]:
# Line up the XGBoost booster's held-out predictions with the true SP500 values
test_output = pd.DataFrame(
    {'pred_spx': preds}, index=X_test.index
).merge(y_test, left_index=True, right_index=True)
test_output.head()

# Per-row absolute error, computed once and reused for both summaries below
abs_error = (test_output['pred_spx'] - test_output['SP500']).abs()
mean_absolute_error = abs_error.mean()
print('Mean absolute error is ', mean_absolute_error, sep='\n')

# MAE expressed as a fraction of the mean SP500 level in the test rows
mean_absolute_error / test_output['SP500'].mean()

Unnamed: 0,pred_spx,SP500
288,4980.5,4995.06
965,4208.14,4204.11
1090,3640.04,3638.35
191,5480.75,5482.87
453,4342.12,4338.93


Mean absolute error is 
8.028752360564981


np.float64(0.0019108126477512194)

## Hybrid Model 

In [23]:
# Stage 1 of the hybrid: histogram gradient boosting on the raw features.
# Alternative kept for reference: LinearRegression(fit_intercept = True)
# NOTE: this rebinds `model`, replacing the XGBoost booster trained earlier.
model = HistGradientBoostingRegressor(
    random_state=50,
    min_samples_leaf=2,
    max_depth=4,
)
model.fit(X_train, y_train)

# R-square score of the stage-1 model on its own training data
model.score(X_train, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# What the stage-1 model leaves unexplained on the training rows
stage1_train_pred = model.predict(X_train)
training_residuals = y_train - stage1_train_pred

In [None]:
# Stage 2 of the hybrid: a random forest fitted to the stage-1 residuals
# rather than to the raw target.
# NOTE: this rebinds `rf`, replacing the forest fitted in the Random Forest section.
rf = RandomForestRegressor(
    random_state=50, min_samples_leaf=3, max_features="sqrt"
).fit(X_train, training_residuals)

In [None]:
# Hybrid prediction = the forest's predicted residual + the stage-1 prediction
pred_residuals = rf.predict(X_test)
stage1_test_pred = model.predict(X_test)
y_pred = pred_residuals + stage1_test_pred

In [None]:
# Line up the hybrid model's held-out predictions with the true SP500 values
test_output = pd.DataFrame(
    {'pred_spx': y_pred}, index=X_test.index
).merge(y_test, left_index=True, right_index=True)
test_output.head()

# Per-row absolute error, computed once and reused for both summaries below
abs_error = (test_output['pred_spx'] - test_output['SP500']).abs()
mean_absolute_error = abs_error.mean()
print('Mean absolute error is ', mean_absolute_error, sep='\n')

# MAE expressed as a fraction of the mean SP500 level in the test rows
mean_absolute_error / test_output['SP500'].mean()

## Plotting a Graph for Comparing Results


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Example (model, score) pairs in plotting order — replace the numbers with
# the actual MAE or R² values from your runs
score_by_model = [
    ("Decision Tree", 13.27),
    ("Bagging Regressor", 13.93),
    ("Random Forest Regressor", 11.63),
    ("Gradient Boosting Regressor", 16.42),
    ("XGBoost Regressor", 16.46),
    ("Hybrid Model", 12.39),
]
model_names = [name for name, _ in score_by_model]
model_scores = [score for _, score in score_by_model]

# Two-column frame: model label and its score
df = pd.DataFrame(score_by_model, columns=['Model', 'MAE'])

# Bar chart, one bar per model
plt.figure(figsize=(10, 6))
bars = plt.bar(df['Model'], df['MAE'], color='cornflowerblue', edgecolor='black')
plt.title('Comparison of Mean Absolute Error (MAE) Across Models', fontsize=14)
plt.xlabel('Models')
plt.ylabel('MAE (lower is better)')
plt.xticks(rotation=45)

# Write each bar's value just above its top edge, centred horizontally
for rect in bars:
    height = rect.get_height()
    x_centre = rect.get_x() + rect.get_width() / 2.0
    plt.text(x_centre, height + 0.5, f'{height:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()


# Plotting a Graph using MAPE

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# 1. Model names and their relative-error scores (fractions, not percent).
# Replace with your actual model names and MAPE values.
# NOTE(review): the evaluation cells in this notebook compute MAE / mean(SP500)
# rather than a per-row mean absolute percentage error, and the numbers below
# differ from the ratios those cells print — confirm which figures belong here.
model_names = [
    "Decision Tree",
    "Bagging Regressor",
    "Random Forest",
    "Gradient Boosting",
    "XGBoost",
    "Hybrid Model"
]

mape_scores = [
    0.008531494740122672,
    0.008956134667946088,
    0.007480228486074053,
    0.010555821477945771,
    0.010580599231789816,
    0.007966035152265596
]

# 2. Create a DataFrame with the scores converted to percent
df = pd.DataFrame({
    "Model": model_names,
    "MAPE": [score * 100 for score in mape_scores]  # convert to %
})

# 3. Plot on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(df["Model"], df["MAPE"], color='skyblue', edgecolor='black')

# 4. Annotate each bar. bar_label places the text a fixed number of points
# above the bar, so the gap no longer depends on the data scale (the previous
# +0.3 offset was about a third of the height of bars that are roughly 1% tall).
bar_labels = ax.bar_label(bars, fmt="%.2f%%", padding=3)

ax.set_title("Comparison of MAPE Across Models", fontsize=14)
ax.set_ylabel("MAPE (%)", fontsize=12)
ax.set_xlabel("Machine Learning Models", fontsize=12)
ax.tick_params(axis='x', labelrotation=45)

# Leave headroom proportional to the tallest bar. A fixed +5 percentage points
# squeezed ~1% bars into the bottom sixth of the axes. Fall back to 1 when
# every score is zero so the axis still has a non-empty range.
tallest = df["MAPE"].max()
ax.set_ylim(0, tallest * 1.15 if tallest > 0 else 1)

fig.tight_layout()
plt.show()
