In [153]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

In [130]:
# Load the combined dataset; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer="combined.csv", index_col=0)

In [163]:
class Tester:
    """
    Small harness that trains models on the leading part of a dataset and
    evaluates them on the trailing part.

    The split is positional (first rows -> train, last rows -> test), so the
    rows are assumed to already be in chronological order -- TODO confirm for
    the dataset being passed in.

    Feature columns are Min-Max scaled. The scaler is fitted on the training
    rows only and then applied to every row, so no statistics from the test
    period leak into training. The target column is never scaled.
    """

    def __init__(self, data, metrics, target_column="Gold_Close", split_ratio=0.2):
        """
        Initialize the Tester class with the dataset and metrics.

        :param data: The dataset as a pandas DataFrame containing the feature
            columns and the target column.
        :param metrics: A list of metric callables with the signature
            ``metric(y_true, y_pred)``.
        :param target_column: Name of the column holding the target variable.
        :param split_ratio: Fraction of rows (taken from the end) used as the
            test set. Must satisfy ``0 <= split_ratio < 1``.
        :raises ValueError: If ``split_ratio`` is out of range or leaves no
            training rows.
        """
        self.data = data
        self.split_ratio = split_ratio
        self.metrics = metrics
        self.target_column = target_column

        self.scaler = None
        self.data_scaled = None
        self.X_train, self.y_train, self.X_test, self.y_test = None, None, None, None

        # split_data() re-fits the scaler for the requested split before slicing,
        # so the train/test attributes always hold consistently scaled features.
        self.split_data(split_ratio=self.split_ratio)

    @staticmethod
    def _metric_name(metric):
        """Return a printable name for a metric callable (partials have no __name__)."""
        return getattr(metric, "__name__", repr(metric))

    def _split_index(self):
        """Return the positional index of the first test row for the current split_ratio."""
        return int(len(self.data) * (1 - self.split_ratio))

    def split_data(self, split_ratio=0.2, random_state=42):
        """
        Split the data into training and testing sets by performing a
        deterministic, contiguous split: the first ``1 - split_ratio`` share of
        rows becomes the training set and the remaining rows the test set.

        The features are re-scaled for the new split (see ``preprocess_data``)
        before ``X_train`` / ``X_test`` are populated.

        :param split_ratio: The proportion of the dataset to include in the test split.
        :param random_state: Unused. The split is not shuffled; the parameter is
            kept only so existing callers that pass it keep working.
        :raises ValueError: If ``split_ratio`` is not in ``[0, 1)`` or the
            training split would be empty.
        """
        if not 0 <= split_ratio < 1:
            raise ValueError(f"split_ratio must be in [0, 1), got {split_ratio!r}")

        self.split_ratio = split_ratio
        self.preprocess_data()

        split_index = self._split_index()
        # .iloc makes the positional (not label-based) nature of the split explicit.
        train_data = self.data_scaled.iloc[:split_index]
        test_data = self.data_scaled.iloc[split_index:]

        self.X_train = train_data.drop(columns=[self.target_column])
        self.y_train = train_data[[self.target_column]]

        self.X_test = test_data.drop(columns=[self.target_column])
        self.y_test = test_data[[self.target_column]]

    def preprocess_data(self):
        """
        Apply Min-Max scaling to the feature columns and store the result in
        ``self.data_scaled`` (features first, unscaled target as last column).

        The scaler is fitted on the training rows of the current split only and
        then used to transform all rows; test-row values may therefore fall
        outside ``[0, 1]``. The fitted scaler is kept in ``self.scaler``.

        :raises ValueError: If the current split leaves no training rows.
        """
        split_index = self._split_index()
        if split_index == 0:
            raise ValueError(
                "training split is empty; provide more rows or a smaller split_ratio"
            )

        X = self.data.drop(columns=[self.target_column])

        scaler = MinMaxScaler()  # Initialize the scaler
        # Fit on the training window only to avoid look-ahead leakage.
        scaler.fit(X.iloc[:split_index])
        X_scaled = scaler.transform(X)

        self.scaler = scaler
        self.data_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=self.data.index)
        # Assign by position so duplicate index labels cannot trigger realignment.
        self.data_scaled[self.target_column] = self.data[self.target_column].to_numpy()

    def run(self, model):
        """
        Run the model on the training data and evaluate it on the test data using the provided metrics.

        :param model: An object exposing ``fit(X, y)`` and ``predict(X)``.
        :return: A single-row pandas DataFrame with one column per metric.
        """
        # Train the model
        model.fit(self.X_train, self.y_train)

        # Predict on the test set
        y_pred = model.predict(self.X_test)

        # Evaluate using the provided metrics
        results = {
            self._metric_name(metric): metric(self.y_test, y_pred)
            for metric in self.metrics
        }

        # Return the results as a pandas DataFrame
        return pd.DataFrame([results])

    def run_multiple(self, models):
        """
        Run multiple models using the run function and return the results in a DataFrame.

        :param models: An iterable of ``(model_name, model)`` tuples.
        :return: A pandas DataFrame indexed by ``Model`` with one row per model
            and one column per metric. Empty (but correctly labelled) when no
            models are given.
        """
        results_list = []

        for model_name, model in models:
            print(f"Running model: {model_name}")
            result = self.run(model)
            result.insert(0, 'Model', model_name)  # Insert the model name as the first column
            results_list.append(result)

        if not results_list:
            # pd.concat raises on an empty list; return an empty table instead.
            metric_names = [self._metric_name(metric) for metric in self.metrics]
            return pd.DataFrame(columns=metric_names).rename_axis("Model")

        # Concatenate all results into a single DataFrame
        return pd.concat(results_list, ignore_index=True).set_index("Model")

In [167]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, root_mean_squared_error

# Evaluate every model on the last 20% of rows using regression metrics.
tester = Tester(
    data=df,
    metrics=[mean_squared_error, r2_score, root_mean_squared_error],
    split_ratio=0.2,
)

In [168]:
# Benchmark (taking mean of the data)

class Benchmark:
    """
    Naive baseline regressor: always predicts the mean of the training target.

    Exposes the same ``fit`` / ``predict`` surface as the other models so it
    can be passed to ``Tester.run`` / ``Tester.run_multiple``.
    """

    def __init__(self):
        # Mean of the target seen in fit(); None until fit() has been called.
        self.mean = None

    def fit(self, X, y):
        """
        Store the mean of ``y`` (NaNs ignored, as pandas' ``mean`` does).

        :param X: Ignored; accepted for API compatibility.
        :param y: Target values (Series, single-column DataFrame or array-like).
        :return: ``self``, so calls can be chained like scikit-learn estimators.
        """
        # Reduce to a plain float so predict() does not rely on broadcasting a
        # one-element Series into the output array.
        self.mean = float(np.nanmean(np.asarray(y, dtype=float)))
        return self

    def predict(self, X):
        """
        Predict the stored mean for every row of ``X``.

        :param X: Anything with a length; only ``len(X)`` is used.
        :return: A ``(len(X), 1)`` numpy array filled with the training mean.
        :raises RuntimeError: If called before ``fit``.
        """
        if self.mean is None:
            raise RuntimeError("Benchmark.predict() called before fit()")
        return np.full((len(X), 1), self.mean)

In [169]:
# Compare the mean-value baseline against a linear model and a boosted-tree model.
tester.run_multiple(
    models=[
        ("Benchmark", Benchmark()),
        ("Linear Regression", LinearRegression()),
        (
            "XGBoost",
            XGBRegressor(
                n_estimators=50,
                learning_rate=0.05,
                max_depth=8,
                min_child_weight=4,
                subsample=0.7,
                colsample_bytree=0.7,
            ),
        ),
    ]
)

Running model: Benchmark
Running model: Linear Regression
Running model: XGBoost


Unnamed: 0_level_0,mean_squared_error,r2_score,root_mean_squared_error
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benchmark,318033.573129,-6.708696,563.944654
Linear Regression,200956.727461,-3.870914,448.281973
XGBoost,254348.824417,-5.165066,504.330075


100

## Walk-Forward Validation

In [60]:
# Keep only the first 100 rows for the walk-forward demo below.
# NOTE: this rebinds `df`, so the full frame is gone until the load cell is re-run.
df = df.head(100)

In [72]:
# Walk-forward (expanding-window) validation: each fold trains on every row seen
# so far and tests on the single next row.
INITIAL_TRAIN_SIZE = 10  # rows in the training window of the first fold

X = df.drop(columns=["Gold_Close"])

# With test_size=1 the first fold trains on len(X) - n_splits rows, so deriving
# n_splits from the frame length keeps the initial window fixed at
# INITIAL_TRAIN_SIZE instead of relying on the frame having exactly 100 rows.
tscv = TimeSeriesSplit(test_size=1, n_splits=len(X) - INITIAL_TRAIN_SIZE)

for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print("Fold", i)
    print("Train: ", train_index)
    print("Test: ", test_index)
    print()

Fold 0
Train:  [0 1 2 3 4 5 6 7 8 9]
Test:  [10]

Fold 1
Train:  [ 0  1  2  3  4  5  6  7  8  9 10]
Test:  [11]

Fold 2
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11]
Test:  [12]

Fold 3
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12]
Test:  [13]

Fold 4
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
Test:  [14]

Fold 5
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Test:  [15]

Fold 6
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
Test:  [16]

Fold 7
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
Test:  [17]

Fold 8
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
Test:  [18]

Fold 9
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
Test:  [19]

Fold 10
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
Test:  [20]

Fold 11
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
Test:  [21]

Fold 12
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
Test: 