In [None]:
import numpy as np
from custom_tree.decision_tree_rl_split import RLDecisionTreeRegressor

class CustomXGBoostRL:
    """Boosted ensemble of ``RLDecisionTreeRegressor`` trees for regression.

    Each boosting round fits a new tree to the current residuals
    (``y - y_pred``) and adds ``learning_rate`` times that tree's predictions
    to the running estimate. Note that, despite the class name, the update
    here is plain residual fitting; no second-order (Hessian) terms or
    regularisation are applied by this class.

    Missing feature values are imputed with the per-column mean learned from
    the training data, and the same fill values are reused at prediction time
    so train and inference preprocessing agree.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (trees) built by ``fit``.
    learning_rate : float
        Shrinkage factor applied to every tree's contribution.
    max_depth, min_samples_split, min_impurity_decrease, n_features, policy_lr
        Forwarded unchanged to each ``RLDecisionTreeRegressor``; their exact
        semantics are defined by that class.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=10,
                 min_impurity_decrease=1e-7, n_features=None, policy_lr=1e-2):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity_decrease = min_impurity_decrease
        self.policy_lr = policy_lr
        self.n_features = n_features
        # Fitted state, populated by fit().
        self.trees = []
        self.init_val = 0
        self._feature_means = None  # per-column fill values learned in fit()

    @staticmethod
    def _column_means(X):
        """Return the mean of each column ignoring NaN (0.0 for all-NaN columns)."""
        X = np.asarray(X, dtype=float)
        present = ~np.isnan(X)
        counts = present.sum(axis=0)
        totals = np.where(present, X, 0.0).sum(axis=0)
        # `where=` avoids a 0/0 warning; columns without any value keep the 0.0 default.
        return np.divide(totals, counts, out=np.zeros(np.shape(counts), dtype=float),
                         where=counts > 0)

    @staticmethod
    def _impute(X, fill_values):
        """Return a float copy of ``X`` with NaN replaced column-wise by ``fill_values``."""
        X = np.array(X, dtype=float)  # copy, so the caller's array is never mutated
        missing = np.isnan(X)
        if missing.any():
            X[missing] = np.broadcast_to(fill_values, X.shape)[missing]
        # No NaN is left at this point; this call only clips +/-inf to finite values,
        # as the original nan_to_num-based preprocessing did.
        return np.nan_to_num(X)

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and targets ``y``.

        Missing features are filled with training column means; missing
        targets are filled with the mean of the observed targets. Any
        previously fitted trees are discarded, so calling ``fit`` again
        retrains from scratch instead of growing the old ensemble.
        """
        X = np.asarray(X, dtype=float)
        self._feature_means = self._column_means(X)
        X = self._impute(X, self._feature_means)

        y = np.asarray(y, dtype=float)
        y = np.nan_to_num(y, nan=np.nanmean(y))

        # The initial prediction is the target mean; trees then model what is left.
        self.init_val = np.mean(y)
        y_pred = np.full(y.shape, self.init_val)

        self.trees = []
        for _ in range(self.n_estimators):
            residual = y - y_pred
            tree = RLDecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_impurity_decrease=self.min_impurity_decrease,
                n_features=self.n_features,
                policy_lr=self.policy_lr
            )
            tree.fit(X, residual)
            y_pred += self.learning_rate * tree.predict(X)
            self.trees.append(tree)

    def predict(self, X):
        """Return predictions for ``X`` as a 1-D float array of length ``X.shape[0]``.

        NaN features are filled with the column means learned in ``fit``. If
        no fitted means are available (e.g. the trees were assigned manually),
        the column means of ``X`` itself are used as a fallback.
        """
        X = np.asarray(X, dtype=float)
        fill_values = getattr(self, "_feature_means", None)
        if fill_values is None:
            fill_values = self._column_means(X)
        X = self._impute(X, fill_values)

        # Explicit float dtype so the in-place accumulation below is always valid,
        # even while init_val is still the integer default.
        y_pred = np.full((X.shape[0],), self.init_val, dtype=float)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred


In [None]:
import pandas as pd
from sklearn.metrics import r2_score

# Load the training data and the held-out ("unseen") evaluation data.
df_train = pd.read_csv("data/ames.csv")
df_test = pd.read_csv("data/unseen_ames.csv")

# Coerce every column to numeric (unparseable values become NaN) and drop the
# columns that end up entirely NaN, i.e. those with no numeric content at all.
df_train = df_train.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
df_test = df_test.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')

# The training frame defines the feature set and its column order.
feature_columns = df_train.columns.drop("SalePrice")

y_train = df_train["SalePrice"].values
X_train = df_train[feature_columns].values

y_test = df_test["SalePrice"].values
# dropna above runs independently per frame, so the two frames can end up with
# different columns. Reindexing to the training features guarantees X_test has
# the same columns in the same order; features absent from the test frame
# become NaN columns, and extra test-only columns are ignored.
X_test = df_test.reindex(columns=feature_columns).values

n_features = X_train.shape[1]

model = CustomXGBoostRL(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=5,
    min_impurity_decrease=1e-7,
    n_features=n_features,
    policy_lr=1e-2
)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Error metrics on both splits: MSE, RMSE and the coefficient of determination.
train_mse = np.mean((y_train - y_train_pred) ** 2)
test_mse = np.mean((y_test - y_test_pred) ** 2)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Test MSE:", test_mse)
print()
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print()
print("Train R²:", train_r2)
print("Test R²:", test_r2)
