**Imports**

In [9]:
import xgboost as xgb
from xgboost import XGBRegressor

import sklearn as sk
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold, TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

import shap
import warnings



**Load Data**

In [10]:
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

print(train_data.head())
print(test_data.head())

FEATURES = [col for col in train_data.columns if col in list("ABCDEFGHIJKLMN")]
TARGET = ["Y1", "Y2"]

train_data= train_data.sort_values("time")
test_data= test_data.sort_values("time")

strong_features = ['C','E', 'G', 'H','J', 'M', 'N']

for col in strong_features:
    train_data[f"{col}_roll_mean_5"] = train_data[col].rolling(2, min_periods=1).mean()
    train_data[f"{col}_roll_std_5"] = train_data[col].rolling(2, min_periods=1).std()
    train_data[f"{col}_roll_diff1"] = train_data[col].diff(1)

    test_data[f"{col}_roll_mean_5"] = test_data[col].rolling(2, min_periods=1).mean()
    test_data[f"{col}_roll_std_5"] = test_data[col].rolling(2, min_periods=1).std()
    test_data[f"{col}_roll_diff1"] = test_data[col].diff(1)


NEW_FEATURES = [col for col in train_data.columns if "roll" in col or "diff" in col]
FEATURES_EXTENDED = FEATURES + NEW_FEATURES

X = train_data[FEATURES_EXTENDED].copy()
y1 = train_data["Y1"].copy()
y2 = train_data["Y2"].copy()
y1y2 = train_data[["Y1", "Y2"]].copy()
X_test = test_data[FEATURES_EXTENDED].copy()




   time         A         B         C         D         E         F         G  \
0     0  0.207366 -0.159951 -0.634176 -0.580962 -0.266505  0.060173 -0.475257   
1     1  0.188828 -0.265508  0.042143 -0.550442 -0.132319 -0.185219  0.028295   
2     2 -0.144261 -0.577142 -0.214634 -0.747391 -0.184255 -0.464831 -0.085181   
3     3  0.208982 -0.310449  0.513708 -0.562868  0.742308 -0.305487  0.762246   
4     4  0.093320 -0.358156  0.173188 -0.687296 -0.161461 -0.116062 -0.245748   

          H         I         J         K         L         M         N  \
0 -1.486516 -0.332594 -0.671466 -0.226149 -0.187624 -0.780237 -0.785965   
1  0.093210 -0.518139 -0.251917 -0.347845 -0.359069 -0.161254  0.020401   
2  0.700449 -0.603438  0.197773 -0.566696 -0.580799  0.202726  0.135261   
3  1.363020 -0.384575  0.525556 -0.348514 -0.428099  0.548993  0.471031   
4  0.863372 -0.655588 -0.263358 -0.557428 -0.481214  0.083602  0.003087   

         Y1        Y2  
0 -0.935902 -0.310081  
1 -0.089707 -0

**Models**

In [11]:
#Make XGBRegressor model
def make_xgbregressor():
    return XGBRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1,
        reg_alpha=0,
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
        verbosity=1,
        eval_metric='rmse',
        early_stopping_rounds=100
    )

#Test without early stopping, and with RandomSearchCV for hyperparameter tuning
def make_xgbregressor_no_early_stopping():
    return XGBRegressor(
        n_estimators=1000,
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
        eval_metric='rmse'
    )

#Parameter distribution for RandomizedSearchCV
param_dist = {
    "max_depth": [3, 4, 5, 6, 7],
    "min_child_weight": [1, 2, 5, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "reg_lambda": [0.5, 1.0, 2.0, 5.0, 10.0],
    "reg_alpha": [0.0, 0.1, 0.5, 1.0],
    "gamma": [0, 0.1, 0.3, 0.5, 1.0]
}


**Train the model**

In [12]:
def train_model(X, y, X_test, cv, make_model_fn):
    oof = np.zeros(len(X))
    test_fold_preds = []

    for fold, (train, val) in enumerate(cv.split(X)):
        X_train, X_val = X.iloc[train], X.iloc[val]
        y_train, y_val = y.iloc[train], y.iloc[val]

        model=make_model_fn()

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        oof[val] = model.predict(X_val)

        test_fold_preds.append(model.predict(X_test))

        print(f"fold {fold+1} R^2: {r2_score(y_val, oof[val]):.4f}")
        cv_r2 = r2_score(y, oof)
        print(f"CV R^2: {cv_r2:.4f}")
        print()

        test_pred = np.mean(test_fold_preds, axis=0)

    return oof, test_pred

**Defining Folds**

In [13]:
#Time series cross validation
tscv = TimeSeriesSplit(n_splits=5) 

#Kfold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

**Running the model**

In [14]:
#No parameter tuning
print("Training for Y1")
oof_y1, test_pred_y1 = train_model(X, y1, X_test, tscv, make_xgbregressor)
print("\nTraining for Y2")
oof_y2, test_pred_y2 = train_model(X, y2, X_test, tscv, make_xgbregressor)

#RandomizedSearchCV for hyperparameter tuning (without early stopping)
print("Hyperparameter tuning for Y1")
rs_y1 = RandomizedSearchCV(
    estimator=make_xgbregressor_no_early_stopping(),
    param_distributions=param_dist,
    n_iter=50,
    scoring='r2',
    cv=tscv,
    n_jobs=-1,
    random_state=1,
    verbose=1
)

rs_y1.fit(X, y1)
print(f"Best parameters for Y1: {rs_y1.best_params_}")
print(f"Best CV R^2 for Y1: {rs_y1.best_score_:.4f}")
best_y1 = rs_y1.best_estimator_
test_pred_y1_tuned = best_y1.predict(X_test)

print("\nHyperparameter tuning for Y2")
rs_y2 = RandomizedSearchCV(
    estimator=make_xgbregressor_no_early_stopping(),
    param_distributions=param_dist,
    n_iter=50,
    scoring='r2',
    cv=tscv,
    n_jobs=-1,
    random_state=1,
    verbose=1
)
rs_y2.fit(X, y2)
print(f"Best parameters for Y2: {rs_y2.best_params_}")
print(f"Best CV R^2 for Y2: {rs_y2.best_score_:.4f}")
best_y2 = rs_y2.best_estimator_
test_pred_y2_tuned = best_y2.predict(X_test)

Training for Y1
fold 1 R^2: 0.7374
CV R^2: 0.0529

fold 2 R^2: 0.6935
CV R^2: 0.1317

fold 3 R^2: 0.7594
CV R^2: 0.2809

fold 4 R^2: 0.7673
CV R^2: 0.4266

fold 5 R^2: 0.7138
CV R^2: 0.6373


Training for Y2
fold 1 R^2: 0.6896
CV R^2: 0.0747

fold 2 R^2: 0.6445
CV R^2: 0.1371

fold 3 R^2: 0.6977
CV R^2: 0.2345

fold 4 R^2: 0.6976
CV R^2: 0.3285

fold 5 R^2: 0.5449
CV R^2: 0.5135

Hyperparameter tuning for Y1
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Y1: {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'min_child_weight': 8, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 0.5, 'colsample_bytree': 0.6}
Best CV R^2 for Y1: 0.7347

Hyperparameter tuning for Y2
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Y2: {'subsample': 0.6, 'reg_lambda': 10.0, 'reg_alpha': 1.0, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1.0, 'colsample_bytree': 1.0}
Best CV R^2 for Y2: 0.6462


**Submission**

In [15]:
#Prepare submission
submission = test_data[["id"]].copy()
submission["Y1"] = test_pred_y1_tuned
submission["Y2"] = test_pred_y2_tuned


assert submission.isnull().sum().sum() == 0, "There are missing values in submission!"

# save to CSV
submission.to_csv("submission.csv", index=False)

print("Saved submission.csv")
print(submission.head())


Saved submission.csv
   id        Y1        Y2
0   1  0.287263 -0.528851
1   2 -0.184273 -0.523373
2   3 -0.208544 -0.349863
3   4 -0.355431 -0.204670
4   5 -0.917815 -0.028433
