# Modeling

In [1]:
%%time
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import pandas as pd
import numpy as np
import cupy as cp
import random
random.seed(42)
import xgboost as xgb
print("XGBoost version:", xgb.__version__)
print("XGBoost config:", xgb.get_config())
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import joblib
import gc
from itertools import product

save_dir = "/root/autodl-tmp"

XGBoost version: 2.0.3
XGBoost config: {'use_rmm': False, 'verbosity': 1}
CPU times: user 1.36 s, sys: 1.17 s, total: 2.54 s
Wall time: 1.91 s


In [2]:
%%time

# Load the dataset
df_train = pd.read_parquet("data/prep/df_train.parquet")
df_test = pd.read_parquet("data/prep/df_test.parquet")
    
df_train.info()

# Select X (second column to second last column) and y (last column)
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, -1].values 
X_test = df_test.iloc[:, 1:-1].values
y_test = df_test.iloc[:, -1].values

# Convert X_train, y_train, X_test, y_test to CuPy arrays
X_train = cp.asarray(X_train).get()
y_train = cp.asarray(y_train).get()
X_test = cp.asarray(X_test).get()
y_test = cp.asarray(y_test).get()

# Verify the conversion
print("X_train type:", type(X_train))  # Should be <class 'cupy.ndarray'>
print("y_train type:", type(y_train))  # Should be <class 'cupy.ndarray'>
print("X_test type:", type(X_test))    # Should be <class 'cupy.ndarray'>
print("y_test type:", type(y_test))    # Should be <class 'cupy.ndarray'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982163 entries, 0 to 982162
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         982163 non-null  object 
 1   vendor_id                  982163 non-null  int64  
 2   passenger_count            982163 non-null  float64
 3   store_and_fwd_flag         982163 non-null  int64  
 4   pickup_pca0                982163 non-null  float64
 5   pickup_pca1                982163 non-null  float64
 6   dropoff_pca0               982163 non-null  float64
 7   dropoff_pca1               982163 non-null  float64
 8   euclidean_distance         982163 non-null  float64
 9   pickup_hour_of_day         982163 non-null  float64
 10  day_of_week                982163 non-null  float64
 11  hour_of_week               982163 non-null  float64
 12  month_of_year              982163 non-null  float64
 13  day_of_year                98

In [3]:
%%time

# Define parameter grid for RandomizedSearchCV
param_dist = {
    "max_depth": [7, 9, 11, 13],                   # Maximum tree depth
    "learning_rate": [0.010, 0.050, 0.1],          # Boosting learning rate (eta)
    "subsample": [0.7, 0.8, 0.9],                  # Subsample ratio of training instances
    "colsample_bytree": [0.7, 0.8, 0.9],           # Subsample ratio of columns per tree
    "min_child_weight": [0.5, 0.75, 1, 3, 5],      # Minimum sum of instance weight (hessian)
    "gamma": [0.1, 0.25, 0.5, 0.75, 1],              # Minimum loss reduction for split
    "reg_alpha": [0.1, 0.5, 1, 2],              # L1 regularization term (alpha)
    "reg_lambda": [0.1, 0.5, 1, 2],             # L2 regularization term (lambda)
}

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 14.8 µs


In [4]:
%%time

# Initialize XGBRegressor with GPU support
xgb_model = xgb.XGBRegressor(
    n_estimators=5000,             # Number of boosting rounds
    device="cuda",                 # Use GPU for calculation
    objective="reg:squarederror",  # Regression objective (RMSE)
    eval_metric="rmse",            # Evaluation metric (RMSE)
    random_state=42                # Random seed for reproducibility
)

# Define parameter grid for RandomizedSearchCV
param_dist = {
    "max_depth": [7, 9, 11, 13],
    "learning_rate": [0.010, 0.050, 0.1],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "min_child_weight": [0.5, 0.75, 1, 3, 5],
    "gamma": [0.1, 0.25, 0.5, 0.75, 1],
    "reg_alpha": [0.1, 0.5, 1, 2],
    "reg_lambda": [0.1, 0.5, 1, 2],
}

# Define the GridSearchCV object
xgb_grid_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=500,
    cv=5,
    scoring="neg_mean_squared_error",  # Use negative RMSE for scoring
    verbose=2,
)

CPU times: user 22 µs, sys: 38 µs, total: 60 µs
Wall time: 72.5 µs


In [5]:
%%time

# Fit the GridSearchCV object
xgb_grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=7, min_child_weight=0.5, reg_alpha=1, reg_lambda=0.1, subsample=0.9; total time=  30.7s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=7, min_child_weight=0.5, reg_alpha=1, reg_lambda=0.1, subsample=0.9; total time=  30.4s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=7, min_child_weight=0.5, reg_alpha=1, reg_lambda=0.1, subsample=0.9; total time=  30.5s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=7, min_child_weight=0.5, reg_alpha=1, reg_lambda=0.1, subsample=0.9; total time=  30.4s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=7, min_child_weight=0.5, reg_alpha=1, reg_lambda=0.1, subsample=0.9; total time=  30.3s
[CV] END colsample_bytree=0.7, gamma=0.25, learning_rate=0.01, max_depth=7, min_child_weight=1, reg_alpha=2, reg_lambda=0.1, subsample=0.7; total time=  34.7s
[CV] END colsample_bytree=0.7, gamma=0.25, learning

In [6]:
%%time

# Get the best parameters and best score
best_params = xgb_grid_search.best_params_
best_score = np.sqrt(-xgb_grid_search.best_score_)  # Convert back to RMSE

print("Best Parameters:", best_params)
print("Best RMSE:", best_score)

Best Parameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'min_child_weight': 1, 'max_depth': 13, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}
Best RMSE: 0.041941542719797525
CPU times: user 97 µs, sys: 68 µs, total: 165 µs
Wall time: 182 µs


%%time

# Define parameter grid for RandomizedSearchCV
param_dist = {
    "max_depth": [7, 9, 11, 13],                   # Maximum tree depth # 13
    "learning_rate": [0.010, 0.050, 0.1],          # Boosting learning rate (eta) # 0.050
    "subsample": [0.7, 0.8, 0.9],                  # Subsample ratio of training instances # 0.9
    "colsample_bytree": [0.7, 0.8, 0.9],           # Subsample ratio of columns per tree # 0.9
    "min_child_weight": [0.5, 0.75, 1, 3, 5],      # Minimum sum of instance weight (hessian) # 1
    "gamma": [0.1, 0.25, 0.5, 0.75, 1],              # Minimum loss reduction for split # 0.1
    "reg_alpha": [0.1, 0.5, 1, 2],              # L1 regularization term (alpha) # 0.1
    "reg_lambda": [0.1, 0.5, 1, 2],             # L2 regularization term (lambda) # 0.1
}

---