# Modeling: Hyper Parameter Search

结论是目前数据有很大问题
1. 验证测试集和训练集的分布
2. OSRM
3. 用LightGBM或者Random Forest测试一下情况

In [1]:
%%time
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import pandas as pd
import numpy as np
import cupy as cp
import random
random.seed(42)
import xgboost as xgb
print("XGBoost version:", xgb.__version__)
print("XGBoost config:", xgb.get_config())
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import r2_score, mean_squared_error
from tqdm.notebook import tqdm
import joblib
import warnings
import gc

save_dir = "/root/autodl-tmp"

XGBoost version: 2.0.3
XGBoost config: {'use_rmm': False, 'verbosity': 1}
CPU times: user 1.14 s, sys: 875 ms, total: 2.01 s
Wall time: 1.4 s


In [2]:
%%time

# Load the dataset
df_train = pd.read_parquet("data/prep/train.parquet")
df_test = pd.read_parquet("data/prep/test.parquet")
    
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982163 entries, 0 to 982162
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         982163 non-null  object 
 1   vendor_id                  982163 non-null  int64  
 2   passenger_count            982163 non-null  float64
 3   store_and_fwd_flag         982163 non-null  int64  
 4   pickup_pca0                982163 non-null  float64
 5   pickup_pca1                982163 non-null  float64
 6   dropoff_pca0               982163 non-null  float64
 7   dropoff_pca1               982163 non-null  float64
 8   euclidean_distance         982163 non-null  float64
 9   pickup_hour_of_day         982163 non-null  float64
 10  day_of_week                982163 non-null  float64
 11  hour_of_week               982163 non-null  float64
 12  month_of_year              982163 non-null  float64
 13  day_of_year                98

In [3]:
%%time
    
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291713 entries, 0 to 291712
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         291713 non-null  object 
 1   vendor_id                  291713 non-null  int64  
 2   passenger_count            291713 non-null  float64
 3   store_and_fwd_flag         291713 non-null  int64  
 4   pickup_pca0                291713 non-null  float64
 5   pickup_pca1                291713 non-null  float64
 6   dropoff_pca0               291713 non-null  float64
 7   dropoff_pca1               291713 non-null  float64
 8   euclidean_distance         291713 non-null  float64
 9   pickup_hour_of_day         291713 non-null  float64
 10  day_of_week                291713 non-null  float64
 11  hour_of_week               291713 non-null  float64
 12  month_of_year              291713 non-null  float64
 13  day_of_year                29

In [4]:
%%time

# Select X (second column to second last column) and y (last column)
X_train = df_train.iloc[:, 1:-1].values
y_train = np.log(df_train.iloc[:, -1].values + 1)  # Apply log transformation
X_test = df_test.iloc[:, 1:-1].values
y_test = np.log(df_test.iloc[:, -1].values + 1)  # Apply log transformation

# Convert directly to XGBoost DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Verify conversion
print("DMatrix dtrain:", dtrain.num_row(), "rows,", dtrain.num_col(), "columns")
print("DMatrix dtest:", dtest.num_row(), "rows,", dtest.num_col(), "columns")

# Cleanup
del df_train, df_test, X_train, X_test
gc.collect()

DMatrix dtrain: 982163 rows, 29 columns
DMatrix dtest: 291713 rows, 29 columns
CPU times: user 1.65 s, sys: 664 ms, total: 2.31 s
Wall time: 1.08 s


0

In [5]:
%%time

# Define parameter grid for Hyper Parameter Search
param_dist = {
    "max_depth": [5, 7, 9, 11, 13],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.25, 0.5, 0.75, 1.0, 2.0], 
    "min_child_weight": [1, 2, 5, 10, 20, 50, 100],
    "gamma": [0.1, 0.25, 0.5, 1, 2, 5],
    "reg_alpha": [0, 0.1, 0.5, 1, 5, 10],
    "reg_lambda": [1, 10, 50, 100],
}

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 17.4 µs


In [6]:
%%time

# Sample parameter combinations from the parameter grid
param_samples = list(ParameterSampler(param_dist, n_iter=5, random_state=42))

# Initialize storage for performance metrics
test_performance = {key: [] for key in list(param_samples[0].keys())
                    + ["train_rmsle", "test_rmsle", "train_r2score", "test_r2score"]}
best_rmsle_test = float("inf")
best_model = None

# Define the evaluation set
eval_set = [(dtrain, "train"), (dtest, "eval")]

# Define the base parameters
base_params = {
    "nthread": 4,
    "device": "cuda",
    "tree_method": "hist",
    "booster": "gbtree",
    "eval_metric": "rmse",
    "objective": "reg:squarederror",
    "random_state": 42,
}

CPU times: user 826 µs, sys: 1.94 ms, total: 2.76 ms
Wall time: 1.81 ms


In [7]:
test_performance

{'subsample': [],
 'reg_lambda': [],
 'reg_alpha': [],
 'min_child_weight': [],
 'max_depth': [],
 'learning_rate': [],
 'gamma': [],
 'colsample_bytree': [],
 'train_rmsle': [],
 'test_rmsle': [],
 'train_r2score': [],
 'test_r2score': []}

In [8]:
%%time

# Iterate over each parameter combination
for params in tqdm(param_samples, desc="Hyperparameter Search Progress...", miniters=100, leave=False):
    # Merge base parameters with current set of hyperparameters
    current_params = {**base_params, **params}
    
    # Train the model
    model = xgb.train(
        params=current_params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=eval_set,
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    
    # Make predictions on the training and test sets
    y_pred_train = model.predict(dtrain)
    y_pred_test = model.predict(dtest)
    
    # Calculate R^2 scores
    train_score = r2_score(y_train, y_pred_train, force_finite=False)
    test_score = r2_score(y_test, y_pred_test, force_finite=False)
    
    # Calculate RMSLE(RMSE & log(Y))
    train_rmsle = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmsle = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    # Append parameter values and RMSE scores
    for param in params.keys():
        test_performance[param].append(params[param])
    
    test_performance["train_r2score"].append(train_score)
    test_performance["test_r2score"].append(test_score)
    test_performance["train_rmsle"].append(train_rmsle)
    test_performance["test_rmsle"].append(test_rmsle)
    
    # Update best_model if the current test RMSLE is better
    if test_rmsle < best_rmsle_test:
        best_model = model
        best_rmsle_test = test_rmsle

# Cleanup memory
gc.collect()

Hyperparameter Search Progress...:   0%|          | 0/5 [00:00<?, ?it/s]

{'subsample': [0.9], 'reg_lambda': [100], 'reg_alpha': [0.1], 'min_child_weight': [2], 'max_depth': [11], 'learning_rate': [0.05], 'gamma': [0.25], 'colsample_bytree': [0.8], 'train_rmsle': [0.24068537794786146], 'test_rmsle': [0.5242402708566164], 'train_r2score': [0.8204304032275787], 'test_r2score': [0.5668410976816647]}
{'subsample': [0.9, 0.8], 'reg_lambda': [100, 10], 'reg_alpha': [0.1, 10], 'min_child_weight': [2, 10], 'max_depth': [11, 13], 'learning_rate': [0.05, 0.01], 'gamma': [0.25, 2], 'colsample_bytree': [0.8, 0.8], 'train_rmsle': [0.24068537794786146, 0.26162224639662374], 'test_rmsle': [0.5242402708566164, 0.5271134952897151], 'train_r2score': [0.8204304032275787, 0.7878306068028389], 'test_r2score': [0.5668410976816647, 0.5620800239026824]}
{'subsample': [0.9, 0.8, 0.8], 'reg_lambda': [100, 10, 50], 'reg_alpha': [0.1, 10, 0.5], 'min_child_weight': [2, 10, 20], 'max_depth': [11, 13, 13], 'learning_rate': [0.05, 0.01, 0.05], 'gamma': [0.25, 2, 0.5], 'colsample_bytree': [

100

In [9]:
%%time

# Generate folders to save model outputs
os.makedirs("model", exist_ok=True)

# Generate a DataFrame to store performance results
performance_df = pd.DataFrame(test_performance)

# Find the parameter combination with the lowest RMSE on the test set
best_params_test = performance_df.loc[
    performance_df["test_rmsle"].idxmin()].to_dict()

print("Best parameters on test set: ",
      best_params_test)
print("Best RMSE on test set: ",
      best_rmsle_test)

# Save the best model to file
best_model.save_model(os.path.join("model", "best_model_found.json"))

# Optionally, save the performance DataFrame to a CSV file
performance_df.to_csv(os.path.join("model", "hyperparameter_search_results.csv"),
                      index=False)

Best parameters on test set:  {'subsample': 0.9, 'reg_lambda': 100.0, 'reg_alpha': 5.0, 'min_child_weight': 20.0, 'max_depth': 7.0, 'learning_rate': 0.1, 'gamma': 2.0, 'colsample_bytree': 1.0, 'train_rmsle': 0.26939812758723125, 'test_rmsle': 0.5205461288230613, 'train_r2score': 0.7750310717487922, 'test_r2score': 0.5729242345179903}
Best RMSE on test set:  0.5205461288230613
CPU times: user 16.4 ms, sys: 4.6 ms, total: 21 ms
Wall time: 24 ms




In [11]:
for key, value in test_performance.items():
    print(f"{key}: {len(value)}")

subsample: 6
reg_lambda: 6
reg_alpha: 6
min_child_weight: 6
max_depth: 6
learning_rate: 6
gamma: 6
colsample_bytree: 6
train_rmsle: 5
test_rmsle: 5
train_r2score: 6
test_r2score: 6


In [12]:
performance_df.head(20)

{'subsample': [0.9, 0.9, 0.8, 0.8, 0.9, 1.0],
 'reg_lambda': [100, 100, 10, 50, 100, 100],
 'reg_alpha': [0.1, 0.1, 10, 0.5, 5, 10],
 'min_child_weight': [2, 2, 10, 20, 20, 10],
 'max_depth': [11, 11, 13, 13, 7, 13],
 'learning_rate': [0.05, 0.05, 0.01, 0.05, 0.1, 0.01],
 'gamma': [0.25, 0.25, 2, 0.5, 2, 0.25],
 'colsample_bytree': [0.8, 0.8, 0.8, 0.8, 1.0, 0.8],
 'train_rmsle': [0.24068537794786146,
  0.26162224639662374,
  0.24340832707636195,
  0.26939812758723125,
  0.24607773968349803],
 'test_rmsle': [0.5242402708566164,
  0.5271134952897151,
  0.5255561508377772,
  0.5205461288230613,
  0.5261307047010538],
 'train_r2score': [0.8204304032275787,
  0.8204304032275787,
  0.7878306068028389,
  0.8163443656481106,
  0.7750310717487922,
  0.8122940447303382],
 'test_r2score': [0.5668410976816647,
  0.5668410976816647,
  0.5620800239026824,
  0.5646638499143907,
  0.5729242345179903,
  0.563711484356304]}

---