# CatBoost Hyperparameter Optimization

Optimize CatBoost hyperparameters to improve CV score.

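Parameters to test:
- Learning rates: [0.01, 0.03, 0.05, 0.1], with the iteration budget raised as the learning rate is lowered (2500, 1500, 1000, 500)
- Depths: [4, 5, 6, 7, 8] (depth 6 is covered by the learning-rate sweep)
- Regularization: reg_lambda values [1, 3, 5, 10] (planned; not swept in the cells below)
- Early stopping rounds: [30, 50, 100] (planned; the cells below use a fixed value of 50)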

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

# Single seed shared by NumPy's global RNG here and by later cells that read SEED.
SEED = 42
np.random.seed(SEED)

# Input locations for the two competition splits.
train_csv = '/home/code/data/train.csv'
test_csv = '/home/code/data/test.csv'

print("Loading data...")
train_df, test_df = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Quick shape check so a truncated or wrong file is obvious right away.
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

Loading data...
Train: (8000, 9), Test: (2000, 9)


In [2]:
def create_features(df):
    """Return a copy of ``df`` extended with engineered numeric features.

    The input frame is not modified. New columns are appended in this order:

    * ``<col>_log1p`` for each of Age, Height, Weight, Duration, Heart_Rate
      and Body_Temp (``np.log1p`` of the raw column),
    * the products ``Weight_Duration``, ``Duration_Heart_Rate`` and
      ``Height_Weight``,
    * the ratio ``Weight_Height`` (Weight / (Height + 1e-6)),
    * ``BMI`` (Weight / ((Height / 100) ** 2 + 1e-6)).

    Args:
        df: DataFrame containing the six raw numeric columns listed above.

    Returns:
        A new DataFrame with all original columns followed by the
        engineered ones.
    """
    raw_numeric = ('Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp')
    eps = 1e-6  # keeps the two divisions below finite when Height is 0

    weight, height = df['Weight'], df['Height']
    duration, heart_rate = df['Duration'], df['Heart_Rate']

    # Insertion order of this dict fixes the order of the appended columns.
    engineered = {f'{name}_log1p': np.log1p(df[name]) for name in raw_numeric}
    engineered.update({
        'Weight_Duration': weight * duration,
        'Duration_Heart_Rate': duration * heart_rate,
        'Height_Weight': height * weight,
        'Weight_Height': weight / (height + eps),
        # Height is divided by 100 before squaring (cm -> m, presumably).
        'BMI': weight / ((height / 100) ** 2 + eps),
    })

    out = df.copy()
    for column, values in engineered.items():
        out[column] = values
    return out

# Apply the same feature engineering to both splits.
train_feat, test_feat = create_features(train_df), create_features(test_df)

# Everything except the row identifier and the target is a model input.
non_feature_cols = {'id', 'Calories'}
feature_cols = [name for name in train_feat.columns if name not in non_feature_cols]

# 'Sex' is treated as categorical, but only when it is actually present.
cat_features = [name for name in ('Sex',) if name in feature_cols]

print(f"Features: {len(feature_cols)}")
print(f"Categorical: {cat_features}")

Features: 18
Categorical: ['Sex']


In [3]:
# Split the engineered frames into model inputs and target.
# Train and test use the same `feature_cols`, so their column order matches.
y = train_feat['Calories']
X, X_test = train_feat[feature_cols], test_feat[feature_cols]

print(f"X shape: {X.shape}, y shape: {y.shape}, X_test shape: {X_test.shape}")

X shape: (8000, 18), y shape: (8000,), X_test shape: (2000, 18)


In [4]:
def train_catboost(params, X_train, y_train, X_val, y_val, cat_features,
                   early_stopping_rounds=50):
    """Fit a CatBoostRegressor on one train/validation split and score it.

    Args:
        params: Keyword arguments forwarded unchanged to ``CatBoostRegressor``.
        X_train: Training features.
        y_train: Training target.
        X_val: Validation features.
        y_val: Validation target. Used both as the ``eval_set`` that drives
            early stopping and as the data the returned score is computed on.
        cat_features: Categorical feature names passed to every ``Pool``.
        early_stopping_rounds: Forwarded to ``fit``. Defaults to 50, the value
            this helper previously hard-coded, so existing calls behave the
            same; pass another value to sweep the early-stopping patience.

    Returns:
        Tuple ``(score, best_iteration)``: ``score`` is the square root of
        ``mean_squared_log_error`` between ``y_val`` and the validation
        predictions clipped at 0, and ``best_iteration`` is
        ``model.best_iteration_``.
    """
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    model = CatBoostRegressor(**params)
    # NOTE(review): the same fold picks the stopping point and is then scored,
    # so the reported score may be slightly optimistic.
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=early_stopping_rounds,
        verbose=False,
    )

    pred_val = model.predict(val_pool)
    # The model is fit on the raw target, so predictions can dip below zero;
    # clip them before applying the log-based metric.
    score = np.sqrt(mean_squared_log_error(y_val, np.clip(pred_val, 0, None)))

    return score, model.best_iteration_

# Test configurations
# Learning-rate sweep at depth 6: (learning_rate, iterations) pairs, with a
# larger iteration budget for smaller learning rates. (0.05, 1000) is the baseline.
lr_schedule = [(0.01, 2500), (0.03, 1500), (0.05, 1000), (0.1, 500)]

# Depth exploration at the baseline learning rate / iteration budget.
# Depth 6 is omitted here because the learning-rate sweep already covers it.
depth_grid = [4, 5, 7, 8]

configs = [
    {'name': f'lr_{lr}', 'learning_rate': lr, 'iterations': n_iter, 'depth': 6}
    for lr, n_iter in lr_schedule
]
configs += [
    {'name': f'depth_{depth}', 'learning_rate': 0.05, 'iterations': 1000, 'depth': depth}
    for depth in depth_grid
]

# Settings shared by every configuration.
base_params = dict(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=SEED,
    verbose=False,
    allow_writing_files=False,
    task_type='CPU',
)

print("Testing configurations...")
print("="*60)

# One fixed, seeded splitter so every configuration is scored on the same folds.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)
results = []

for config in configs:
    print(f"\nTesting: {config['name']}")
    print(f"  lr={config['learning_rate']}, depth={config['depth']}, iter={config['iterations']}")

    # The merged parameters do not depend on the fold, so build them once.
    # NOTE(review): the merge also forwards the 'name' label to
    # CatBoostRegressor - presumably accepted as an experiment name; confirm.
    params = {**base_params, **config}
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_va, y_va = X.iloc[val_idx], y.iloc[val_idx]

        score, best_iter = train_catboost(params, X_tr, y_tr, X_va, y_va, cat_features)
        fold_scores.append(score)

        print(f"  Fold {fold}: {score:.6f} (best_iter: {best_iter})")

    mean_score, std_score = np.mean(fold_scores), np.std(fold_scores)

    # One record per configuration: its settings followed by its CV statistics.
    summary = {key: config[key] for key in ('name', 'learning_rate', 'depth', 'iterations')}
    summary.update(mean_score=mean_score, std_score=std_score, fold_scores=fold_scores)
    results.append(summary)

    print(f"  Mean: {mean_score:.6f} ± {std_score:.6f}")
    print(f"  Folds: {fold_scores}")

Testing configurations...

Testing: lr_0.01
  lr=0.01, depth=6, iter=2500


  Fold 1: 0.201661 (best_iter: 939)


  Fold 2: 0.195478 (best_iter: 829)


  Fold 3: 0.205664 (best_iter: 918)


  Fold 4: 0.212612 (best_iter: 535)


  Fold 5: 0.194060 (best_iter: 1220)
  Mean: 0.201895 ± 0.006807
  Folds: [0.2016614576991516, 0.1954784804152407, 0.20566431741620952, 0.2126119462128966, 0.1940604545035425]

Testing: lr_0.03
  lr=0.03, depth=6, iter=1500


  Fold 1: 0.201634 (best_iter: 308)


  Fold 2: 0.195838 (best_iter: 265)


  Fold 3: 0.206095 (best_iter: 315)


  Fold 4: 0.213105 (best_iter: 170)


  Fold 5: 0.194260 (best_iter: 404)
  Mean: 0.202187 ± 0.006898
  Folds: [0.20163386062060917, 0.19583831517972813, 0.20609546106687643, 0.21310546077700732, 0.19425953071256985]

Testing: lr_0.05
  lr=0.05, depth=6, iter=1000


  Fold 1: 0.201957 (best_iter: 190)


  Fold 2: 0.195251 (best_iter: 164)


  Fold 3: 0.205840 (best_iter: 178)


  Fold 4: 0.213369 (best_iter: 108)


  Fold 5: 0.194525 (best_iter: 216)
  Mean: 0.202188 ± 0.007003
  Folds: [0.2019565710485481, 0.19525077238224864, 0.20584031574655312, 0.21336855086167242, 0.19452510286332308]

Testing: lr_0.1
  lr=0.1, depth=6, iter=500


  Fold 1: 0.202320 (best_iter: 113)


  Fold 2: 0.195393 (best_iter: 137)


  Fold 3: 0.206547 (best_iter: 104)
  Fold 4: 0.213538 (best_iter: 52)


  Fold 5: 0.194633 (best_iter: 108)
  Mean: 0.202486 ± 0.007080
  Folds: [0.20232021545042395, 0.19539312727278865, 0.2065471090161467, 0.2135380166136583, 0.19463274437128367]

Testing: depth_4
  lr=0.05, depth=4, iter=1000


  Fold 1: 0.201429 (best_iter: 211)


  Fold 2: 0.195740 (best_iter: 214)


  Fold 3: 0.205280 (best_iter: 235)


  Fold 4: 0.212750 (best_iter: 142)


  Fold 5: 0.194010 (best_iter: 284)
  Mean: 0.201842 ± 0.006776
  Folds: [0.2014289812896486, 0.19573982027823, 0.20527994406543565, 0.2127498230043104, 0.19401047813658837]

Testing: depth_5
  lr=0.05, depth=5, iter=1000


  Fold 1: 0.202034 (best_iter: 193)


  Fold 2: 0.196227 (best_iter: 199)


  Fold 3: 0.205668 (best_iter: 215)


  Fold 4: 0.212765 (best_iter: 116)


  Fold 5: 0.194640 (best_iter: 296)
  Mean: 0.202267 ± 0.006580
  Folds: [0.20203387423286637, 0.19622736616449726, 0.20566814760632765, 0.21276474993529645, 0.19463985778337417]

Testing: depth_7
  lr=0.05, depth=7, iter=1000


  Fold 1: 0.202222 (best_iter: 179)


  Fold 2: 0.195554 (best_iter: 179)


  Fold 3: 0.206360 (best_iter: 188)


  Fold 4: 0.213315 (best_iter: 97)


  Fold 5: 0.195167 (best_iter: 223)
  Mean: 0.202524 ± 0.006840
  Folds: [0.2022215962840198, 0.19555399386800193, 0.20636033785713043, 0.21331522883630105, 0.19516732046311872]

Testing: depth_8
  lr=0.05, depth=8, iter=1000


  Fold 1: 0.202277 (best_iter: 173)


  Fold 2: 0.196128 (best_iter: 138)


  Fold 3: 0.206674 (best_iter: 207)


  Fold 4: 0.213229 (best_iter: 110)


  Fold 5: 0.194971 (best_iter: 175)
  Mean: 0.202656 ± 0.006778
  Folds: [0.20227686281892054, 0.1961281655870748, 0.20667359189799786, 0.2132289150588326, 0.1949712514952475]


In [5]:
# Display results summary
banner = "=" * 80

print("\n" + banner)
print("HYPERPARAMETER OPTIMIZATION RESULTS")
print(banner)

# Scores are errors, so ascending order puts the best configuration first.
results_df = pd.DataFrame(results).sort_values('mean_score')

for row in results_df.to_dict('records'):
    print(f"{row['name']:15s} | lr={row['learning_rate']:.2f} | depth={row['depth']} | "
          f"score={row['mean_score']:.6f} ± {row['std_score']:.6f}")

print("\n" + banner)
print("Best configuration:")
best = results_df.iloc[0]
print(f"{best['name']} - Score: {best['mean_score']:.6f} ± {best['std_score']:.6f}")
print(f"Parameters: lr={best['learning_rate']}, depth={best['depth']}, iter={best['iterations']}")

# Compare to baseline
# NOTE(review): hard-coded reference value, presumably the CV score of an
# earlier experiment - confirm its provenance.
baseline_score = 0.202383
score_delta = best['mean_score'] - baseline_score
print(f"\nBaseline score: {baseline_score:.6f}")
# Lower is better, so a negative value here means the best config beats the baseline.
print(f"Improvement: {score_delta:+.6f}")


HYPERPARAMETER OPTIMIZATION RESULTS
depth_4         | lr=0.05 | depth=4 | score=0.201842 ± 0.006776
lr_0.01         | lr=0.01 | depth=6 | score=0.201895 ± 0.006807
lr_0.03         | lr=0.03 | depth=6 | score=0.202187 ± 0.006898
lr_0.05         | lr=0.05 | depth=6 | score=0.202188 ± 0.007003
depth_5         | lr=0.05 | depth=5 | score=0.202267 ± 0.006580
lr_0.1          | lr=0.10 | depth=6 | score=0.202486 ± 0.007080
depth_7         | lr=0.05 | depth=7 | score=0.202524 ± 0.006840
depth_8         | lr=0.05 | depth=8 | score=0.202656 ± 0.006778

Best configuration:
depth_4 - Score: 0.201842 ± 0.006776
Parameters: lr=0.05, depth=4, iter=1000

Baseline score: 0.202383
Improvement: -0.000541


In [6]:
# Train final model with best parameters and generate predictions.
# `best` is a pandas row, so its numeric entries are cast to built-in
# int/float before they are handed to CatBoost. The shared settings come from
# `base_params`, so this run cannot drift from the ones used in the sweep.
best_config = {
    'learning_rate': float(best['learning_rate']),
    'depth': int(best['depth']),
    'iterations': int(best['iterations']),
    **base_params,
}

print(f"Training final model with best parameters: {best['name']}")
print(f"Parameters: {best_config}")

# Out-of-fold predictions are filled by position, so size them from X / X_test.
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
fold_scores = []

# The test pool does not depend on the fold, so build it once.
test_pool = Pool(X_test, cat_features=cat_features)

# Re-uses the seeded `kf` splitter, so these are the same folds as in the sweep.
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\nFold {fold}/{n_folds}")

    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
    val_pool = Pool(X_va, y_va, cat_features=cat_features)

    model = CatBoostRegressor(**best_config)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)

    # Validation predictions (stored unclipped; clipping is applied only for scoring)
    pred_va = model.predict(val_pool)
    oof_predictions[val_idx] = pred_va

    # Test predictions: each fold model contributes an equal share of the average
    pred_test = model.predict(test_pool)
    test_predictions += pred_test / n_folds

    # Score: clip at zero so the log-based metric is defined
    score = np.sqrt(mean_squared_log_error(y_va, np.clip(pred_va, 0, None)))
    fold_scores.append(score)

    print(f"  Fold {fold} RMSLE: {score:.6f} (best_iter: {model.best_iteration_})")

final_score = np.mean(fold_scores)
final_std = np.std(fold_scores)

print("\n" + "="*60)
print("FINAL MODEL RESULTS")
print("="*60)
print(f"CV RMSLE: {final_score:.6f} ± {final_std:.6f}")
print(f"Individual folds: {fold_scores}")
# Lower RMSLE is better, so a negative value means the final model beats the baseline.
print(f"Improvement over baseline: {final_score - baseline_score:+.6f}")

Training final model with best parameters: depth_4
Parameters: {'learning_rate': 0.05, 'depth': 4, 'iterations': 1000, 'loss_function': 'RMSE', 'eval_metric': 'RMSE', 'random_seed': 42, 'verbose': False, 'allow_writing_files': False, 'task_type': 'CPU'}

Fold 1/5


  Fold 1 RMSLE: 0.201429 (best_iter: 211)

Fold 2/5


  Fold 2 RMSLE: 0.195740 (best_iter: 214)

Fold 3/5


  Fold 3 RMSLE: 0.205280 (best_iter: 235)

Fold 4/5
  Fold 4 RMSLE: 0.212750 (best_iter: 142)

Fold 5/5


  Fold 5 RMSLE: 0.194010 (best_iter: 284)

FINAL MODEL RESULTS
CV RMSLE: 0.201842 ± 0.006776
Individual folds: [0.2014289812896486, 0.19573982027823, 0.20527994406543565, 0.2127498230043104, 0.19401047813658837]
Improvement over baseline: -0.000541


In [7]:
# Create submission
# Keep test predictions inside the target range observed in training.
target_min = train_df['Calories'].min()
target_max = train_df['Calories'].max()
clipped_test_predictions = np.clip(test_predictions, target_min, target_max)

submission = pd.DataFrame({'id': test_df['id'], 'Calories': clipped_test_predictions})

submission_path = '/home/submission/submission_004_catboost_hyperopt.csv'
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved: {submission_path}")

# Save OOF predictions (written as produced by the fold models, without clipping)
oof_df = pd.DataFrame({'id': train_df['id'], 'oof_prediction': oof_predictions})

oof_path = '/home/code/experiments/oof_004_catboost_hyperopt.csv'
oof_df.to_csv(oof_path, index=False)

print(f"OOF predictions saved: {oof_path}")


Submission saved: /home/submission/submission_004_catboost_hyperopt.csv
OOF predictions saved: /home/code/experiments/oof_004_catboost_hyperopt.csv
