In [3]:
# Install CatBoost if not already installed
!pip install catboost

# Importing necessary libraries
from catboost import CatBoostRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Loading the California Housing dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='MedHouseVal')

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the first few rows of the dataset
X_train.head()




Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14196,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03
8267,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16
17445,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48
14265,1.9425,36.0,4.002817,1.033803,1418.0,3.994366,32.69,-117.11
2271,3.5542,43.0,6.268421,1.134211,874.0,2.3,36.78,-119.8


In [4]:
# Training a basic CatBoost model
model = CatBoostRegressor(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=3,
    verbose=100
)

# Fitting the model
model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0916893	test: 1.0827828	best: 1.0827828 (0)	total: 55ms	remaining: 55s
100:	learn: 0.4856977	test: 0.5141492	best: 0.5141492 (100)	total: 1.08s	remaining: 9.59s
200:	learn: 0.4317457	test: 0.4778063	best: 0.4778063 (200)	total: 2.11s	remaining: 8.37s
300:	learn: 0.4010594	test: 0.4626636	best: 0.4626636 (300)	total: 3.51s	remaining: 8.16s
400:	learn: 0.3779586	test: 0.4540902	best: 0.4540902 (400)	total: 5.71s	remaining: 8.53s
500:	learn: 0.3608106	test: 0.4491909	best: 0.4491909 (500)	total: 7.51s	remaining: 7.48s
600:	learn: 0.3459401	test: 0.4454644	best: 0.4454644 (600)	total: 8.59s	remaining: 5.7s
700:	learn: 0.3332879	test: 0.4430853	best: 0.4430408 (699)	total: 9.64s	remaining: 4.11s
800:	learn: 0.3220272	test: 0.4405790	best: 0.4405790 (800)	total: 10.9s	remaining: 2.7s
900:	learn: 0.3119731	test: 0.4386333	best: 0.4386333 (900)	total: 12s	remaining: 1.32s
999:	learn: 0.3024983	test: 0.4374350	best: 0.4373177 (980)	total: 13.5s	remaining: 0us

bestTest = 0.437317652

<catboost.core.CatBoostRegressor at 0x7b428e5db940>

In [5]:
# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model's performance using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')


RMSE: 0.4373176513932556


# Hyperparameter Tuning Using Randomized Search:

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Defining the parameter grid
param_dist = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [200, 500, 1000],
    'l2_leaf_reg': [1, 3, 5, 7]
}

# Setting up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=CatBoostRegressor(verbose=0),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42
)

# Fitting RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters from Randomized Search
best_params = random_search.best_params_
print(f'Best Parameters: {best_params}')

# Training the model with the best parameters
best_model = CatBoostRegressor(**best_params, verbose=100)
best_model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

# Making predictions with the best model
best_y_pred = best_model.predict(X_test)

# Evaluating the best model's performance
best_rmse = mean_squared_error(y_test, best_y_pred, squared=False)
print(f'Best Model RMSE: {best_rmse}')


Best Parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 1000, 'depth': 6}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0916893	test: 1.0827828	best: 1.0827828 (0)	total: 8.21ms	remaining: 8.2s
100:	learn: 0.4856977	test: 0.5141492	best: 0.5141492 (100)	total: 466ms	remaining: 4.15s
200:	learn: 0.4317457	test: 0.4778063	best: 0.4778063 (200)	total: 906ms	remaining: 3.6s
300:	learn: 0.4010594	test: 0.4626636	best: 0.4626636 (300)	total: 1.35s	remaining: 3.14s
400:	learn: 0.3779586	test: 0.4540902	best: 0.4540902 (400)	total: 1.8s	remaining: 2.69s
500:	learn: 0.3608106	test: 0.4491909	best: 0.4491909 (500)	total: 2.38s	remaining: 2.37s
600:	learn: 0.3459401	test: 0.4454644	best: 0.4454644 (600)	total: 2.86s	remaining: 1.9s
700:	learn: 0.3332879	test: 0.4430853	best: 0.4430408 (699)	total: 3.33s	remaining: 1.42s
800:	learn: 0.3220272	test: 0.4405790	best: 0.4405790 (800)	total: 3.78s	remaining: 938ms
900:	learn: 0.3119731	test: 0.4386333	best: 0.4386333 (900)	total: 4.27s	remaining: 469ms
999:	learn: 0.3024983	test: 0.4374350	best: 0.4373177 (980)	total: 4.71s	remaining: 0us

bestTest = 0.43731

# Exploring advanced parameters and controlling overfitting:

In [8]:
 # Using advanced parameters and controlling overfitting
advanced_model = CatBoostRegressor(
    boosting_type='Plain',  # Alternative boosting types: 'Ordered'
    task_type='CPU',        # Alternative task types: 'GPU' for using GPUs
    # depth=10, use when you dont use max_depth
    learning_rate=0.01,
    iterations=1000,
    l2_leaf_reg=5,
    max_depth=10,
    random_strength=2,
    early_stopping_rounds=50,
    verbose=100
)

# Fitting the advanced model
advanced_model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

# Making predictions and evaluating performance
adv_y_pred = advanced_model.predict(X_test)
adv_rmse = mean_squared_error(y_test, adv_y_pred, squared=False)
print(f'Advanced Model RMSE: {adv_rmse}')


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.1493003	test: 1.1382137	best: 1.1382137 (0)	total: 80.2ms	remaining: 1m 20s
100:	learn: 0.7422724	test: 0.7508992	best: 0.7508992 (100)	total: 4.34s	remaining: 38.6s
200:	learn: 0.6030931	test: 0.6234925	best: 0.6234925 (200)	total: 7.07s	remaining: 28.1s
300:	learn: 0.5468106	test: 0.5737754	best: 0.5737754 (300)	total: 9.76s	remaining: 22.7s
400:	learn: 0.5172075	test: 0.5492520	best: 0.5492520 (400)	total: 13.4s	remaining: 20s
500:	learn: 0.4970728	test: 0.5328287	best: 0.5328287 (500)	total: 16.8s	remaining: 16.7s
600:	learn: 0.4825299	test: 0.5221890	best: 0.5221890 (600)	total: 19.5s	remaining: 13s
700:	learn: 0.4704678	test: 0.5140187	best: 0.5140187 (700)	total: 22.3s	remaining: 9.5s
800:	learn: 0.4592164	test: 0.5068936	best: 0.5068936 (800)	total: 25.1s	remaining: 6.24s
900:	learn: 0.4494278	test: 0.5010371	best: 0.5010371 (900)	total: 29.6s	remaining: 3.25s
999:	learn: 0.4397295	test: 0.4951943	best: 0.4951943 (999)	total: 32.5s	remaining: 0us

bestTest = 0.49519

# model with aal hyperparamter

In [9]:
from catboost import CatBoostClassifier

# Initialize the CatBoostClassifier with all 24 hyperparameters
model = CatBoostClassifier(
    iterations=1000,                 # Number of trees
    learning_rate=0.03,              # Step size for each iteration
    depth=6,                         # Depth of each tree
    l2_leaf_reg=3,                   # L2 regularization to avoid overfitting
    border_count=254,                # Number of splits for numerical features
    bagging_temperature=1,           # Randomness in feature bagging
    random_strength=1,               # Randomness in tree splits
    subsample=0.8,                   # Fraction of samples used for training each tree
    max_leaves=31,                   # Maximum number of leaves in each tree
    min_data_in_leaf=1,              # Minimum samples per leaf
    cat_features=[0, 1, 2],          # Indices of categorical features
    one_hot_max_size=2,              # Max size of categories for one-hot encoding
    eval_metric='Accuracy',          # Metric used for evaluation
    early_stopping_rounds=50,        # Early stopping if no improvement
    task_type='GPU',                 # Use GPU for training (set to 'CPU' if no GPU is available)
    grow_policy='SymmetricTree',     # Strategy for growing trees
    leaf_estimation_method='Newton', # Method for estimating leaf values
    boosting_type='Plain',           # Type of boosting scheme
    use_best_model=True,             # Save the best model during training
    max_bin=254,                     # Max number of bins for numeric feature discretization
    ctr_description=['Borders', 'FeatureFreq'],  # Transformation for categorical features
    verbose=100,                     # Verbose logging every 100 iterations
    custom_loss=['AUC', 'Precision'],# Custom loss functions to optimize
    random_seed=42                   # Random seed for reproducibility
)

# Load your dataset
# X_train, y_train = load_your_data()

# Fit the model
# model.fit(X_train, y_train, cat_features=[0, 1, 2], eval_set=(X_val, y_val))

# Predict
# predictions = model.predict(X_test)




1. Iterations (iterations)
Purpose: Number of boosting rounds (trees).
Effect: More iterations generally improve performance but can lead to overfitting if too high.
2. Learning Rate (learning_rate)
Purpose: Step size at each iteration.
Effect: Lower values make learning more stable but require more iterations.
3. Depth (depth)
Purpose: Maximum depth of the trees.
Effect: Deeper trees capture more details but can overfit.
4. L2 Leaf Regularization (l2_leaf_reg)
Purpose: Regularization to prevent overfitting.
Effect: Higher values reduce overfitting by penalizing large leaf values.
5. Border Count (border_count)
Purpose: Number of splits for numeric features.
Effect: Higher values improve numeric feature handling but increase computation time.
6. Bagging Temperature (bagging_temperature)
Purpose: Controls randomness in feature bagging.
Effect: Higher values add more randomness, reducing overfitting.
7. Random Strength (random_strength)
Purpose: Adds randomness to splits.
Effect: Helps generalization, but too high values may lead to poor splits.
8. Subsample (subsample)
Purpose: Fraction of data used for training each tree.
Effect: Lower values reduce overfitting but might lead to underfitting.
9. Max Leaves (max_leaves)
Purpose: Maximum number of leaves in a tree.
Effect: Controls tree complexity; more leaves can capture more patterns.
10. Min Data in Leaf (min_data_in_leaf)
Purpose: Minimum samples required in a leaf.
Effect: Prevents creating leaves with too few samples, reducing overfitting.
11. Cat Features (cat_features)
Purpose: List of categorical feature indices.
Effect: Proper handling of categorical data improves model performance.
12. One Hot Max Size (one_hot_max_size)
Purpose: Maximum size of categories for one-hot encoding.
Effect: Helps manage memory usage and computation for large categorical features.
13. Eval Metric (eval_metric)
Purpose: Metric for evaluating model performance.
Effect: Guides model optimization; choose according to the business objective.
14. Early Stopping Rounds (early_stopping_rounds)
Purpose: Stops training if no improvement in the evaluation metric.
Effect: Prevents overfitting and saves computation time.
15. Task Type (task_type)
Purpose: Specifies whether to use CPU or GPU.
Effect: GPU can significantly speed up training on large datasets.
16. Grow Policy (grow_policy)
Purpose: Defines the strategy for growing trees.
Effect: Affects how trees are built; Depthwise builds balanced trees, Lossguide focuses on specific areas.
17. Leaf Estimation Method (leaf_estimation_method)
Purpose: Method to estimate leaf values (e.g., Newton, Gradient).
Effect: Newton is more accurate but computationally intensive, while Gradient is faster.
18. Boosting Type (boosting_type)
Purpose: Type of boosting scheme (Ordered vs. Plain).
Effect: Ordered is slower but better for handling categorical features, while Plain is faster.
19. Use Best Model (use_best_model)
Purpose: Saves the model with the best performance during training.
Effect: Helps prevent overfitting by selecting the best model.
20. Max Bin (max_bin)
Purpose: Maximum number of bins used for numeric feature discretization.
Effect: More bins increase precision but also computation time.
21. Ctr Description (ctr_description)
Purpose: Defines how categorical features are transformed into numerical representations.
Effect: Different transformations can significantly affect model performance.
22. Verbose (verbose)
Purpose: Controls the level of logging during training.
Effect: Helps monitor training progress, with no direct impact on model performance.
23. Custom Loss Function (custom_loss)
Purpose: Allows using custom loss functions.
Effect: Useful for optimizing specific metrics beyond the default options.
24. Random Seed (random_seed)
Purpose: Seed for random number generation.
Effect: Ensures reproducibility of results.