In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [27]:
# Import necessary libraries for data manipulation and machine learning
import numpy as np                 # For numerical operations on arrays and matrices
import pandas as pd                # For data manipulation and analysis
import matplotlib.pyplot as plt    # For creating static visualizations
import seaborn as sns              # For statistical data visualization based on matplotlib

# Import modules for model evaluation and selection
from sklearn.model_selection import train_test_split, cross_val_score    # For splitting data and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score    # For model evaluation metrics

# Import machine learning algorithms
from sklearn.linear_model import LinearRegression    # For linear regression
from sklearn.ensemble import RandomForestRegressor  # For random forest regression
from sklearn.svm import SVR                          # For support vector regression
import xgboost as xgb                                # For XGBoost regression
import lightgbm as lgb                                # For LightGBM regression
from catboost import CatBoostRegressor               # For CatBoost regression

# Import modules for advanced model stacking techniques
from sklearn.ensemble import StackingRegressor        # For stacking multiple regressors
from mlxtend.regressor import StackingCVRegressor     # For stacked generalization with cross-validation

# Import additional libraries for hyperparameter tuning
import optuna    # For hyperparameter optimization

# Import metrics for additional model evaluation
from sklearn import metrics
# Import category_encoders for encoding categorical features
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [28]:
# Define the path to the CSV file containing the data
path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(path)
test_df = pd.read_csv(test_path)

# Set option to display all columns
pd.set_option('display.max_columns', None)

# Display the DataFrame to view the loaded data
df

# Feature engineering and preprocessing
dum_data = pd.get_dummies(df, drop_first=True)
x_boost = dum_data.drop(columns=['SalePrice'])
y_boost = dum_data['SalePrice']
X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(x_boost, y_boost, test_size=0.3, random_state=42)

df_cat = df.copy()
columns_to_fill = df_cat.select_dtypes('object').columns  # Assume these are the categorical columns
df_cat[columns_to_fill] = df_cat[columns_to_fill].fillna('Missing Value')
x_cat = df_cat.drop(columns=['SalePrice'])
y_cat = df_cat['SalePrice']
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(x_cat, y_cat, test_size=0.3, random_state=42)

# Ensure consistency in test data preprocessing
test_df[columns_to_fill] = test_df[columns_to_fill].fillna('Missing Value')

models = []

# Define default models for each algorithm
xgb_model_def = xgb.XGBRegressor()
lgb_model_def = lgb.LGBMRegressor()
catboost_model_def = CatBoostRegressor()
catboost_model_custom = CatBoostRegressor(cat_features=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'])

# Define models for stacking
stacking_models = [('XGBoost', xgb_model_def),
                   ('LightGBM', lgb_model_def),
                   ('CatBoost', catboost_model_def),
                   ('CatBoost_Custom', catboost_model_custom)
                   ]

# Extend models list with default models
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom)
])

def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels on the training data
    y_train_pred = model.predict(X_train)

    # Predict labels on the testing data
    y_test_pred = model.predict(X_test)

    # Calculate evaluation metrics for training data
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    # Calculate evaluation metrics for testing data
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Training Data:')
    print('Mean Absolute Error:', train_mae)
    print('Mean Squared Error:', train_mse)
    print('R-squared:', train_r2)
    print('Testing Data:')
    print('Mean Absolute Error:', test_mae)
    print('Mean Squared Error:', test_mse)
    print('R-squared:', test_r2)

    return train_r2, test_r2

# Initialize the DataFrame
r2_df = pd.DataFrame(columns=['Model', 'Train R2', 'Test R2'])

# Train and evaluate each model
for model_name, model in models:
    if model_name == 'CatBoost_Custom':
        train_r2, test_r2 = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    else:
        train_r2, test_r2 = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)
        
    r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'Train R2': [train_r2], 'Test R2': [test_r2]})], ignore_index=True)

r2_df_sorted = r2_df.sort_values(by='Test R2', ascending=False)

r2_df_sorted

Model Performance for XGBoost
Training Data:
Mean Absolute Error: 633.1473596501957
Mean Squared Error: 807131.1570605971
R-squared: 0.9998658933228696
Testing Data:
Mean Absolute Error: 17331.469347531394
Mean Squared Error: 752264863.7139277
R-squared: 0.8921962229964839
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3275
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 150
[LightGBM] [Info] Start training from score 181312.692759


  r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'Train R2': [train_r2], 'Test R2': [test_r2]})], ignore_index=True)


Model Performance for LightGBM
Training Data:
Mean Absolute Error: 4936.444562899559
Mean Squared Error: 140485015.03677517
R-squared: 0.9766580952941801
Testing Data:
Mean Absolute Error: 16483.632060524043
Mean Squared Error: 714422176.401174
R-squared: 0.897619292477806
Learning rate set to 0.041084
0:	learn: 75649.7977063	total: 6.63ms	remaining: 6.62s
1:	learn: 73812.6221008	total: 11.2ms	remaining: 5.61s
2:	learn: 72009.8388298	total: 15.7ms	remaining: 5.22s
3:	learn: 70311.8827646	total: 20.3ms	remaining: 5.06s
4:	learn: 68633.5709407	total: 24.9ms	remaining: 4.96s
5:	learn: 67062.7187977	total: 29.4ms	remaining: 4.87s
6:	learn: 65473.8815537	total: 34.3ms	remaining: 4.86s
7:	learn: 64122.8211959	total: 39ms	remaining: 4.83s
8:	learn: 62617.1023404	total: 43.7ms	remaining: 4.81s
9:	learn: 61127.2368452	total: 48.9ms	remaining: 4.84s
10:	learn: 59919.7174699	total: 53.6ms	remaining: 4.82s
11:	learn: 58691.6576124	total: 58.2ms	remaining: 4.79s
12:	learn: 57347.5591398	total: 62.9

Unnamed: 0,Model,Train R2,Test R2
3,CatBoost_Custom,0.982935,0.916983
2,CatBoost,0.995348,0.916585
1,LightGBM,0.976658,0.897619
0,XGBoost,0.999866,0.892196


In [29]:
def optimize_xgboost(trial, X_train, y_train, X_test, y_test):
    param = {
        'tree_method': 'hist',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-3, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, log=True)
    }
    
    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

def optimize_lightgbm(trial, X_train, y_train, X_test, y_test):
    param = {
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)
    }
    
    model = lgb.LGBMRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

def optimize_catboost(trial, X_train, y_train, X_test, y_test):
    param = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-3, 10.0),
    }
    
    model = CatBoostRegressor(**param, verbose=0)
    model.fit(X_train, y_train, cat_features=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'])
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

In [31]:
# Optimize XGBoost
study_xgboost = optuna.create_study(direction='maximize')
study_xgboost.optimize(lambda trial: optimize_xgboost(trial, X_train_boost, y_train_boost, X_test_boost, y_test_boost), n_trials=10)
print('XGBoost best params:', study_xgboost.best_params)

# Optimize LightGBM
study_lightgbm = optuna.create_study(direction='maximize')
study_lightgbm.optimize(lambda trial: optimize_lightgbm(trial, X_train_boost, y_train_boost, X_test_boost, y_test_boost), n_trials=10)
print('LightGBM best params:', study_lightgbm.best_params)

# Optimize CatBoost
study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(lambda trial: optimize_catboost(trial, X_train_cat, y_train_cat, X_test_cat, y_test_cat), n_trials=50)
print('CatBoost best params:', study_catboost.best_params)

[I 2024-05-28 09:42:18,354] A new study created in memory with name: no-name-7a93f7c7-5331-4bc6-b5b9-479836e5fdd7
[I 2024-05-28 09:42:19,935] Trial 0 finished with value: 0.8679872691917749 and parameters: {'lambda': 0.27479403897049715, 'alpha': 0.003797727963227016, 'learning_rate': 0.009004562293143465, 'n_estimators': 229, 'max_depth': 7, 'min_child_weight': 7, 'gamma': 0.005987339300111216, 'colsample_bytree': 0.6174539183445487}. Best is trial 0 with value: 0.8679872691917749.
[I 2024-05-28 09:42:21,367] Trial 1 finished with value: 0.4708104708968427 and parameters: {'lambda': 0.014450017112625316, 'alpha': 0.0016215570261064309, 'learning_rate': 0.001120523311625978, 'n_estimators': 465, 'max_depth': 3, 'min_child_weight': 1, 'gamma': 0.004166972888680815, 'colsample_bytree': 0.8735664248608377}. Best is trial 0 with value: 0.8679872691917749.
[I 2024-05-28 09:42:23,989] Trial 2 finished with value: 0.8897925112996629 and parameters: {'lambda': 5.229298881722655, 'alpha': 0.640

XGBoost best params: {'lambda': 0.0013410677937864986, 'alpha': 7.061993133121298, 'learning_rate': 0.20935562318485407, 'n_estimators': 816, 'max_depth': 7, 'min_child_weight': 6, 'gamma': 0.004422598981029681, 'colsample_bytree': 0.8385219330036604}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3331
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 173
[LightGBM] [Info] Start training from score 181312.692759


[I 2024-05-28 09:42:52,193] Trial 0 finished with value: 0.9052913954717635 and parameters: {'lambda_l1': 1.8036468233645107, 'lambda_l2': 0.2123758534053414, 'num_leaves': 167, 'learning_rate': 0.14688377037028535, 'n_estimators': 503, 'max_depth': 6, 'min_child_samples': 10, 'feature_fraction': 0.7113923211913404}. Best is trial 0 with value: 0.9052913954717635.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3376
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 195
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)




[I 2024-05-28 09:42:52,611] Trial 1 finished with value: 0.9160612196457404 and parameters: {'lambda_l1': 0.0011273962640257706, 'lambda_l2': 0.07867546754143452, 'num_leaves': 93, 'learning_rate': 0.021301136928036906, 'n_estimators': 634, 'max_depth': 3, 'min_child_samples': 5, 'feature_fraction': 0.5344739684945699}. Best is trial 1 with value: 0.9160612196457404.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3359
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 187
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)




[I 2024-05-28 09:42:54,052] Trial 2 finished with value: 0.9128048864204834 and parameters: {'lambda_l1': 0.012277368524739388, 'lambda_l2': 0.007306207400663307, 'num_leaves': 33, 'learning_rate': 0.007298927919691468, 'n_estimators': 921, 'max_depth': 6, 'min_child_samples': 7, 'feature_fraction': 0.6216974600624954}. Best is trial 1 with value: 0.9160612196457404.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3376
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 195
[LightGBM] [Info] Start training from score 181312.692759


[I 2024-05-28 09:42:54,378] Trial 3 finished with value: 0.541936977039645 and parameters: {'lambda_l1': 0.03816545607537195, 'lambda_l2': 0.043937733564917154, 'num_leaves': 152, 'learning_rate': 0.002051729761553458, 'n_estimators': 292, 'max_depth': 4, 'min_child_samples': 5, 'feature_fraction': 0.5214895991247486}. Best is trial 1 with value: 0.9160612196457404.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3466
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 240
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)




[I 2024-05-28 09:42:55,295] Trial 4 finished with value: 0.8996174585756551 and parameters: {'lambda_l1': 5.960103919543461, 'lambda_l2': 2.133905207362572, 'num_leaves': 169, 'learning_rate': 0.10643625773123896, 'n_estimators': 412, 'max_depth': 6, 'min_child_samples': 1, 'feature_fraction': 0.7373232749369092}. Best is trial 1 with value: 0.9160612196457404.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3408
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 211
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)




[I 2024-05-28 09:42:56,986] Trial 5 finished with value: 0.9074422975570096 and parameters: {'lambda_l1': 0.003678392385819062, 'lambda_l2': 0.008851193547102988, 'num_leaves': 144, 'learning_rate': 0.0373063318010226, 'n_estimators': 618, 'max_depth': 10, 'min_child_samples': 3, 'feature_fraction': 0.5496558628457291}. Best is trial 1 with value: 0.9160612196457404.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3422
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 218
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)
[I 2024-05-28 09:42:57,153] Trial 6 finished with value: 0.5930584672541183 and parameters: {'lambda_l1': 0.25296517020313436, 'lambda_l2': 0.009551958993515795, 'num_leaves': 92, 'learning_rate': 0.004561535278946822, 'n_estimators': 168, 'max_depth': 3, 'min_child_samples': 2, 'feature_fraction': 0.6818126754847644}. Best is trial 1 with value: 0.9160612196457404.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3422
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 218
[LightGBM] [Info] Start training from score 181312.692759


[I 2024-05-28 09:42:57,870] Trial 7 finished with value: 0.9027616198335673 and parameters: {'lambda_l1': 0.001082610839118641, 'lambda_l2': 0.41165255459429023, 'num_leaves': 29, 'learning_rate': 0.10579213772202777, 'n_estimators': 659, 'max_depth': 4, 'min_child_samples': 2, 'feature_fraction': 0.5036226977697809}. Best is trial 1 with value: 0.9160612196457404.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3408
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 211
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)




[I 2024-05-28 09:42:59,631] Trial 8 finished with value: 0.8913332315133281 and parameters: {'lambda_l1': 0.0030082329024136495, 'lambda_l2': 0.006939226208028725, 'num_leaves': 165, 'learning_rate': 0.045953804699580846, 'n_estimators': 779, 'max_depth': 7, 'min_child_samples': 3, 'feature_fraction': 0.8878441116336877}. Best is trial 1 with value: 0.9160612196457404.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3359
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 187
[LightGBM] [Info] Start training from score 181312.692759


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)
[I 2024-05-28 09:42:59,973] Trial 9 finished with value: 0.9127936481614624 and parameters: {'lambda_l1': 0.05016271936141258, 'lambda_l2': 1.78473624991608, 'num_leaves': 101, 'learning_rate': 0.10702460078197246, 'n_estimators': 188, 'max_depth': 10, 'min_child_samples': 7, 'feature_fraction': 0.5560116931830204}. Best is trial 1 with value: 0.9160612196457404.
[I 2024-05-28 09:42:59,975] A new study created in memory with name: no-name-354ff82a-ad97-43de-8d6a-47815e188b33


LightGBM best params: {'lambda_l1': 0.0011273962640257706, 'lambda_l2': 0.07867546754143452, 'num_leaves': 93, 'learning_rate': 0.021301136928036906, 'n_estimators': 634, 'max_depth': 3, 'min_child_samples': 5, 'feature_fraction': 0.5344739684945699}


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-3, 10.0),
[I 2024-05-28 09:43:08,757] Trial 0 finished with value: 0.917529676652841 and parameters: {'depth': 4, 'learning_rate': 0.07674710244122208, 'iterations': 983, 'l2_leaf_reg': 0.001984517142491912, 'border_count': 203, 'bagging_temperature': 2.2959613490464665}. Best is trial 0 with value: 0.917529676652841.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-3, 10.0),
[I 2024-05-28 09:43:15,695] Trial 1 finished with value: 0.9133285213467698 and parameters: {'depth': 5, 'learning_rate': 0.057398132491603485, 'iterations': 566, 'l2_leaf_reg': 0.0014547120102001113, 'border_count': 41, 'bagging_te

CatBoost best params: {'depth': 4, 'learning_rate': 0.03364617759532714, 'iterations': 586, 'l2_leaf_reg': 0.012744817999708008, 'border_count': 253, 'bagging_temperature': 0.009343000931891752}


In [32]:
# Train the best CatBoost model
best_catboost_model = CatBoostRegressor(**study_catboost.best_params)
best_catboost_model.fit(X_train_cat, y_train_cat, cat_features=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'])

# Evaluate the best model
train_r2_best, test_r2_best = train_and_evaluate_model('CatBoost_Custom_Tuned', best_catboost_model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)

# Preprocess the test data similar to the training data
test_df[columns_to_fill] = test_df[columns_to_fill].fillna('Missing Value')

# Ensure that test_df has the same structure as X_train_cat
# Drop the target column if it exists in the test data (usually it doesn't for Kaggle test sets)
if 'SalePrice' in test_df.columns:
    test_df = test_df.drop(columns=['SalePrice'])

# Make predictions on the test data
predictions = best_catboost_model.predict(test_df)

# If you need to save the predictions for submission or further analysis
# Create a DataFrame with the predictions
submission_df = pd.DataFrame({
    'Id': test_df['Id'],  # Ensure 'Id' column is present in the test data
    'SalePrice': predictions
})


0:	learn: 75978.7803891	total: 12ms	remaining: 7.02s
1:	learn: 74423.5913602	total: 24.1ms	remaining: 7.04s
2:	learn: 72863.1352908	total: 33.3ms	remaining: 6.47s
3:	learn: 71411.4555872	total: 42.3ms	remaining: 6.15s
4:	learn: 69892.4329034	total: 51.3ms	remaining: 5.96s
5:	learn: 68464.7187935	total: 60.5ms	remaining: 5.85s
6:	learn: 67289.4388846	total: 70ms	remaining: 5.79s
7:	learn: 66033.2512980	total: 79.2ms	remaining: 5.72s
8:	learn: 64921.5678149	total: 89.3ms	remaining: 5.72s
9:	learn: 63604.0917706	total: 100ms	remaining: 5.76s
10:	learn: 62502.0682426	total: 110ms	remaining: 5.77s
11:	learn: 61379.1420743	total: 119ms	remaining: 5.68s
12:	learn: 60348.2577384	total: 128ms	remaining: 5.66s
13:	learn: 59182.9087242	total: 137ms	remaining: 5.61s
14:	learn: 58195.4338450	total: 146ms	remaining: 5.56s
15:	learn: 57198.4766142	total: 154ms	remaining: 5.5s
16:	learn: 56231.2649652	total: 163ms	remaining: 5.45s
17:	learn: 55244.1335820	total: 172ms	remaining: 5.42s
18:	learn: 54285

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="RL": Cannot convert 'b'RL'' to float

In [34]:
# Save the predictions to a CSV file
submission_df.to_csv('catboost_predictions2.csv', index=False)

print('Predictions saved to catboost_predictions.csv')

Predictions saved to catboost_predictions.csv
