## Final Ensemble Model

Use optuna to ensemble XGBoost, CatBoost and LightGBM.

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/buiding-density-medium/train_df_with_building_density_medium.csv
/kaggle/input/buiding-density-medium/valid_df_with_building_density_medium.csv
/kaggle/input/final-building-density/validation_data_with_density.csv
/kaggle/input/final-building-density/uncorrupted_training_data_with_density.csv
/kaggle/input/final-satellite/validation_data_with_satellite.csv
/kaggle/input/final-satellite/training_data_with_satellite.csv
/kaggle/input/final-road-density/train_df_with_road_density.csv
/kaggle/input/final-road-density/valid_df_with_road_density.csv
/kaggle/input/building-density-tall/train_df_with_building_density_tall.csv
/kaggle/input/building-density-tall/valid_df_with_building_density_tall.csv


In [11]:
pip install optuna-integration[lightgbm]

Collecting optuna-integration[lightgbm]
  Downloading optuna_integration-4.2.1-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.2.1-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.2.1
Note: you may need to restart the kernel to use updated packages.


In [12]:
# Feature Engineering
from sklearn.preprocessing import StandardScaler

# Machine Learning
from sklearn.metrics import r2_score, make_scorer,mean_squared_error,mean_squared_log_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
import optuna
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
from optuna.integration import LightGBMPruningCallback
from functools import partial

# Combine Datasets

In [13]:
train_density = pd.read_csv("/kaggle/input/final-building-density/uncorrupted_training_data_with_density.csv")
validation_density = pd.read_csv("/kaggle/input/final-building-density/validation_data_with_density.csv")

In [14]:
train_satellite = pd.read_csv("/kaggle/input/final-satellite/training_data_with_satellite.csv")
validation_satellite = pd.read_csv("/kaggle/input/final-satellite/validation_data_with_satellite.csv")

In [15]:
train_road = pd.read_csv("/kaggle/input/final-road-density/train_df_with_road_density.csv")
validation_road = pd.read_csv("/kaggle/input/final-road-density/valid_df_with_road_density.csv")

In [16]:
train_density_medium = pd.read_csv("/kaggle/input/buiding-density-medium/train_df_with_building_density_medium.csv")
validation_density_medium = pd.read_csv("/kaggle/input/buiding-density-medium/valid_df_with_building_density_medium.csv")

In [17]:
train_density_tall = pd.read_csv("/kaggle/input/building-density-tall/train_df_with_building_density_tall.csv")
validation_density_tall = pd.read_csv("/kaggle/input/building-density-tall/valid_df_with_building_density_tall.csv")

In [18]:
train_satellite = train_satellite.drop(["Longitude","Latitude","datetime","UHI Index"], axis=1)
train_road = train_road.drop(["Longitude","Latitude","datetime","UHI Index"], axis=1)
train_density_medium = train_density_medium.drop(["Longitude","Latitude","datetime","UHI Index"], axis=1)
train_density_tall = train_density_tall.drop(["Longitude","Latitude","datetime","UHI Index"], axis=1)
train_concat = pd.concat([train_density, train_satellite, train_road, train_density_medium, train_density_tall], axis = 1)
train_concat.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,density,B01,B02,B03,B04,B05,...,B8A,B11,B12,NDVI,NDBI,NDWI,LST,road_density,building_density_medium,building_density_tall
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,8,1036.934045,1144.664522,1325.140099,1373.032466,1644.462855,...,2552.886809,2305.273326,1920.79687,0.284717,-0.033706,-0.300946,37.14364,798.078274,14,6
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,8,1036.971023,1144.3587,1325.462754,1372.924045,1645.394702,...,2556.644227,2309.511781,1923.721499,0.285353,-0.033441,-0.301429,37.147602,796.366736,12,6
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,8,1038.80155,1145.059082,1327.088037,1374.093302,1649.179292,...,2563.074729,2316.142001,1928.230915,0.28593,-0.033062,-0.30183,37.169067,793.343813,13,6
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,8,1040.573078,1146.855305,1329.966823,1376.344928,1653.159602,...,2571.105817,2323.46653,1933.006431,0.286648,-0.033084,-0.3023,37.180983,790.201109,14,5
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,8,1041.853259,1149.656095,1333.705642,1380.476761,1657.821397,...,2578.13651,2331.205495,1938.843467,0.286367,-0.032615,-0.302108,37.173729,787.450494,14,4


In [19]:
validation_satellite = validation_satellite.drop(["Longitude", "Latitude", "UHI Index"], axis=1)
validation_road = validation_road.drop(["Longitude", "Latitude", "UHI Index"], axis=1)
validation_density_medium = validation_density_medium.drop(["Longitude", "Latitude", "UHI Index"], axis=1)
validation_density_tall = validation_density_tall.drop(["Longitude", "Latitude", "UHI Index"], axis=1)
validation_concat = pd.concat([validation_density, validation_satellite, validation_road, validation_density_medium, validation_density_tall], axis=1)
validation_concat.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index,density,B01,B02,B03,B04,B05,B06,...,B8A,B11,B12,NDVI,NDBI,NDWI,LST,road_density,building_density_medium,building_density_tall
0,-73.971665,40.788763,,12,1109.576929,1229.599619,1355.112575,1425.64383,1626.311228,2012.087688,...,2224.51603,2146.13673,1826.886547,0.200376,0.001399,-0.224598,36.637337,1876.839924,26,94
1,-73.971928,40.788875,,12,1115.170029,1239.38114,1365.236988,1438.272807,1636.686696,2007.709649,...,2214.21652,2150.349123,1836.398977,0.194253,0.004341,-0.219195,36.703919,2095.162872,24,97
2,-73.96708,40.78908,,4,795.622951,881.664959,1005.529128,972.993999,1216.328747,1958.375732,...,2336.408811,1815.682231,1377.212968,0.395391,-0.105857,-0.381427,33.485227,1285.873617,29,48
3,-73.97255,40.789082,,12,1120.069213,1239.884109,1364.432543,1438.773632,1638.771291,2003.329968,...,2206.887035,2147.318115,1831.670618,0.19295,0.004815,-0.218353,36.817434,2848.513646,29,94
4,-73.969697,40.787953,,9,951.219441,1046.451471,1173.158542,1190.45396,1423.341238,2052.694042,...,2383.179476,2009.478846,1600.800322,0.315578,-0.064867,-0.322152,35.072928,1908.523907,157,73


# Select features 

In [20]:
features = ['B01','B8A','B11','B12','LST','NDVI','NDBI','NDWI','density','road_density','building_density_medium','building_density_tall']
train_df = train_concat[features + ["UHI Index"]]
train_df

Unnamed: 0,B01,B8A,B11,B12,LST,NDVI,NDBI,NDWI,density,road_density,building_density_medium,building_density_tall,UHI Index
0,1036.934045,2552.886809,2305.273326,1920.796870,37.143640,0.284717,-0.033706,-0.300946,8,798.078274,14,6,1.030289
1,1036.971023,2556.644227,2309.511781,1923.721499,37.147602,0.285353,-0.033441,-0.301429,8,796.366736,12,6,1.030289
2,1038.801550,2563.074729,2316.142001,1928.230915,37.169067,0.285930,-0.033062,-0.301830,8,793.343813,13,6,1.023798
3,1040.573078,2571.105817,2323.466530,1933.006431,37.180983,0.286648,-0.033084,-0.302300,8,790.201109,14,5,1.023798
4,1041.853259,2578.136510,2331.205495,1938.843467,37.173729,0.286367,-0.032615,-0.302108,8,787.450494,14,4,1.021634
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,505.685643,3164.168008,1954.692083,1197.219230,31.250745,0.643753,-0.211530,-0.590693,0,1579.040587,0,26,0.972470
11225,506.378600,3154.576962,1951.042245,1196.335477,31.230093,0.642562,-0.210976,-0.589601,0,1586.321518,0,26,0.972470
11226,506.808327,3137.486194,1943.507378,1193.436669,31.220690,0.641328,-0.210249,-0.588194,0,1595.356778,0,26,0.981124
11227,506.673153,3134.157718,1941.849744,1192.670373,31.205963,0.640902,-0.210145,-0.587743,0,1598.145380,0,26,0.981245


# Remove duplicates from training data

In [21]:
# Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
for col in features:
    # Check if the value is a numpy array and has more than one dimension
    train_df[col] = train_df[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# Now remove duplicates
uhi_data = train_df.drop_duplicates(subset=features, keep='first')
uhi_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[col] = train_df[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)


Unnamed: 0,B01,B8A,B11,B12,LST,NDVI,NDBI,NDWI,density,road_density,building_density_medium,building_density_tall,UHI Index
0,1036.934045,2552.886809,2305.273326,1920.79687,37.14364,0.284717,-0.033706,-0.300946,8,798.078274,14,6,1.030289
1,1036.971023,2556.644227,2309.511781,1923.721499,37.147602,0.285353,-0.033441,-0.301429,8,796.366736,12,6,1.030289
2,1038.80155,2563.074729,2316.142001,1928.230915,37.169067,0.28593,-0.033062,-0.30183,8,793.343813,13,6,1.023798
3,1040.573078,2571.105817,2323.46653,1933.006431,37.180983,0.286648,-0.033084,-0.3023,8,790.201109,14,5,1.023798
4,1041.853259,2578.13651,2331.205495,1938.843467,37.173729,0.286367,-0.032615,-0.302108,8,787.450494,14,4,1.021634


In [22]:
uhi_data.shape

(11229, 13)

In [23]:
# Resetting the index of the dataset
uhi_data=uhi_data.reset_index(drop=True)

In [24]:
uhi_data.isna().sum()

B01                        0
B8A                        0
B11                        0
B12                        0
LST                        0
NDVI                       0
NDBI                       0
NDWI                       0
density                    0
road_density               0
building_density_medium    0
building_density_tall      0
UHI Index                  0
dtype: int64

# Model Building

In [25]:
# Split the data into features (X) and target (y)
X = uhi_data.drop(columns=['UHI Index']).values
y = uhi_data ['UHI Index'].values

**Feature Scaling**

In [26]:
# Scale the training and test data using standardscaler
sc = StandardScaler()
X = sc.fit_transform(X)

## **Model Training**

## XGBoost

In [27]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1200, step=100),
        'max_depth': trial.suggest_int('max_depth', 2, 120, step=2),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 10),  # L1 regularization
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 10) # L2 regularization
    }
    
    model = XGBRegressor(objective='reg:squarederror', random_state=42, **params)
    score = cross_val_score(model, X, y, cv=5, scoring='r2', n_jobs=-1)
    return score.mean()

# Run the optimization
#study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_trials=10000, n_jobs=-1)

# Best hyperparameters
#best_params = study.best_params
#print("Best Hyperparameters found:", best_params)

# 📌 Version 0.9770
best_params = {'n_estimators': 1100, 'max_depth': 114, 'learning_rate': 0.015591888704234214, 'subsample': 0.8049957937585521, 'colsample_bytree': 0.8908642503612316, 'min_child_weight': 4, 'gamma': 1.923273380075315e-07, 'reg_alpha': 1.766675566559426e-05, 'reg_lambda': 1.148751628522958}
# Train the model with best parameters
best_xgb = XGBRegressor(**best_params)

## CatBoost

### Define validation function

In [28]:
def cross_validation(model, X, y, cv, verbose=True):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(model, X, y, scoring="r2", cv=kf)
    if verbose:
        model_name = model.__class__.__name__
        print(f"Cross validation score for {model_name}: {scores.mean():.3f} +- {scores.std():.3f}")
    return scores

In [29]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

def custom_fit_cross_val(model, X, y, cv):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42) 
    cv_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train_cv, X_val_cv = X[train_idx], X[val_idx]
        y_train_cv, y_val_cv = y[train_idx], y[val_idx]

        sc = StandardScaler()
        sc.fit_transform(X_train_cv)
        sc.transform(X_val_cv)
        
        # Fit with early stopping
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)], 
            early_stopping_rounds=50, 
            verbose=False
        )
            
        y_pred_cv = model.predict(X_val_cv)
        
        score = mean_squared_error(y_val_cv, y_pred_cv, squared=False)
        cv_scores.append(score)

    return np.mean(cv_scores)

In [30]:
def objective(trial):
    boosting_type = trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
    grow_policy = trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])

    all_score_functions = ['SolarL2', 'Cosine', 'NewtonCosine', 'L2', 'LOOL2', 'NewtonL2']
    score_function = trial.suggest_categorical('score_function', all_score_functions)

    if boosting_type == 'Ordered' and score_function in ['LOOL2', 'SolarL2', 'L2', 'NewtonL2']:
        return float('inf')

    if boosting_type == 'Ordered' and grow_policy in ['Lossguide', 'Depthwise']:
        return float('inf')

    params = {
        'boosting_type': boosting_type,
        'grow_policy': grow_policy,
        "score_function": score_function,
        "iterations": trial.suggest_int("iterations", 100, 1500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 1.0, 10.0),
        "leaf_estimation_iterations": trial.suggest_int("leaf_estimation_iterations", 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'feature_border_type': trial.suggest_categorical('feature_border_type', ['GreedyLogSum', 'MinEntropy', 'Median', 'UniformAndQuantiles']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'No', 'Poisson']),
        "eval_metric": "RMSE",
        "loss_function": "RMSE",
        "task_type": "GPU",
        "verbose": 0,
    }

    # Conditional parameters based on the selected settings
    if params['grow_policy'] == 'Lossguide':
        params['max_leaves'] = trial.suggest_int('max_leaves', 2, 64)

    if params['bootstrap_type'] in ['Bernoulli', 'Poisson']:
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1.0)

    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    
    
    model = CatBoostRegressor(**params)
    return custom_fit_cross_val(model, X, y, cv=5)

# Run Optuna optimization
#study_cb = optuna.create_study(direction="minimize")
#study_cb.optimize(objective, n_trials=100)

# 5. Print best hyperparameters
#best_params = study_cb.best_params
#best_params["task_type"] = "GPU"
#best_params["eval_metric"] = "RMSE"
#best_params["loss_function"] = "RMSE"
#best_params["verbose"] = 0
#print("Best CatBoost Hyperparameters:", best_params)
#print("Best rmse score is:", study_cb.best_value)

# Train the model with best parameters
# 📌 Version 0.9750
best_params = {'boosting_type': 'Plain', 'grow_policy': 'Depthwise', 'score_function': 'SolarL2', 'iterations': 1039, 'learning_rate': 0.05627818345201275, 'depth': 10, 'l2_leaf_reg': 8.330279687259916, 'border_count': 255, 'random_strength': 7.767297165881897, 'leaf_estimation_iterations': 4, 'min_data_in_leaf': 13, 'feature_border_type': 'UniformAndQuantiles', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.2397344481053751}
best_params["task_type"] = "GPU"
best_params["eval_metric"] = "RMSE"
best_params["loss_function"] = "RMSE"
best_params["verbose"] = 0
best_cb = CatBoostRegressor(**best_params)
# best_cb.fit(X,y)

## LightGBM

In [31]:
def lgbm_cross_val(model, trial, X, y, cv):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42) 
    cv_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train_cv, X_val_cv = X[train_idx], X[val_idx]
        y_train_cv, y_val_cv = y[train_idx], y[val_idx]

        sc = StandardScaler()
        sc.fit_transform(X_train_cv)
        sc.transform(X_val_cv)
        
        # Fit with early stopping
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)], 
            callbacks=[early_stopping(50), LightGBMPruningCallback(trial, "rmse"),]
        )
            
        y_pred_cv = model.predict(X_val_cv)
        
        score = mean_squared_error(y_val_cv, y_pred_cv, squared=False)
        cv_scores.append(score)

    return np.mean(cv_scores)

In [32]:
def objective(trial):
    boosting_type = trial.suggest_categorical("boosting_type", ["gbdt", "dart", "goss", "rf"])
    params = {
        "boosting_type": boosting_type,
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),  # Number of trees
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5, log=True),  # Step size shrinkage
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),  # Maximum number of leaves per tree
        "max_depth": trial.suggest_int("max_depth", 3, 20),  # Depth of trees
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),  # Minimum samples per leaf
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),  # Fraction of features per tree
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-4, 10.0, log=True),  # L1 regularization
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-4, 10.0, log=True),  # L2 regularization
        "metric": "rmse",
        # "device_type": "gpu",
        "random_state": 42,
        "verbose": -1,
    }

    # Mode-specific tuning
    if boosting_type == "gbdt" or boosting_type == "rf":
        params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.5, 1.0)
        params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 10)
    
    elif boosting_type == "dart":
        params["drop_rate"] = trial.suggest_float("drop_rate", 0.01, 0.5)
        params["max_drop"] = trial.suggest_int("max_drop", 5, 50)
        params["skip_drop"] = trial.suggest_float("skip_drop", 0.0, 1.0)
    
    elif boosting_type == "goss":
        params["top_rate"] = trial.suggest_float("top_rate", 0.1, 0.5)
        params["other_rate"] = trial.suggest_float("other_rate", 0.1, 0.5)
        # No bagging_fraction in GOSS
    
    model = LGBMRegressor(**params)
    return lgbm_cross_val(model, trial, X, y, cv=5)

# #  4. Run Optuna optimization
#study_lgb = optuna.create_study(direction="minimize") 
#study_lgb.optimize(objective, n_trials=5000)

#best_params = study_lgb.best_params
# #  5. Print best hyperparameters
#print("Best LightGBM Hyperparameters:", study_lgb.best_params)
#print("Best cross validation score is:", study_lgb.best_value)
# 📌 Version 0.9741
best_params = {'boosting_type': 'goss', 'n_estimators': 936, 'learning_rate': 0.21380353035497862, 'num_leaves': 186, 'max_depth': 18, 'min_data_in_leaf': 3, 'feature_fraction': 0.9876949484780569, 'lambda_l1': 0.010202698630371546, 'lambda_l2': 2.489485764075577, 'top_rate': 0.25542235514965383, 'other_rate': 0.36917949534546296}
best_lgbm = LGBMRegressor(**best_params)

## Optuna Ensemble

In [34]:
def genOOF(model, X, y, model_type):
    """
    Generate Out-of-Fold (OOF) predictions and evaluate models.

    Args:
        model: The model to train.
        X: Features dataset (numpy array or DataFrame).
        y: Target variable (continuous).
        model_type: Model type identifier (1 for XGBoost, 2 for CatBoost, 3 for HistGradientBoosting).

    Returns:
        oof_valid_preds: Out-of-fold predictions.
    """
    cv = KFold(n_splits=10, shuffle=True, random_state=42)  # Use KFold for continuous targets
    cv_scores = np.empty(10)
    oof_valid_preds = np.zeros(len(y))

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X)):
        X_train, y_train = X[train_idx], y[train_idx]  # Training set for the current fold
        X_valid, y_valid = X[valid_idx], y[valid_idx]  # Validation set for the current fold

        if model_type == 1:  # XGBoost
            model.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      early_stopping_rounds=20,
                      verbose=False)
        elif model_type == 2:  # CatBoost
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      verbose=False,
                      early_stopping_rounds=20)  
        elif model_type == 3: # LightGBM
            model.fit(X_train, y_train, 
                      eval_set=[(X_valid, y_valid)]) 
        valid_preds = model.predict(X_valid)
        oof_valid_preds[valid_idx] = valid_preds

        # Calculate the evaluation metric (RMSLE) for the validation set
        cv_scores[fold] = np.sqrt(mean_squared_log_error(y_valid, valid_preds))

    mean_rmse = np.mean(cv_scores)  # Average evaluation metric across all folds
    names = {1: "XGBoost", 2: "CatBoost", 3: "LightGBM"}
    print(f'The mean RMSE for {names[model_type]} is {mean_rmse:.4f}')
    return oof_valid_preds

In [35]:
oof_valid_preds_xgb = genOOF(best_xgb, X, y, 1)

The mean RMSE for XGBoost is 0.0014


In [36]:
oof_valid_preds_cb = genOOF(best_cb, X, y, 2)

The mean RMSE for CatBoost is 0.0014


In [37]:
oof_valid_preds_lgbm = genOOF(best_lgbm, X, y, 3)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2614
[LightGBM] [Info] Number of data points in the train set: 10106, number of used features: 12
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1.000008
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2620
[LightGBM] [Info] Number of data points in the train set: 10106, number of used features: 12
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_row_wis

In [38]:
!pip install cmaes

Collecting cmaes
  Downloading cmaes-0.11.1-py3-none-any.whl.metadata (18 kB)
Downloading cmaes-0.11.1-py3-none-any.whl (35 kB)
Installing collected packages: cmaes
Successfully installed cmaes-0.11.1


In [39]:
class OptunaWeights:
    def __init__(self, random_state, n_trials=10000):
        self.study = None  # Optuna study object
        self.weights = None  # Optimal weights for the predictions
        self.random_state = random_state  # Random state for reproducibility
        self.n_trials = n_trials  # Number of trials for hyperparameter optimization

    def _objective(self, trial, y_true, y_preds):
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 0, 1) for n in range(len(y_preds))]
        total = sum(weights)
        weights = [x / total for x in weights]  # Ensure the sum of weights is 1
        
        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=weights)
        
        # Calculate the R² score
        r2 = r2_score(y_true, weighted_pred)
        
        # Return negative R² because Optuna minimizes by default and we want to maximize R²
        return -r2  # Negative because we want to maximize R²

    def fit(self, y_true, y_preds):
        # Set Optuna logging verbosity to ERROR
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        
        # Create a CMA-ES sampler for hyperparameter optimization
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        
        # Create a Hyperband pruner for early stopping
        pruner = optuna.pruners.HyperbandPruner()
        
        # Create an Optuna study with the specified sampler, pruner, and objective direction
        # Direction is 'minimize' because we're minimizing -R² (which maximizes R²)
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, 
                                         study_name="OptunaWeights", direction='minimize')
        
        # Create a partial function for the objective with y_true and y_preds as fixed arguments
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        
        # Optimize the study with the specified number of trials and show progress bar
        self.study.optimize(objective_partial, n_trials=self.n_trials, show_progress_bar=True)
        
        # Extract the best weights from the study's best parameters
        weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]
        print("Weights before scaling:", weights)
        
        # Scale weights to ensure sum of weights is 1
        total = sum(weights)
        self.weights = [x / total for x in weights]
        print("Weights after scaling:", self.weights)
        
        # Calculate and print the best R² score achieved
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=self.weights)
        best_r2 = r2_score(y_true, weighted_pred)
        print(f"Best R² score: {best_r2:.6f}")


ow = OptunaWeights(42)
ow.fit(y, y_preds=[oof_valid_preds_xgb, oof_valid_preds_cb, oof_valid_preds_lgbm])
weights = ow.weights
weights

  0%|          | 0/10000 [00:00<?, ?it/s]

Weights before scaling: [0.46829988198372335, 0.6553766335677247, 0.26328961256161904]
Weights after scaling: [0.3376433443409564, 0.472525334457409, 0.1898313212016345]
Best R² score: 0.972380


[0.3376433443409564, 0.472525334457409, 0.1898313212016345]

# Submission

In [40]:
# Train with the whole datasets
best_xgb.fit(X, y)
best_cb.fit(X, y)
best_lgbm.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2621
[LightGBM] [Info] Number of data points in the train set: 11229, number of used features: 12
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1.000001


In [41]:
validation_concat.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index,density,B01,B02,B03,B04,B05,B06,...,B8A,B11,B12,NDVI,NDBI,NDWI,LST,road_density,building_density_medium,building_density_tall
0,-73.971665,40.788763,,12,1109.576929,1229.599619,1355.112575,1425.64383,1626.311228,2012.087688,...,2224.51603,2146.13673,1826.886547,0.200376,0.001399,-0.224598,36.637337,1876.839924,26,94
1,-73.971928,40.788875,,12,1115.170029,1239.38114,1365.236988,1438.272807,1636.686696,2007.709649,...,2214.21652,2150.349123,1836.398977,0.194253,0.004341,-0.219195,36.703919,2095.162872,24,97
2,-73.96708,40.78908,,4,795.622951,881.664959,1005.529128,972.993999,1216.328747,1958.375732,...,2336.408811,1815.682231,1377.212968,0.395391,-0.105857,-0.381427,33.485227,1285.873617,29,48
3,-73.97255,40.789082,,12,1120.069213,1239.884109,1364.432543,1438.773632,1638.771291,2003.329968,...,2206.887035,2147.318115,1831.670618,0.19295,0.004815,-0.218353,36.817434,2848.513646,29,94
4,-73.969697,40.787953,,9,951.219441,1046.451471,1173.158542,1190.45396,1423.341238,2052.694042,...,2383.179476,2009.478846,1600.800322,0.315578,-0.064867,-0.322152,35.072928,1908.523907,157,73


In [42]:
submission_val_data = validation_concat[features]
submission_val_data.head()

Unnamed: 0,B01,B8A,B11,B12,LST,NDVI,NDBI,NDWI,density,road_density,building_density_medium,building_density_tall
0,1109.576929,2224.51603,2146.13673,1826.886547,36.637337,0.200376,0.001399,-0.224598,12,1876.839924,26,94
1,1115.170029,2214.21652,2150.349123,1836.398977,36.703919,0.194253,0.004341,-0.219195,12,2095.162872,24,97
2,795.622951,2336.408811,1815.682231,1377.212968,33.485227,0.395391,-0.105857,-0.381427,4,1285.873617,29,48
3,1120.069213,2206.887035,2147.318115,1831.670618,36.817434,0.19295,0.004815,-0.218353,12,2848.513646,29,94
4,951.219441,2383.179476,2009.478846,1600.800322,35.072928,0.315578,-0.064867,-0.322152,9,1908.523907,157,73


In [43]:
# Feature Scaling 
submission_val_data = submission_val_data.values
transformed_submission_data = sc.transform(submission_val_data)

In [44]:
#Making predictions
final_predictions_xgb = best_xgb.predict(transformed_submission_data)
final_predictions_cb = best_cb.predict(transformed_submission_data)
final_predictions_lgbm = best_lgbm.predict(transformed_submission_data)

final_predictions = final_predictions_xgb*weights[0] + final_predictions_cb*weights[1] + final_predictions_lgbm*weights[2] 
final_prediction_series = pd.Series(final_predictions)



In [45]:
#Combining the results into dataframe
submission_df = pd.DataFrame({'Longitude':validation_concat['Longitude'].values, 'Latitude':validation_concat['Latitude'].values, 'UHI Index':final_prediction_series.values})

In [46]:
#Displaying the sample submission dataframe
submission_df.head()

Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,0.964462
1,-73.971928,40.788875,0.964386
2,-73.96708,40.78908,0.963518
3,-73.97255,40.789082,0.962195
4,-73.969697,40.787953,0.959113


In [47]:
#Dumping the predictions into a csv file.
submission_df.to_csv("submission.csv",index = False)