In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every
# file found, so the attached datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join with the directory being visited to get an absolute path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imputed-scaled-lgbm-15/X_dev_scaled_imputed.csv
/kaggle/input/imputed-scaled-lgbm-15/y_dev_scaled.csv
/kaggle/input/imputed-scaled-lgbm-15/X_test_scaled_imputed.csv
/kaggle/input/imputed-scaled-lgbm-15/X_train_scaled_imputed.csv
/kaggle/input/imputed-scaled-lgbm-15/y_test_scaled.csv
/kaggle/input/imputed-scaled-lgbm-15/y_train_scaled.csv
/kaggle/input/hyperparam-helper/y_train.parquet
/kaggle/input/hyperparam-helper/X_train.parquet


In [2]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from tqdm.notebook import tqdm
import joblib


def evaluate_order_kfold(models, order, X, y, n_splits=5):
    """
    Cross-validates a RegressorChain target order and returns its mean MCRMSE.

    For every fold of a shuffled K-fold split (random_state=1), each base model
    is wrapped in a RegressorChain that uses `order`, fitted on the training
    part of the fold and scored with `mcrmse` on the validation part. Scores
    are averaged over the models within a fold and then over the folds.

    Args:
    - models: Iterable of base regressors to wrap in a RegressorChain.
    - order: Chain order forwarded to RegressorChain.
    - X: Feature matrix (NumPy array or pandas DataFrame).
    - y: Target matrix (NumPy array or pandas DataFrame).
    - n_splits: Number of cross-validation folds (default is 5).

    Returns:
    - Mean validation MCRMSE over all folds (lower is better).
    """
    # Imported locally so the helper also works when it is called before the
    # notebook cell that imports these names at module level.
    from sklearn.model_selection import KFold
    from sklearn.multioutput import RegressorChain

    def take_rows(data, row_idx):
        # K-fold yields positional row indices; plain `data[row_idx]` on a
        # DataFrame would be interpreted as a column lookup instead.
        return data.iloc[row_idx] if hasattr(data, "iloc") else data[row_idx]

    # Materialise once so a generator argument is not exhausted after fold 1.
    models = list(models)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_train_fold, X_val_fold = take_rows(X, train_idx), take_rows(X, val_idx)
        y_train_fold, y_val_fold = take_rows(y, train_idx), take_rows(y, val_idx)

        fold_scores = []
        for model in models:
            chain = RegressorChain(model, order=order, random_state=1)
            chain.fit(X_train_fold, y_train_fold)
            y_pred_val = chain.predict(X_val_fold)
            fold_scores.append(mcrmse(y_val_fold, y_pred_val))

        scores.append(np.mean(fold_scores))

    return np.mean(scores)

def greedy_search_order(models, X, y, n_splits=5):
    """
    Greedily builds a target order for a RegressorChain.

    Positions are filled one at a time. For the next free position every
    still-unplaced target is tried, the resulting order is scored with
    `evaluate_order_kfold`, and the target with the lowest score is kept.

    Args:
    - models: Base regressors forwarded to `evaluate_order_kfold`.
    - X: Feature matrix.
    - y: Target matrix; `y.shape[1]` is the number of targets.
    - n_splits: Number of cross-validation folds (default is 5).

    Returns:
    - List with the selected order of target column indices.

    Raises:
    - ValueError: If no candidate in a round yields a comparable (non-NaN) score.
    """
    remaining = list(range(y.shape[1]))
    current_order = []

    # With a single target left there is nothing to compare, so the loop stops
    # early and that target is appended below without another CV run.
    while len(remaining) > 1:
        best_score = float('inf')
        best_target = None

        for target in remaining:
            # RegressorChain only accepts an order that is a full permutation
            # of the target indices, so the candidate prefix is completed with
            # the not-yet-placed targets in their current order.
            unplaced = [t for t in remaining if t != target]
            candidate_order = current_order + [target] + unplaced
            candidate_score = evaluate_order_kfold(models, candidate_order, X, y, n_splits=n_splits)

            if candidate_score < best_score:
                best_score = candidate_score
                best_target = target

        if best_target is None:
            # Every comparison against inf failed, i.e. all scores were NaN.
            raise ValueError("greedy_search_order: no candidate order produced a valid score")

        current_order.append(best_target)
        remaining.remove(best_target)

    current_order.extend(remaining)
    return current_order


def make_predictions(input_df):
    """
    Predicts the target labels for `input_df` with the saved chain ensemble.

    The fitted chains are read from 'chains.joblib' and their blending weights
    from 'weights.joblib' (both relative to the working directory). Each chain
    predicts on `input_df` and the predictions are combined as a weighted sum.

    Args:
    - input_df: Feature dataframe, already preprocessed the same way as the
      training data (this function applies no scaling or imputation itself).

    Returns:
    - final_predictions_df: DataFrame of blended predictions whose columns are
      the names returned by `labels_list()`.
    """
    # Persisted ensemble members and their blending weights
    chains = joblib.load('chains.joblib')
    weights = joblib.load('weights.joblib')

    # One prediction matrix per chain
    per_chain = [member.predict(input_df) for member in chains]

    # Accumulate the weighted sum in a buffer shaped like a single prediction
    blended = np.zeros_like(per_chain[0])
    for member_weight, member_pred in zip(weights, per_chain):
        blended += member_weight * member_pred

    return pd.DataFrame(blended, columns=labels_list())

# def apply_robust_scaling(df):
#     df = df.drop(columns=obj_columns(df))
#     scaler = RobustScaler()
#     df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
#     return df_scaled

def drop_high_nan(df, threshold=0.85):
    """
    Removes columns whose share of missing values exceeds `threshold`.

    Parameters:
    - df: Input DataFrame (not modified).
    - threshold: Maximum tolerated fraction of NaN values per column
      (default is 0.85). Columns strictly above it are removed.

    Returns:
    - New DataFrame without the columns that are mostly missing.
    """
    missing_share = df.isna().mean()
    too_sparse = [name for name, share in missing_share.items() if share > threshold]
    return df.drop(columns=too_sparse)

class DataPreprocessor:
    """
    Standard-scales the numerical (float64 / int64) columns of a DataFrame.

    The numerical columns are recorded when `fit` is called, and the same
    columns in the same order are used by `transform` and `inverse_transform`.
    This keeps the scaler statistics aligned with the columns they were
    computed from, even if a later frame has a different column order.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # Column names seen by `fit`; None until the preprocessor is fitted.
        self.numerical_features_ = None

    def _numerical_columns(self, df):
        # Prefer the columns recorded at fit time; fall back to dtype-based
        # selection for instances that were created without that attribute.
        fitted_columns = getattr(self, 'numerical_features_', None)
        if fitted_columns is not None:
            return fitted_columns
        return df.select_dtypes(include=['float64', 'int64']).columns.tolist()

    def fit(self, df):
        """
        Fits the scaler on the float64 / int64 columns of `df`.

        Returns:
        - self, so calls can be chained.
        """
        self.numerical_features_ = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
        self.scaler.fit(df[self.numerical_features_].values)
        return self

    def transform(self, df):
        """
        Returns a copy of `df` with the fitted numerical columns scaled.
        Other columns are left untouched.
        """
        df_transformed = df.copy()
        numerical_features = self._numerical_columns(df_transformed)
        df_transformed[numerical_features] = self.scaler.transform(df_transformed[numerical_features].values)
        return df_transformed

    def inverse_transform(self, df):
        """
        Returns a copy of `df` with the fitted numerical columns mapped back
        to their original scale.
        """
        df_original = df.copy()
        numerical_features = self._numerical_columns(df_original)
        df_original[numerical_features] = self.scaler.inverse_transform(df_original[numerical_features].values)
        return df_original

class TargetPreprocessor:
    """
    Standard-scales target values and can map scaled values back.

    Parameters:
    - column_names: Optional column names (list, tuple, pandas Index or array)
      used to label the output of `inverse_transform`.
    """

    def __init__(self, column_names=None):
        self.scaler = StandardScaler()
        self.column_names = column_names

    def fit(self, y):
        """
        Fits the scaler on `y`.

        Returns:
        - self, so calls can be chained.
        """
        self.scaler.fit(y)
        return self

    def transform(self, y):
        """
        Scales `y`. A DataFrame input yields a DataFrame with the same columns
        and index; any other input yields the scaler's array output.
        """
        y_array = self.scaler.transform(y)
        if isinstance(y, pd.DataFrame):
            return pd.DataFrame(y_array, columns=y.columns, index=y.index)
        return y_array

    def inverse_transform(self, y_scaled):
        """
        Maps scaled targets back to the original scale.

        Returns:
        - A DataFrame labelled with `column_names` when those were given,
          otherwise a DataFrame mirroring `y_scaled` when that is a DataFrame,
          otherwise the scaler's array output.
        """
        y_array = self.scaler.inverse_transform(y_scaled)
        names = self.column_names
        # Explicit None/length check: the truth value of a pandas Index or a
        # NumPy array is ambiguous and would raise if tested with `if names:`.
        if names is not None and len(names) > 0:
            return pd.DataFrame(y_array, columns=names)
        if isinstance(y_scaled, pd.DataFrame):
            return pd.DataFrame(y_array, columns=y_scaled.columns, index=y_scaled.index)
        return y_array


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from lightgbm import LGBMRegressor
import pandas as pd

class IterativeDataImputer:
    """
    Imputes missing values in the numerical (float64 / int64) columns of a
    DataFrame with scikit-learn's IterativeImputer driven by a LightGBM
    regressor.

    Parameters:
    - max_iter: Forwarded to IterativeImputer (default is 10).
    - random_state: Forwarded to IterativeImputer.
    - verbose: Forwarded to IterativeImputer (default is 0).
    """

    def __init__(self, max_iter=10, random_state=None, verbose=0):
        # Fixed LightGBM settings used as the per-feature estimator.
        self.lgbm_regressor = LGBMRegressor(
            learning_rate=0.05,
            n_estimators=300,
            max_depth=6,
            num_leaves=64,
            feature_fraction=0.5,
            bagging_fraction=0.7,
            bagging_freq=1,
            n_jobs=-1,
        )
        self.imputer = IterativeImputer(
            estimator=self.lgbm_regressor,
            max_iter=max_iter,
            random_state=random_state,
            verbose=verbose,
        )

    def fit(self, df):
        """Fits the imputer on the numerical columns of `df` and returns self."""
        # Remember which columns were numerical so transform() reuses them.
        self.numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
        numeric_part = df[self.numerical_cols]
        self.imputer.fit(numeric_part)
        return self

    def transform(self, df):
        """Returns a copy of `df` whose numerical columns have been imputed."""
        imputed_values = self.imputer.transform(df[self.numerical_cols])
        result = df.copy()
        result[self.numerical_cols] = imputed_values
        return result

    def fit_transform(self, df):
        """Fits on `df` and returns the imputed copy of the same frame."""
        return self.fit(df).transform(df)



# def apply_knn_iterative_imputation(df, knn_n_neighbors, max_iter, random_state=None):
#     warnings.warn("it is a good idea to apply scaling before imputation,ignore if already applied, if not apply_robust_scaling() and apply_standard_scaling() functions can be used")
#     """
#     Apply KNN imputation for columns with less than 30% missing data and Iterative imputation for columns with more.
    
#     Parameters:
#     - df: The input dataframe with missing values.
#     - knn_n_neighbors: Number of neighboring samples to use for KNN imputation.
#     - max_iter: Maximum number of imputation iterations for IterativeImputer.
#     - random_state: Seed used by the random number generator for IterativeImputer.
    
#     Returns:
#     - df: DataFrame with imputed values.
#     """
    
#     # Identify columns based on missing data threshold
#     missing_data = df.isna().mean()
#     cols_lt_30 = missing_data[missing_data < 0.3].index.tolist()
#     cols_gt_30 = missing_data[missing_data >= 0.3].index.tolist()

#     # KNN imputation for columns with < 30% missing data
#     if len(cols_lt_30) > 0:
#         print("KNN imputer for columns with < 30% missing data started")
#         knn_imputer = KNNImputer(n_neighbors=knn_n_neighbors)
#         df[cols_lt_30] = knn_imputer.fit_transform(df[cols_lt_30])
    
#     # Iterative imputation for columns with >= 30% missing data
#     if len(cols_gt_30) > 0:
#         print("Iterative imputer for columns with >= 30% missing data started")
#         iter_imputer = IterativeImputer(estimator=DecisionTreeRegressor(), max_iter=max_iter, random_state=random_state)
#         df[cols_gt_30] = iter_imputer.fit_transform(df[cols_gt_30])

#     return df

def load_csv_data(filepath):
    """Reads the CSV at `filepath` with pandas defaults and returns the DataFrame."""
    return pd.read_csv(filepath)

def load_parquet_data(filepath):
    """Reads the Parquet file at `filepath` with pandas defaults and returns the DataFrame."""
    return pd.read_parquet(filepath)

def labels_list():
    """Returns the names of the six target columns, in their fixed order."""
    return [
        'Mean_BMI',
        'Median_BMI',
        'Unmet_Need_Rate',
        'Under5_Mortality_Rate',
        'Skilled_Birth_Attendant_Rate',
        'Stunted_Rate',
    ]

def heatmap(df):
    """
    Shows the missing-value pattern of `df` as a binary heatmap
    (one cell per value, coloured by whether it is null).
    """
    missing_mask = df.isnull()
    plt.figure(figsize=(40, 30))  # large canvas for the figure
    sns.heatmap(missing_mask, cbar=False, cmap='binary')
    plt.show()

def one_hot_encode(df, columns):
    """
    One-hot encodes the given columns with `pd.get_dummies`.

    Parameters:
    - df: Input DataFrame (not modified).
    - columns: Column names to encode.

    Returns:
    - New DataFrame in which the listed columns are replaced by dummy columns.
    """
    return pd.get_dummies(df, columns=columns)

def country_region_mapping(df):
    """
    Derives country and region columns from the 'DHSID' column.

    Adds three columns to `df` IN PLACE and returns the same object:
    - 'Country_Code': first run of ASCII letters found in 'DHSID'
      (the value "DHS" is rewritten to "BD").
    - 'target_region': region name looked up from the country code
      (NaN for codes that are not in the table).
    - 'target_country': integer id per distinct country code, numbered in
      order of first appearance.

    Parameters:
    - df: DataFrame with a string column 'DHSID'.

    Returns:
    - The input DataFrame with the three columns added.
    """
    # Regions are applied in this order and later entries overwrite earlier
    # ones. NOTE(review): "BD", "NP", "MD" and "LB" are listed under two
    # regions each, so they end up in the later one - confirm this is intended.
    region_codes = (
        ("East Asia & Pacific",
         ("BD", "KH", "ID", "LA", "MM", "NP", "PH", "TL", "VN", "GU")),
        ("Central Asia",
         ("TJ", "UZ", "KG", "KZ")),
        ("Europe & Central Asia",
         ("AL", "AM", "AZ", "GE", "MD", "MK", "RS", "TR", "UA")),
        ("Sub-Saharan Africa",
         ("AO", "BJ", "BF", "BU", "CM", "CF", "CI", "CD", "ET", "GA", "GH",
          "GN", "GY", "KE", "KM", "LS", "LB", "MD", "MW", "ML", "MZ", "NG",
          "NM", "RW", "SN", "SL", "SZ", "TD", "TG", "ZA", "TZ", "UG", "ZM",
          "ZW")),
        ("North Africa & Middle East",
         ("EG", "JO", "YE", "MA", "LB", "MB")),
        ("South Asia",
         ("AF", "BD", "IN", "PK", "NP", "IA")),
        ("Latin America & Caribbean",
         ("BO", "CO", "DR", "HT", "HN", "MX", "PE", "NI")),
    )
    country_to_region = {}
    for region_name, codes in region_codes:
        for code in codes:
            country_to_region[code] = region_name

    # Leading letters of the survey id, with the mislabeled "DHS" prefix
    # corrected to "BD".
    extracted = df['DHSID'].str.extract(r'([A-Za-z]+)')[0]
    df['Country_Code'] = extracted.mask(extracted == "DHS", "BD")

    df['target_region'] = df['Country_Code'].map(country_to_region)
    country_ids, _ = df['Country_Code'].factorize()
    df['target_country'] = country_ids

    return df

def merge_dataframes(df, df2):
    """
    Inner-joins two dataframes on their 'DHSID' column.

    Parameters:
    - df: Main (left) DataFrame.
    - df2: Secondary (right) DataFrame.

    Returns:
    - New DataFrame with the rows whose 'DHSID' occurs in both inputs.
    """
    return pd.merge(df, df2, on='DHSID', how='inner')

def obj_columns(df):
    """Returns the names of the columns of `df` whose dtype is 'object', as a list."""
    return [name for name, dtype in df.dtypes.items() if dtype == 'object']

def split_data_country_wise(df):
    """
    Splits the data into train / dev / test sets separately for every country.

    Each country's rows are split 80% / 10% / 10% (train / dev / test) with
    `train_test_split(random_state=1)`, and the per-country pieces are
    concatenated with a fresh integer index. The targets are the columns
    returned by `labels_list()`; every other column (including
    'Country_Code') stays in the feature frames.

    Parameters:
    - df: DataFrame containing 'Country_Code' and all label columns.

    Returns:
    - Tuple (X_train, X_dev, X_test, y_train, y_dev, y_test).

    Raises:
    - Exception: If `df` has no 'Country_Code' column.
    - Errors from `train_test_split` propagate unchanged, e.g. when a country
      has too few rows to be split twice.
    """
    if 'Country_Code' not in df.columns:
        raise Exception("dataframe doesnt have a column named Country_Code, use country_region_mapping(df) to get it")

    # Resolve the target columns here; there is no module-level `labels`.
    labels = labels_list()
    X = df.drop(columns=labels)
    y = df[labels]

    X_train_list, X_dev_list, X_test_list = [], [], []
    y_train_list, y_dev_list, y_test_list = [], [], []

    for country in df['Country_Code'].unique():
        # One boolean mask per country, shared by features and targets.
        in_country = df['Country_Code'] == country
        X_country = X[in_country]
        y_country = y[in_country]

        # First cut: 80% train, 20% held out.
        X_train_country, X_temp, y_train_country, y_temp = train_test_split(
            X_country, y_country, test_size=0.2, random_state=1)
        # Second cut: the held-out part is halved into dev and test.
        X_dev_country, X_test_country, y_dev_country, y_test_country = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=1)

        X_train_list.append(X_train_country)
        X_dev_list.append(X_dev_country)
        X_test_list.append(X_test_country)

        y_train_list.append(y_train_country)
        y_dev_list.append(y_dev_country)
        y_test_list.append(y_test_country)

    X_train = pd.concat(X_train_list, ignore_index=True)
    X_dev = pd.concat(X_dev_list, ignore_index=True)
    X_test = pd.concat(X_test_list, ignore_index=True)

    y_train = pd.concat(y_train_list, ignore_index=True)
    y_dev = pd.concat(y_dev_list, ignore_index=True)
    y_test = pd.concat(y_test_list, ignore_index=True)

    return X_train, X_dev, X_test, y_train, y_dev, y_test

def add_temporal_features(df):
    """
    Appends per-country temporal features to the dataframe.

    The frame is sorted by 'Country_Code' and 'DHSYEAR'. For every column that
    is neither of object dtype nor one of the labels, the following columns
    are added:
    1. '<col>_yearly_diff': difference to the previous row of the same
       country in the sorted frame.
    2. '<col>_mean', '<col>_median', '<col>_std': the country-level statistic
       of the column, repeated on every row of that country.

    Parameters:
    - df: DataFrame with 'Country_Code' and 'DHSYEAR' columns.

    Returns:
    - Sorted copy of the DataFrame with the additional columns appended.
    """
    # Everything that is not an object column or a label counts as a feature.
    skipped = obj_columns(df) + labels_list()
    numeric_features = [name for name in df.columns if name not in skipped]

    ordered = df.sort_values(by=['Country_Code', 'DHSYEAR'])
    by_country = ordered.groupby('Country_Code')

    # All row-to-row differences come first ...
    derived = [
        by_country[name].diff().rename(f"{name}_yearly_diff")
        for name in numeric_features
    ]

    # ... followed by mean / median / std for each feature in turn.
    for name in numeric_features:
        for statistic in ('mean', 'median', 'std'):
            derived.append(by_country[name].transform(statistic).rename(f"{name}_{statistic}"))

    return pd.concat([ordered, *derived], axis=1)

def nan_percentage(df):
    """
    Summarises how much of each column is missing.

    Args:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    - pd.DataFrame: Two columns, 'Column Name' and 'NaN %', sorted by
      'NaN %' from highest to lowest.
    """
    missing_pct = df.isna().mean().mul(100)
    summary = pd.DataFrame({
        'Column Name': missing_pct.index,
        'NaN %': missing_pct.values,
    })
    return summary.sort_values(by='NaN %', ascending=False)

def mcrmse(y_true, y_pred):
    """
    Mean column-wise root mean squared error (MCRMSE).

    The RMSE is computed per column (over axis 0) and the column RMSEs are
    then averaged into a single number.
    """
    squared_error = np.square(y_true - y_pred)
    rmse_per_column = np.sqrt(np.mean(squared_error, axis=0))
    return np.mean(rmse_per_column)



In [3]:
# Load the feature splits and their matching target splits from the
# imputed-scaled-lgbm-15 dataset.
X_train_scaled_imputed = load_csv_data("/kaggle/input/imputed-scaled-lgbm-15/X_train_scaled_imputed.csv")
y_train_scaled = load_csv_data("/kaggle/input/imputed-scaled-lgbm-15/y_train_scaled.csv")

X_dev_scaled_imputed = load_csv_data("/kaggle/input/imputed-scaled-lgbm-15/X_dev_scaled_imputed.csv")
y_dev_scaled = load_csv_data("/kaggle/input/imputed-scaled-lgbm-15/y_dev_scaled.csv")

X_test_scaled_imputed = load_csv_data("/kaggle/input/imputed-scaled-lgbm-15/X_test_scaled_imputed.csv")
y_test_scaled = load_csv_data("/kaggle/input/imputed-scaled-lgbm-15/y_test_scaled.csv")

# Per-column medians of the numerical training features. They come from the
# training split only and are reused for dev and test.
medians = X_train_scaled_imputed.select_dtypes(include=['float64', 'int64']).median()

# Fill any remaining NaNs in every split with those training medians.
X_train_scaled_imputed = X_train_scaled_imputed.fillna(medians)
X_dev_scaled_imputed = X_dev_scaled_imputed.fillna(medians)
X_test_scaled_imputed = X_test_scaled_imputed.fillna(medians)

In [4]:
# Load the training features and targets from the hyperparam-helper dataset.
# NOTE(review): presumably these are the unscaled versions, since the scalers
# in the next cell are fitted on them - confirm.
X_train = load_parquet_data("/kaggle/input/hyperparam-helper/X_train.parquet")
y_train = load_parquet_data("/kaggle/input/hyperparam-helper/y_train.parquet")

In [5]:
# Scaler for the numerical feature columns
features_preprocessor = DataPreprocessor()

# Fit the feature scaler on the training features only
features_preprocessor.fit(X_train)

# Scaler for the targets; later used to map predictions back to the
# original target scale
target_preprocessor = TargetPreprocessor()

# Fit the target scaler on the training targets only
target_preprocessor.fit(y_train)

In [6]:
# import numpy as np
# from sklearn.multioutput import RegressorChain
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
# import optuna
# from sklearn.model_selection import KFold
# optuna.logging.set_verbosity(optuna.logging.INFO)

# def objective(trial):
#     model_type = trial.suggest_categorical("model_type", ["xgb", "lgb", "cat"])
#     order = [0, 1, 5, 4, 3, 2]
#     kf = KFold(n_splits=5, shuffle=True, random_state=1)
#     scores = []

#     for fold_num, (train_idx, valid_idx) in enumerate(kf.split(X_train_scaled_imputed)):
#         X_train_fold = X_train_scaled_imputed.iloc[train_idx]
#         y_train_fold = y_train_scaled.iloc[train_idx]
#         X_valid_fold = X_train_scaled_imputed.iloc[valid_idx]
#         y_valid_fold = y_train_scaled.iloc[valid_idx]

#         if model_type == "xgb":
#             model = XGBRegressor(
#                 random_state=1,
#                 objective='reg:squarederror',
#                 learning_rate=trial.suggest_float("xgb_learning_rate", 0.01, 0.3, log=True),
#                 n_estimators=trial.suggest_int("xgb_n_estimators", 50, 300),
#                 max_depth=trial.suggest_int("xgb_max_depth", 2, 10),
#                 min_child_weight=trial.suggest_int("xgb_min_child_weight", 1, 10),
#                 subsample=trial.suggest_float("xgb_subsample", 0.5, 1),
#                 colsample_bytree=trial.suggest_float("xgb_colsample_bytree", 0.5, 1),
#                 reg_alpha=trial.suggest_float("xgb_reg_alpha", 1e-3, 10.0, log=True),
#                 reg_lambda=trial.suggest_float("xgb_reg_lambda", 1e-3, 10.0, log=True),
#                 verbosity=0
#             )

#         elif model_type == "lgb":
#             model = LGBMRegressor(
#                 random_state=1,
#                 learning_rate=trial.suggest_float("lgb_learning_rate", 0.01, 0.3, log=True),
#                 n_estimators=trial.suggest_int("lgb_n_estimators", 50, 300),
#                 max_depth=trial.suggest_int("lgb_max_depth", 2, 10),
#                 num_leaves=trial.suggest_int("lgb_num_leaves", 2, 2**trial.suggest_int("lgb_max_depth", 2, 10)),
#                 min_child_samples=trial.suggest_int("lgb_min_child_samples", 5, 100),
#                 subsample=trial.suggest_float("lgb_subsample", 0.5, 1),
#                 colsample_bytree=trial.suggest_float("lgb_colsample_bytree", 0.5, 1),
#                 reg_alpha=trial.suggest_float("lgb_reg_alpha", 1e-3, 10.0, log=True),
#                 reg_lambda=trial.suggest_float("lgb_reg_lambda", 1e-3, 10.0, log=True),
#                 verbose=-1
#             )

#         else:
#             model = CatBoostRegressor(
#                 random_seed=1,
#                 learning_rate=trial.suggest_float("cat_learning_rate", 0.01, 0.3, log=True),
#                 n_estimators=trial.suggest_int("cat_n_estimators", 50, 300),
#                 depth=trial.suggest_int("cat_depth", 2, 10),
#                 l2_leaf_reg=trial.suggest_float("cat_l2_leaf_reg", 1e-3, 10.0, log=True),
#                 border_count=trial.suggest_int("cat_border_count", 5, 200),
#                 subsample=trial.suggest_float("cat_subsample", 0.5, 1),
#                 loss_function='RMSE',
#                 od_type='Iter',
#                 od_wait=10,
#                 verbose=0
#             )
        
#         chain = RegressorChain(model, order=order)
#         chain.fit(X_train_fold, y_train_fold)

#         y_pred_fold = chain.predict(X_valid_fold)
#         y_valid_fold_unscaled = target_preprocessor.inverse_transform(y_valid_fold)
#         y_pred_original = target_preprocessor.inverse_transform(y_pred_fold)
        
#         fold_score = mcrmse(y_valid_fold, y_pred_original)
#         scores.append(fold_score)
        
#         print(f"Model {model_type}, Fold {fold_num + 1} completed with score: {fold_score}")

#     mean_score = np.mean(scores)
#     print(f"Finished iteration with model_type: {model_type}. Mean score: {mean_score}")
#     return mean_score

# pruner = optuna.pruners.MedianPruner()
# study = optuna.create_study(direction="minimize", pruner=pruner)
# study.optimize(objective, n_trials=120, n_jobs=-1)

# best_trials = {}
# for trial in study.trials:
#     if trial.state == optuna.trial.TrialState.COMPLETE:
#         model_type = trial.params['model_type']
#         if model_type not in best_trials or trial.value < best_trials[model_type].value:
#             best_trials[model_type] = trial

# best_params_dict = {}
# for model_type, trial in best_trials.items():
#     print(f"Best parameters for {model_type}: {trial.params}")
#     print(f"Score: {trial.value}")
#     print('-' * 50)
#     best_params_dict[model_type] = trial.params

# np.save('best_hyperparams.npy', best_params_dict)

In [7]:
import numpy as np
from sklearn.multioutput import RegressorChain
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import KFold
optuna.logging.set_verbosity(optuna.logging.INFO)

def objective(trial):
    """
    Optuna objective: samples a model family and its hyperparameters, trains
    a RegressorChain on the scaled training split and scores it on the dev
    split.

    Args:
    - trial: Optuna trial used to sample the model type ("xgb", "lgb" or
      "cat") and the hyperparameters of that model.

    Returns:
    - MCRMSE between the dev targets and the dev predictions, both mapped
      back through `target_preprocessor.inverse_transform`; or None when any
      exception is raised (the error is printed).

    NOTE(review): returning None presumably makes Optuna record the trial as
    failed rather than stopping the study - confirm this is the intent.
    """
    try:
        # 1. Model selection
        model_type = trial.suggest_categorical("model_type", ["xgb", "lgb", "cat"])

        # Fixed chain order of the target columns
        order = [0, 1, 5, 4, 3, 2]

        # 2. Hyperparameters for the chosen model family
        if model_type == "xgb":
            xgb_params = {
                "random_state": 1,
                "learning_rate": trial.suggest_float("xgb_learning_rate", 0.01, 0.3, log=True),
                "n_estimators": trial.suggest_int("xgb_n_estimators", 50, 300),
                "max_depth": trial.suggest_int("xgb_max_depth", 2, 10),
                "min_child_weight": trial.suggest_int("xgb_min_child_weight", 1, 10),
                "subsample": trial.suggest_float("xgb_subsample", 0.5, 1),
                "colsample_bytree": trial.suggest_float("xgb_colsample_bytree", 0.5, 1),
                "reg_alpha": trial.suggest_float("xgb_reg_alpha", 1e-3, 10.0, log=True),
                "reg_lambda": trial.suggest_float("xgb_reg_lambda", 1e-3, 10.0, log=True),
            }
            model = XGBRegressor(**xgb_params)
        elif model_type == "lgb":
            lgb_learning_rate = trial.suggest_float("lgb_learning_rate", 0.01, 0.3, log=True)
            lgb_n_estimators = trial.suggest_int("lgb_n_estimators", 50, 300)
            lgb_max_depth = trial.suggest_int("lgb_max_depth", 2, 10)
            # The upper bound for the leaf count depends on the sampled depth.
            lgb_num_leaves = trial.suggest_int("lgb_num_leaves", 2, 2**lgb_max_depth)
            # NOTE(review): LightGBM documents that bagging only takes effect
            # with a non-zero bagging frequency; none is set here, so
            # `lgb_subsample` may have no effect - confirm.
            model = LGBMRegressor(
                random_state=1,
                learning_rate=lgb_learning_rate,
                n_estimators=lgb_n_estimators,
                max_depth=lgb_max_depth,
                num_leaves=lgb_num_leaves,
                min_child_samples=trial.suggest_int("lgb_min_child_samples", 5, 100),
                subsample=trial.suggest_float("lgb_subsample", 0.5, 1),
                colsample_bytree=trial.suggest_float("lgb_colsample_bytree", 0.5, 1),
                reg_alpha=trial.suggest_float("lgb_reg_alpha", 1e-3, 10.0, log=True),
                reg_lambda=trial.suggest_float("lgb_reg_lambda", 1e-3, 10.0, log=True),
            )
        else:
            cat_params = {
                "random_seed": 1,
                "learning_rate": trial.suggest_float("cat_learning_rate", 0.01, 0.3, log=True),
                "n_estimators": trial.suggest_int("cat_n_estimators", 50, 300),
                "depth": trial.suggest_int("cat_depth", 2, 10),
                "l2_leaf_reg": trial.suggest_float("cat_l2_leaf_reg", 1e-3, 10.0, log=True),
                "border_count": trial.suggest_int("cat_border_count", 5, 200),
                "subsample": trial.suggest_float("cat_subsample", 0.5, 1),
                "verbose": 0,
            }
            model = CatBoostRegressor(**cat_params)

        # 3. Train on the training split
        chain = RegressorChain(model, order=order)
        chain.fit(X_train_scaled_imputed, y_train_scaled)

        # 4. Score on the dev split in the original target scale
        dev_pred_original = target_preprocessor.inverse_transform(chain.predict(X_dev_scaled_imputed))
        dev_true_original = target_preprocessor.inverse_transform(y_dev_scaled)
        return mcrmse(dev_true_original, dev_pred_original)

    except Exception as e:
        print(f"Error in trial {trial.number}: {e}")
        return None

# Hyperparameter optimization
# NOTE(review): `objective` never reports intermediate values, so the median
# pruner presumably has nothing to act on - confirm whether it is needed.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="minimize", pruner=pruner)
study.optimize(objective, n_trials=120, n_jobs=-1)

# Keep the best completed trial of every model family. `best_trials` has to
# be built here: without it the loop below raises NameError after the whole
# search has already run.
best_trials = {}
for trial in study.trials:
    if trial.state != optuna.trial.TrialState.COMPLETE:
        continue  # failed or pruned trials carry no usable score
    model_type = trial.params['model_type']
    incumbent = best_trials.get(model_type)
    if incumbent is None or trial.value < incumbent.value:
        best_trials[model_type] = trial

# Report the winners and collect their parameters per model family.
best_params_dict = {}
for model_type, trial in best_trials.items():
    print(f"Best parameters for {model_type}: {trial.params}")
    print(f"Score: {trial.value}")
    print('-' * 50)
    best_params_dict[model_type] = trial.params

# The dict is stored as a pickled object array, so reading it back requires
# np.load(..., allow_pickle=True).item().
np.save('best_hyperparams.npy', best_params_dict)

[I 2023-08-13 06:56:40,224] A new study created in memory with name: no-name-a0bf01d2-c03e-42fb-8d14-d2be7ccf98d5
[I 2023-08-13 07:00:48,418] Trial 0 finished with value: 10.144405145710238 and parameters: {'model_type': 'lgb', 'lgb_learning_rate': 0.011071359353951064, 'lgb_n_estimators': 173, 'lgb_max_depth': 2, 'lgb_num_leaves': 4, 'lgb_min_child_samples': 76, 'lgb_subsample': 0.8111577482332275, 'lgb_colsample_bytree': 0.8588335935711795, 'lgb_reg_alpha': 0.001058033324136389, 'lgb_reg_lambda': 0.00965811744483018}. Best is trial 0 with value: 10.144405145710238.
[I 2023-08-13 07:03:28,003] Trial 2 finished with value: 9.073500382219398 and parameters: {'model_type': 'cat', 'cat_learning_rate': 0.1460804564320867, 'cat_n_estimators': 131, 'cat_depth': 2, 'cat_l2_leaf_reg': 2.658839115618, 'cat_border_count': 8, 'cat_subsample': 0.9665934789076048}. Best is trial 2 with value: 9.073500382219398.
[I 2023-08-13 07:06:03,361] Trial 5 finished with value: 10.602011659352149 and paramete