In [37]:
# Import necessary libraries for data manipulation and machine learning
import numpy as np                 # For numerical operations on arrays and matrices
import pandas as pd                # For data manipulation and analysis
import matplotlib.pyplot as plt    # For creating static visualizations
import seaborn as sns              # For statistical data visualization based on matplotlib

# Import modules for model evaluation and selection
from sklearn.model_selection import train_test_split, cross_val_score    # For splitting data and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score    # For model evaluation metrics

# Import machine learning algorithms
from sklearn.linear_model import LinearRegression    # For linear regression
from sklearn.ensemble import RandomForestRegressor  # For random forest regression
from sklearn.svm import SVR                          # For support vector regression
import xgboost as xgb                                # For XGBoost regression
import lightgbm as lgb                                # For LightGBM regression
from catboost import CatBoostRegressor               # For CatBoost regression

# Import modules for advanced model stacking techniques
from sklearn.ensemble import StackingRegressor        # For stacking multiple regressors
from mlxtend.regressor import StackingCVRegressor     # For stacked generalization with cross-validation

# Import additional libraries for hyperparameter tuning
import optuna    # For hyperparameter optimization

# Import metrics for additional model evaluation
from sklearn import metrics
# Import category_encoders for encoding categorical features
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [38]:
# Define the path to the CSV file containing the data
path = r'C:\Users\User\Desktop\Rashad\DATA\Life Expectancy Data.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(path)

# Display the DataFrame to view the loaded data
df

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
0,Afghanistan,2015,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,...,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1,65.0
1,Afghanistan,2014,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,59.9
2,Afghanistan,2013,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9,59.9
3,Afghanistan,2012,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,...,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8,59.5
4,Afghanistan,2011,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,723.0,27,4.36,0.000000,68.0,31,27.1,...,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2,44.3
2934,Zimbabwe,2003,Developing,715.0,26,4.06,0.000000,7.0,998,26.7,...,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5,44.5
2935,Zimbabwe,2002,Developing,73.0,25,4.43,0.000000,73.0,304,26.3,...,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0,44.8
2936,Zimbabwe,2001,Developing,686.0,25,1.72,0.000000,76.0,529,25.9,...,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8,45.3


In [39]:
df['Life expectancy '] = df['Life expectancy '].fillna(df['Life expectancy '].mean())

In [40]:
# Create copies of the DataFrame for different preprocessing approaches
df_copy = df.copy()
df_boost = df.copy()
df_cat = df.copy()

In [41]:
df_copy = df_copy.drop(columns=['Country', 'Year'])
df_copy['Status'] = df_copy['Status'].map({'Developing': 0, 'Developed': 1})

In [42]:
models = ["linear_regression", "random_forest", "svr", "xgboost", "lightgbm", "catboost", "catboost_custom"]


# Iterate through each model in the list
for model in models:
    if model == "linear_regression" or model == "svr":
        # Preprocessing for linear regression and SVR
        
        # Fill missing values with mean for numerical columns and mode for categorical columns
        for col in df_copy.columns:
            if df_copy[col].dtype in ['int64', 'float64']:
                df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
            elif df_copy[col].dtype == 'object':
                df_copy[col] = df_copy[col].fillna(df_copy[col].mode().iloc[0])
        df_copy.drop_duplicates(inplace=True)
                
    elif model in ["random_forest", "xgboost", "lightgbm", "catboost"]:
        for col in df_boost.columns:
            if df_boost[col].dtype in ['int64', 'float64']:
                df_boost[col] = df_boost[col].fillna(df_boost[col].mean())
            elif df_boost[col].dtype == 'object':
                df_boost[col] = df_boost[col].fillna(df_boost[col].mode().iloc[0])
        dum_data = pd.get_dummies(df_boost, drop_first=True)
             
    elif model == "catboost_custom":
        columns_to_fill = ['Country','Status']  # Assume these are the categorical columns
        df_cat[columns_to_fill] = df_cat[columns_to_fill].fillna('Missing Value')
        

# Split data into features (X) and target (y) for each preprocessing approach
x_log = df_copy.drop(columns=['Life expectancy '])
y_log = df_copy['Life expectancy ']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_log, y_log, test_size=0.3, random_state=42)

x_boost = dum_data.drop(columns=['Life expectancy '])
y_boost = dum_data['Life expectancy ']
X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(x_boost, y_boost, test_size=0.3, random_state=42)

x_cat = df_cat.drop(columns=['Life expectancy '])
y_cat = df_cat['Life expectancy ']
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(x_cat, y_cat, test_size=0.3, random_state=42)

In [43]:
df_copy.isnull().sum()

Status                             0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
Life expectancy                    0
dtype: int64

In [44]:
models = []

# Define default models for each algorithm
xgb_model_def = xgb.XGBRegressor()
lgb_model_def = lgb.LGBMRegressor()
catboost_model_def = CatBoostRegressor()
catboost_model_custom = CatBoostRegressor(cat_features=['Country','Status'])
lg = LinearRegression()
rf = RandomForestRegressor()
svc_model_def = SVR()  # Add SVR model definition

# Define models for stacking
stacking_models = [('XGBoost', xgb_model_def),
                   ('LightGBM', lgb_model_def),
                   ('CatBoost', catboost_model_def),
                   ('CatBoost_Custom', catboost_model_custom),
                   ('LinearRegression', lg),
                   ('RandomForest', rf),
                   ('SVR', svc_model_def)]  # Add SVR to stacking_models

# Extend models list with default models
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom),
    ('LinearRegression', lg),
    ('RandomForest', rf),
    ('SVR', svc_model_def)  # Add SVR to models list
])

In [45]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the given model on the training and testing data.

    Parameters:
    model_name (str): Name of the model for display purposes.
    model : Machine learning model object.
    X_train : Features of the training data.
    y_train : Target labels of the training data.
    X_test : Features of the testing data.
    y_test : Target labels of the testing data.

    Returns:
    float: R-squared score calculated from the model's predictions.
    """

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels on the testing data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R-squared:', r2)

    return r2

In [46]:
r2_df = pd.DataFrame(columns=['Model', 'R2'])

# Iterate through each model in the list of models
for model_name, model in models:
    # Train and evaluate the model, and calculate the R-squared score
    if model_name == 'CatBoost_Custom':
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    elif model_name in ['LinearRegression', 'SVR']:
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_log, y_train_log, X_test_log, y_test_log)
    else:
        r2_score_value = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)
        
    # Add model name and R2 score to the DataFrame
    if r2_score_value is not None:
        r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2': [r2_score_value]})], ignore_index=True)

# Sort the DataFrame by R2 score in descending order
r2_df_sorted = r2_df.sort_values(by='R2', ascending=False)

Model Performance for XGBoost
Mean Absolute Error: 1.1774590160861984
Mean Squared Error: 3.3904557409771177
R-squared: 0.9637194883710133
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3312
[LightGBM] [Info] Number of data points in the train set: 2056, number of used features: 20
[LightGBM] [Info] Start training from score 69.395112
Model Performance for LightGBM
Mean Absolute Error: 1.2176356489288758
Mean Squared Error: 3.4318289446854395
R-squared: 0.9632767629343923
Learning rate set to 0.045881
0:	learn: 9.0875048	total: 9.16ms	remaining: 9.15s
1:	learn: 8.7856217	total: 17.4ms	remaining: 8.7s
2:	learn: 8.5050437	total: 26ms	remaining: 8.64s
3:	learn: 8.2173789	total: 34.2ms	remaining: 8.52s
4:	learn: 7.9716455	total: 42.4ms	remaining: 8.45s
5:	learn: 7.7232393	total: 50.5ms	remaining: 8.37s
6:	learn: 7.4854150	total: 58.8ms	remaining: 8

In [47]:
r2_df_sorted

Unnamed: 0,Model,R2
3,CatBoost_Custom,0.970382
2,CatBoost,0.96742
5,RandomForest,0.964382
0,XGBoost,0.963719
1,LightGBM,0.963277
4,LinearRegression,0.821231
6,SVR,-0.118747


# Random Forest, Xgboost, LightGBM, Catboost with Optuna

In [62]:
def best_params_for_model(trial):
    """
    Define the search space for hyperparameters and train the XGBoost model with the suggested hyperparameters.

    Parameters:
    - trial: Optuna's Trial object for sampling hyperparameters

    Returns:
    - auc: Mean R-squared score from cross-validation using the suggested hyperparameters
    """
    # Define the search space for hyperparameters
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),                   # Number of boosting rounds
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),          # Step size shrinkage
        'max_depth': trial.suggest_int('max_depth', 3, 10),                             # Maximum depth of a tree
        'subsample': trial.suggest_uniform('subsample', 0.5, 1),                         # Fraction of training data used for each tree
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),           # Fraction of features used for each tree
        'gamma': trial.suggest_int('gamma', 0, 5)                                       # Minimum loss reduction required to make a further partition
    }
    
    # Initialize XGBoost regressor with suggested hyperparameters
    xgb_reg = xgb.XGBRegressor(**param)

    # Calculate mean R-squared score from cross-validation
    auc = cross_val_score(xgb_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()

    return auc

# Create a study object to optimize hyperparameters
study = optuna.create_study(direction='maximize')

# Optimize hyperparameters using Optuna
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

# Initialize the best XGBoost model with the best hyperparameters
best_xgb_model = xgb.XGBRegressor(**best_params)

[I 2024-05-20 00:15:38,674] A new study created in memory with name: no-name-6d122efe-9142-4b10-95e3-266c3a8196d5
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),          # Step size shrinkage
  'subsample': trial.suggest_uniform('subsample', 0.5, 1),                         # Fraction of training data used for each tree
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),           # Fraction of features used for each tree
[I 2024-05-20 00:15:43,395] Trial 0 finished with value: 0.923034332616949 and parameters: {'n_estimators': 548, 'learning_rate': 0.687981867785152, 'max_depth': 4, 'subsample': 0.6942859476147375, 'colsample_bytree': 0.9871828357777984, 'gamma': 2}. Best is trial 0 with value: 0.923034332616949.


Best trial:
  Value: 0.923
  Params:  {'n_estimators': 548, 'learning_rate': 0.687981867785152, 'max_depth': 4, 'subsample': 0.6942859476147375, 'colsample_bytree': 0.9871828357777984, 'gamma': 2}


In [63]:
def best_params_for_model(trial):
    """
    Define the search space for hyperparameters and train the LightGBM model with the suggested hyperparameters.

    Parameters:
    - trial: Optuna's Trial object for sampling hyperparameters

    Returns:
    - auc: Mean R-squared score from cross-validation using the suggested hyperparameters
    """
    # Define the search space for hyperparameters
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),                   # Number of boosting rounds
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),          # Step size shrinkage
        'max_depth': trial.suggest_int('max_depth', 3, 10),                             # Maximum depth of a tree
        'num_leaves': trial.suggest_int('num_leaves', 10, 100)                           # Maximum number of leaves in one tree
    }
    
    # Initialize LightGBM regressor with suggested hyperparameters
    lgb_reg = lgb.LGBMRegressor(**param)

    # Calculate mean R-squared score from cross-validation
    auc = cross_val_score(lgb_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()

    return auc

# Create a study object to optimize hyperparameters
study = optuna.create_study(direction='maximize')

# Optimize hyperparameters using Optuna
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

# Initialize the best LightGBM model with the best hyperparameters
best_lgb_model = lgb.LGBMRegressor(**best_params)

[I 2024-05-20 00:16:07,901] A new study created in memory with name: no-name-aeec75dc-d16c-4ff3-a438-e068e49d2463
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),          # Step size shrinkage
[I 2024-05-20 00:16:12,068] Trial 0 finished with value: 0.9201464930481723 and parameters: {'n_estimators': 159, 'learning_rate': 0.024114706538073963, 'max_depth': 3, 'num_leaves': 70}. Best is trial 0 with value: 0.9201464930481723.


Best trial:
  Value: 0.920
  Params:  {'n_estimators': 159, 'learning_rate': 0.024114706538073963, 'max_depth': 3, 'num_leaves': 70}


In [64]:
def best_params_for_model(trial):
    """
    Define the search space for hyperparameters and train the CatBoost model with the suggested hyperparameters.

    Parameters:
    - trial: Optuna's Trial object for sampling hyperparameters

    Returns:
    - auc: Mean R-squared score from cross-validation using the suggested hyperparameters
    """
    # Define the search space for hyperparameters
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),                     # Number of boosting iterations
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),          # Step size shrinkage
        'depth': trial.suggest_int('depth', 3, 10),                                    # Depth of the trees
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),                # L2 regularization term
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE'])           # Loss function for regression tasks
    }

    # Initialize CatBoost regressor with suggested hyperparameters
    cb_reg = CatBoostRegressor(**param)

    # Calculate mean R-squared score from cross-validation
    auc = cross_val_score(cb_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()

    return auc

# Create a study object to optimize hyperparameters
study = optuna.create_study(direction='maximize')

# Optimize hyperparameters using Optuna
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

# Initialize the best CatBoost model with the best hyperparameters
best_cb_model = CatBoostRegressor(**best_params)

[I 2024-05-20 00:16:53,742] A new study created in memory with name: no-name-e98617c9-87fb-4b3f-8ba9-326d082b8e28
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),          # Step size shrinkage
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),                # L2 regularization term
[I 2024-05-20 00:18:10,030] Trial 0 finished with value: 0.9465471791738568 and parameters: {'iterations': 754, 'learning_rate': 0.21665114341149053, 'depth': 9, 'l2_leaf_reg': 0.7531306670934673, 'loss_function': 'RMSE'}. Best is trial 0 with value: 0.9465471791738568.


Best trial:
  Value: 0.947
  Params:  {'iterations': 754, 'learning_rate': 0.21665114341149053, 'depth': 9, 'l2_leaf_reg': 0.7531306670934673, 'loss_function': 'RMSE'}


In [68]:
def best_params_for_model(trial):
    """
    Define the search space for hyperparameters and train the Random Forest model with the suggested hyperparameters.

    Parameters:
    - trial: Optuna's Trial object for sampling hyperparameters

    Returns:
    - auc: Mean R-squared score from cross-validation using the suggested hyperparameters
    """
    # Define the search space for hyperparameters
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),                   # Number of trees in the forest
        'max_depth': trial.suggest_int('max_depth', 3, 20),                            # Maximum depth of the trees
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),            # Minimum number of samples required to split a node
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),               # Minimum number of samples required to be at a leaf node
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),  # Number of features to consider when looking for the best split
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])             # Whether bootstrap samples are used when building trees
    }

    # Initialize Random Forest regressor with suggested hyperparameters
    rf_reg = RandomForestRegressor(**param)

    # Use a smaller subset for cross-validation to speed up the process
    auc = cross_val_score(rf_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()

    return auc

# Create a study object to optimize hyperparameters
study = optuna.create_study(direction='maximize')

# Optimize hyperparameters using Optuna with a reduced number of trials
study.optimize(best_params_for_model, n_trials=20)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

# Initialize the best Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(**best_params)
best_rf_model.fit(X_train_boost, y_train_boost)

[I 2024-05-20 00:21:56,740] A new study created in memory with name: no-name-1173f48e-2713-4f69-b95d-0fdba533c3d9
[I 2024-05-20 00:22:00,036] Trial 0 finished with value: 0.8985296576713931 and parameters: {'n_estimators': 32, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 0.8985296576713931.
[I 2024-05-20 00:22:01,887] Trial 1 finished with value: 0.8974101473597381 and parameters: {'n_estimators': 163, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.8985296576713931.
[I 2024-05-20 00:22:02,532] Trial 2 finished with value: 0.8086753315283873 and parameters: {'n_estimators': 89, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.8985296576713931.
[I 2024-05-20 00:22:03,727] Trial 3 finished with value: 0.8803921946058821 a

Best trial:
  Value: 0.945
  Params:  {'n_estimators': 60, 'max_depth': 13, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': True}


In [69]:
# Initialize an empty list to store optimized models
models_optimized = []

# Add optimized models to the list
models_optimized.extend([
    ('XGBoost Optuna', best_xgb_model),        # Add optimized XGBoost model
    ('LightGBM Optuna', best_lgb_model),       # Add optimized LightGBM model
    ('CatBoost Optuna', best_cb_model),        # Add optimized CatBoost model
    ('RandomForest Optuna', best_rf_model)     # Add optimized Random Forest model
])

In [70]:
# Initialize an empty DataFrame to store model names and their corresponding R-squared scores for optimized models
r2_df_optuna = pd.DataFrame(columns=['Model', 'R2'])

# Iterate over each optimized model in the list of models_optimized
for model_name, model in models_optimized:
    # Train and evaluate the optimized model
    r2 = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost,X_test_boost, y_test_boost)
    
    # If the R-squared score is not None, add it to the DataFrame
    if r2 is not None:
        r2_df_optuna = pd.concat([r2_df_optuna, pd.DataFrame({'Model': [model_name], 'R2': [r2]})], ignore_index=True)

# Sort the DataFrame by R-squared score in descending order for optimized models
r2_df_sorted_optuna = r2_df_optuna.sort_values(by='R2', ascending=False)

Model Performance for XGBoost Optuna
Mean Absolute Error: 1.6903095021498442
Mean Squared Error: 6.254253715556601
R-squared: 0.9330746241823848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3312
[LightGBM] [Info] Number of data points in the train set: 2056, number of used features: 20
[LightGBM] [Info] Start training from score 69.395112
Model Performance for LightGBM Optuna
Mean Absolute Error: 1.783500815883585
Mean Squared Error: 6.17495653084404
R-squared: 0.9339231656918165
0:	learn: 7.7772047	total: 41.1ms	remaining: 30.9s
1:	learn: 6.5232067	total: 81ms	remaining: 30.5s
2:	learn: 5.4923079	total: 121ms	remaining: 30.2s
3:	learn: 4.7202694	total: 162ms	remaining: 30.4s
4:	learn: 4.1521204	total: 203ms	remaining: 30.5s
5:	learn: 3.6818695	total: 243ms	remaining: 30.3s
6:	learn: 3.3606357	total: 253ms	remaining: 27s
7:	learn: 3.0768324	t

In [71]:
# Display the DataFrame with model names and their corresponding R-squared scores for optimized models, sorted by R-squared score in descending order
r2_df_sorted_optuna

Unnamed: 0,Model,R2
3,RandomForest Optuna,0.963467
2,CatBoost Optuna,0.962459
1,LightGBM Optuna,0.933923
0,XGBoost Optuna,0.933075


In [72]:
# Concatenate the DataFrames containing the sorted R-squared scores for both non-optimized and optimized models
final_review = pd.concat([r2_df_sorted, r2_df_sorted_optuna], axis=0)

# Sort the concatenated DataFrame by R-squared score in descending order
final_review_sorted = final_review.sort_values(by='R2', ascending=False)

# Reset the index of the sorted DataFrame to start from 0
final_review_sorted.reset_index(drop=True, inplace=True)

# Display the sorted DataFrame
final_review_sorted

Unnamed: 0,Model,R2
0,CatBoost_Custom,0.970382
1,CatBoost,0.96742
2,RandomForest,0.964382
3,XGBoost,0.963719
4,RandomForest Optuna,0.963467
5,LightGBM,0.963277
6,CatBoost Optuna,0.962459
7,LightGBM Optuna,0.933923
8,XGBoost Optuna,0.933075
9,LinearRegression,0.821231
