In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
import optuna

  from .autonotebook import tqdm as notebook_tqdm


this is a second try where i try to store data in cvs files, afterwards we can analyse what is happening. 

In [2]:
output_folder = "/workspace/code/data/LightGBM/Optuna_cvs"  # <<< MODIFY THIS

# Make sure folder exists
import os
os.makedirs(output_folder, exist_ok=True)

In [5]:
import os
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error

# ------------------------------------------------------
# Settings / Seed
# ------------------------------------------------------
seed = 100
np.random.seed(seed)

# ------------------------------------------------------
# Load Data
# ------------------------------------------------------
df = pd.read_pickle("../data/RDKit/rdkit_only_valid_smiles_qm9.pkl")

X = df.drop(columns=['gaps', 'SMILES'])
y = df['gaps']

# ------------------------------------------------------
# Stratification for Regression
# ------------------------------------------------------
def make_stratified_bins(y, n_bins=10):
    """
    Creates quantile bins for stratified k-fold in regression.
    """
    return pd.qcut(y, q=n_bins, labels=False, duplicates="drop")

# ------------------------------------------------------
# Optuna Objective
# ------------------------------------------------------
def objective(trial, X, y, y_bins):
    
    params = {
        "objective": "regression_l1",
        "metric": "mae",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
    }

    inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    maes = []

    for train_idx, val_idx in inner_cv.split(X, y_bins):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_data = lgb.Dataset(X_train, y_train)
        val_data = lgb.Dataset(X_val, y_val)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1500,
        )

        preds = model.predict(X_val)
        maes.append(mean_absolute_error(y_val, preds))

    return np.mean(maes)

# ------------------------------------------------------
# Nested CV
# ------------------------------------------------------
def fast_nested_cv_stratified(X, y, n_trials=50, output_folder="./results/"):

    os.makedirs(output_folder, exist_ok=True)

    y_bins = make_stratified_bins(y, n_bins=30)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    outer_maes = []
    best_params_list = []
    all_fold_results = []

    fold_num = 1

    for train_idx, test_idx in outer_cv.split(X, y_bins):

        print(f"\n=== Outer Fold {fold_num} ===")

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        y_bins_inner = make_stratified_bins(y_train, n_bins=30)

        # Inner Optuna search
        study = optuna.create_study(direction="minimize")
        study.optimize(
            lambda t: objective(t, X_train, y_train, y_bins_inner),
            n_trials=n_trials
        )

        # Save Optuna trial history
        df_trials = study.trials_dataframe()
        df_trials.to_csv(f"{output_folder}/optuna_outer_fold_{fold_num}.csv", index=False)

        best_params = study.best_params
        best_params["verbosity"] = -1
        best_params["metric"] = "mae"
        best_params_list.append(best_params)

        # Train final model
        train_set = lgb.Dataset(X_train, y_train)
        test_set = lgb.Dataset(X_test, y_test)

        model = lgb.train(
            best_params,
            train_set,
            valid_sets=[test_set],
            num_boost_round=2000,
        )

        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        outer_maes.append(mae)

        print(f"Fold {fold_num} MAE: {mae:.4f}")

        # Save predictions
        df_pred = pd.DataFrame({
            "true": y_test.values,
            "pred": preds,
            "fold": fold_num,
        })
        df_pred.to_csv(f"{output_folder}/fold_{fold_num}_predictions.csv", index=False)

        # Save feature importances
        fi = pd.DataFrame({
            "feature": X.columns,
            "importance": model.feature_importance(),
        })
        fi.to_csv(f"{output_folder}/fold_{fold_num}_feature_importance.csv", index=False)

        all_fold_results.append(df_pred)
        fold_num += 1

    return outer_maes, best_params_list, all_fold_results

# ------------------------------------------------------
# Run Nested CV
# ------------------------------------------------------
outer_maes, best_params_list, all_fold_results = fast_nested_cv_stratified(X, y)

print("\n=== Summary of Outer Fold MAEs ===")
for i, mae in enumerate(outer_maes, 1):
    print(f"Fold {i} MAE: {mae:.4f}")

print(f"Overall MAE: {np.mean(outer_maes):.4f} ± {np.std(outer_maes):.4f}")

print("\nBEST PARAMETERS PER OUTER FOLD:")
for i, params in enumerate(best_params_list, 1):
    print(f"Fold {i}: {params}")



=== Outer Fold 1 ===


[I 2025-12-06 16:41:26,743] A new study created in memory with name: no-name-b417e750-ba90-4afa-b0aa-5f418d7e077d
[I 2025-12-06 17:36:29,255] Trial 0 finished with value: 0.209299131349503 and parameters: {'num_leaves': 82, 'learning_rate': 0.0049870907285596905, 'feature_fraction': 0.9462428653783523, 'bagging_fraction': 0.764194141800638, 'bagging_freq': 3, 'min_data_in_leaf': 93}. Best is trial 0 with value: 0.209299131349503.
[I 2025-12-06 18:00:29,367] Trial 1 finished with value: 0.17270350549577415 and parameters: {'num_leaves': 42, 'learning_rate': 0.06490232225015537, 'feature_fraction': 0.8802843256662316, 'bagging_fraction': 0.8946036437827809, 'bagging_freq': 4, 'min_data_in_leaf': 46}. Best is trial 1 with value: 0.17270350549577415.
[I 2025-12-06 18:18:04,320] Trial 2 finished with value: 0.1691929964815168 and parameters: {'num_leaves': 122, 'learning_rate': 0.02116963653823926, 'feature_fraction': 0.825434914267058, 'bagging_fraction': 0.9061465820406872, 'bagging_freq'

Fold 1 MAE: 0.1417

=== Outer Fold 2 ===


[I 2025-12-07 03:45:02,724] A new study created in memory with name: no-name-3a90456a-70ee-448c-8a57-e7dfc053f107
[I 2025-12-07 03:58:43,503] Trial 0 finished with value: 0.17342074926836465 and parameters: {'num_leaves': 232, 'learning_rate': 0.00744666501919809, 'feature_fraction': 0.7167097771884452, 'bagging_fraction': 0.8487414144205976, 'bagging_freq': 5, 'min_data_in_leaf': 48}. Best is trial 0 with value: 0.17342074926836465.
[I 2025-12-07 04:14:44,630] Trial 1 finished with value: 0.334064756693231 and parameters: {'num_leaves': 226, 'learning_rate': 0.0011526786202366989, 'feature_fraction': 0.8814418697469981, 'bagging_fraction': 0.7717099556113427, 'bagging_freq': 3, 'min_data_in_leaf': 21}. Best is trial 0 with value: 0.17342074926836465.
[I 2025-12-07 04:26:55,266] Trial 2 finished with value: 0.28831750643904364 and parameters: {'num_leaves': 124, 'learning_rate': 0.0015575518550132435, 'feature_fraction': 0.7155477140044098, 'bagging_fraction': 0.8914997231202796, 'bagg

Fold 2 MAE: 0.1408

=== Outer Fold 3 ===


[I 2025-12-07 13:35:41,041] A new study created in memory with name: no-name-d9a19954-8f11-42a2-8e7c-028453b189e0
[I 2025-12-07 13:46:05,609] Trial 0 finished with value: 0.16046125160874775 and parameters: {'num_leaves': 173, 'learning_rate': 0.07914645253886592, 'feature_fraction': 0.9125141011156918, 'bagging_fraction': 0.7217921997427452, 'bagging_freq': 5, 'min_data_in_leaf': 76}. Best is trial 0 with value: 0.16046125160874775.
[I 2025-12-07 13:56:14,262] Trial 1 finished with value: 0.2108651328115298 and parameters: {'num_leaves': 66, 'learning_rate': 0.0055407898128361, 'feature_fraction': 0.8815758807648181, 'bagging_fraction': 0.9862320670132045, 'bagging_freq': 2, 'min_data_in_leaf': 35}. Best is trial 0 with value: 0.16046125160874775.
[I 2025-12-07 14:11:54,573] Trial 2 finished with value: 0.1905889379636941 and parameters: {'num_leaves': 177, 'learning_rate': 0.0044915824004239805, 'feature_fraction': 0.9668949928741443, 'bagging_fraction': 0.9259883638283968, 'bagging_

Fold 3 MAE: 0.1411

=== Outer Fold 4 ===


[I 2025-12-07 22:57:32,061] A new study created in memory with name: no-name-5b982c59-e75e-424e-85a8-74f09aae0b7f
[I 2025-12-07 23:15:02,539] Trial 0 finished with value: 0.2045215461157664 and parameters: {'num_leaves': 237, 'learning_rate': 0.0028138226462603817, 'feature_fraction': 0.8738316262540823, 'bagging_fraction': 0.8670572480016119, 'bagging_freq': 4, 'min_data_in_leaf': 45}. Best is trial 0 with value: 0.2045215461157664.
[I 2025-12-07 23:26:10,121] Trial 1 finished with value: 0.27720281355292037 and parameters: {'num_leaves': 83, 'learning_rate': 0.0018247276397552596, 'feature_fraction': 0.8544018541759384, 'bagging_fraction': 0.8771973122432484, 'bagging_freq': 2, 'min_data_in_leaf': 58}. Best is trial 0 with value: 0.2045215461157664.
[I 2025-12-07 23:33:15,668] Trial 2 finished with value: 0.19598403598201758 and parameters: {'num_leaves': 52, 'learning_rate': 0.012465704202126882, 'feature_fraction': 0.8668118849431821, 'bagging_fraction': 0.8762248747809818, 'baggin

Fold 4 MAE: 0.1430

=== Outer Fold 5 ===


[I 2025-12-08 09:05:59,627] A new study created in memory with name: no-name-2dc752fa-522d-4864-8c45-83178c25c404
[I 2025-12-08 09:20:19,717] Trial 0 finished with value: 0.16122292701276486 and parameters: {'num_leaves': 216, 'learning_rate': 0.023569229435299183, 'feature_fraction': 0.820911515412293, 'bagging_fraction': 0.8986079517197119, 'bagging_freq': 4, 'min_data_in_leaf': 59}. Best is trial 0 with value: 0.16122292701276486.
[I 2025-12-08 09:36:01,326] Trial 1 finished with value: 0.16343153810687322 and parameters: {'num_leaves': 223, 'learning_rate': 0.029123401623202535, 'feature_fraction': 0.8924297554100104, 'bagging_fraction': 0.7045365501575446, 'bagging_freq': 5, 'min_data_in_leaf': 91}. Best is trial 0 with value: 0.16122292701276486.
[I 2025-12-08 09:43:46,157] Trial 2 finished with value: 0.17560664226923142 and parameters: {'num_leaves': 66, 'learning_rate': 0.027886669964724144, 'feature_fraction': 0.8329305409725136, 'bagging_fraction': 0.8785978741719486, 'baggi

Fold 5 MAE: 0.1423

=== Summary of Outer Fold MAEs ===
Fold 1 MAE: 0.1417
Fold 2 MAE: 0.1408
Fold 3 MAE: 0.1411
Fold 4 MAE: 0.1430
Fold 5 MAE: 0.1423
Overall MAE: 0.1418 ± 0.0008

BEST PARAMETERS PER OUTER FOLD:
Fold 1: {'num_leaves': 244, 'learning_rate': 0.07321816510351337, 'feature_fraction': 0.834631822588647, 'bagging_fraction': 0.9208924098748327, 'bagging_freq': 5, 'min_data_in_leaf': 31, 'verbosity': -1, 'metric': 'mae'}
Fold 2: {'num_leaves': 239, 'learning_rate': 0.07578027741042573, 'feature_fraction': 0.7767958495344323, 'bagging_fraction': 0.9225552909296242, 'bagging_freq': 4, 'min_data_in_leaf': 26, 'verbosity': -1, 'metric': 'mae'}
Fold 3: {'num_leaves': 231, 'learning_rate': 0.08982961244272815, 'feature_fraction': 0.7355949667918182, 'bagging_fraction': 0.9978645173144768, 'bagging_freq': 4, 'min_data_in_leaf': 35, 'verbosity': -1, 'metric': 'mae'}
Fold 4: {'num_leaves': 240, 'learning_rate': 0.04635044026286379, 'feature_fraction': 0.9018367769740792, 'bagging_fract

In [6]:
# ===============================================
# Train FINAL model on the full dataset
# ===============================================

# Choose the best params from the best fold
best_index = np.argmin(outer_maes)
final_params = best_params_list[best_index]

print("Using best parameters from outer fold:", best_index + 1)
print(final_params)

# Train on full dataset
train_set_full = lgb.Dataset(X, y)

final_model = lgb.train(
    final_params,
    train_set_full,
    num_boost_round=2500
)

# Save model
final_model.save_model("final_full_dataset_model.txt")
print("Final model saved.")

final_preds = final_model.predict(X)

df_final = pd.DataFrame({
    "true": y,
    "pred": final_preds
})
df_final.to_csv("final_dataset_predictions.csv", index=False)


Using best parameters from outer fold: 2
{'num_leaves': 239, 'learning_rate': 0.07578027741042573, 'feature_fraction': 0.7767958495344323, 'bagging_fraction': 0.9225552909296242, 'bagging_freq': 4, 'min_data_in_leaf': 26, 'verbosity': -1, 'metric': 'mae'}
Final model saved.
