In [None]:
import pandas as pd, numpy as np, glob
from pathlib import Path

MAKE_RUL = True

txt_files = sorted(glob.glob("*.txt"))
if not txt_files:
    raise FileNotFoundError("No .txt files found. Upload them to Colab (left sidebar > Files).")

dfs = []
for f in txt_files:
    name = Path(f).stem.upper()
    df_i = pd.read_csv(f, sep=r"\s+", header=None, engine="python")
    df_i = df_i.dropna(axis=1, how="all")
    df_i.columns = [f"{name}_{i+1}" for i in range(df_i.shape[1])]
    dfs.append(df_i)

combined = pd.concat(dfs, axis=1, join="inner").reset_index(drop=True)
combined["cycle"] = np.arange(1, len(combined) + 1)
if MAKE_RUL:
    combined["RUL"] = len(combined) - combined["cycle"]

combined.to_csv("combined_raw.csv", index=False)
print("Saved combined_raw.csv with shape:", combined.shape)

from google.colab import files
files.download("combined_raw.csv")


Saved combined_raw.csv with shape: (2205, 43682)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

df = pd.read_csv("combined_raw.csv")

In [None]:
df.head()

Unnamed: 0,CE_1,CE_2,CE_3,CE_4,CE_5,CE_6,CE_7,CE_8,CE_9,CE_10,...,VS1_53,VS1_54,VS1_55,VS1_56,VS1_57,VS1_58,VS1_59,VS1_60,cycle,RUL
0,47.202,47.273,47.25,47.332,47.213,47.372,47.273,47.438,46.691,46.599,...,0.545,0.553,0.553,0.539,0.544,0.545,0.535,0.543,1,2204
1,29.208,28.822,28.805,28.922,28.591,28.643,28.216,27.812,27.514,27.481,...,0.548,0.544,0.536,0.542,0.54,0.533,0.531,0.534,2,2203
2,23.554,23.521,23.527,23.008,23.042,23.052,22.658,22.952,22.908,22.359,...,0.543,0.554,0.544,0.544,0.545,0.544,0.53,0.534,3,2202
3,21.54,21.419,21.565,20.857,21.052,21.039,20.926,20.912,20.989,20.882,...,0.553,0.543,0.553,0.555,0.544,0.543,0.543,0.542,4,2201
4,20.46,20.298,20.35,19.867,19.997,19.972,19.924,19.813,19.691,19.634,...,0.544,0.552,0.539,0.54,0.549,0.542,0.533,0.537,5,2200


Aggregation

In [None]:
feat_cols = [c for c in df.columns if c not in ("cycle","RUL")]

In [None]:
groups = {}
for c in feat_cols:
    base = c.split("_", 1)[0]  # e.g., "PS1" from "PS1_7"
    groups.setdefault(base, []).append(c)

In [None]:
agg = pd.DataFrame(index=df.index)

In [None]:
for base, cols in groups.items():
    sub = df[cols]
    agg[f"{base}_mean"] = sub.mean(axis=1)
    agg[f"{base}_std"]  = sub.std(axis=1)
    agg[f"{base}_min"]  = sub.min(axis=1)
    agg[f"{base}_max"]  = sub.max(axis=1)

In [None]:
agg["cycle"] = df["cycle"]
if "RUL" in df.columns:
    agg["RUL"] = df["RUL"]

In [None]:
drop_cols = [c for c in agg.columns if c not in ("cycle","RUL") and (agg[c].isna().all() or agg[c].nunique(dropna=True) <= 1)]
agg.drop(columns=drop_cols, inplace=True)

In [None]:
agg.to_csv("combined_agg.csv", index=False)
print("Saved combined_agg.csv with shape:", agg.shape)

from google.colab import files
files.download("combined_agg.csv")

Saved combined_agg.csv with shape: (2205, 67)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

In [None]:
df = pd.read_csv("combined_raw.csv")

X = df.drop(columns=["cycle","RUL"], errors="ignore").fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
pca = PCA(n_components=0.95, svd_solver="full")
X_pca = pca.fit_transform(X_scaled)

In [None]:

pca_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(X_pca.shape[1])])
pca_df["cycle"] = df["cycle"]
if "RUL" in df.columns:
    pca_df["RUL"] = df["RUL"]

pca_df.to_csv("combined_pca.csv", index=False)
print("Saved combined_pca.csv with shape:", pca_df.shape)

Saved combined_pca.csv with shape: (2205, 12)


In [None]:
from google.colab import files
files.download("combined_pca.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,cycle,RUL
0,131.911495,5.730017,56.635294,92.880102,-2.162674,20.790799,-1.674422,-1.559861,1.052762,35.562544,1,2204
1,107.751468,-4.291306,51.248461,82.721358,5.054692,17.129746,6.667877,-2.049505,5.044791,42.782061,2,2203
2,87.829885,-13.144525,45.073801,74.915156,11.403935,11.781055,7.227534,-0.350343,-0.099052,24.44381,3,2202
3,68.799974,-18.392718,43.028172,65.915215,13.913981,6.101824,7.057391,1.268847,-1.774402,1.735435,4,2201
4,53.044233,-24.084111,38.583612,58.563037,12.548991,8.070167,7.102952,1.083395,3.547539,27.774647,5,2200


In [None]:
agg_df= pd.read_csv("combined_agg.csv")

In [None]:
agg_df.head()

Unnamed: 0,CE_mean,CE_std,CE_min,CE_max,CP_mean,CP_std,CP_min,CP_max,EPS1_mean,EPS1_std,...,TS4_mean,TS4_std,TS4_min,TS4_max,VS1_mean,VS1_std,VS1_min,VS1_max,cycle,RUL
0,39.60135,6.370535,28.866,47.438,1.86275,0.279385,1.383,2.188,2538.929167,185.616121,...,31.74525,1.116478,30.363,33.594,0.57695,0.027078,0.532,0.624,1,2204
1,25.786433,1.686129,23.32,29.208,1.25555,0.074605,1.147,1.414,2531.4989,189.940089,...,34.493867,0.435312,33.648,35.148,0.56585,0.027241,0.524,0.626,2,2203
2,22.218233,0.638345,21.22,23.554,1.113217,0.023263,1.076,1.159,2519.928,190.73685,...,35.64615,0.293889,35.098,36.141,0.576533,0.036729,0.529,0.662,3,2202
3,20.459817,0.455755,19.673,21.565,1.06215,0.024621,1.022,1.107,2511.541633,191.270607,...,36.579467,0.262397,36.105,36.988,0.569267,0.033464,0.527,0.645,4,2201
4,19.787017,0.290156,19.133,20.46,1.070467,0.021477,1.016,1.106,2503.4495,191.258369,...,37.4279,0.239571,36.992,37.781,0.577367,0.033484,0.524,0.66,5,2200


In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

In [None]:

X = agg_df.drop(columns=["RUL"], errors="ignore")
y = agg_df["RUL"] if "RUL" in agg_df.columns else None

if y is None:
    raise ValueError("No RUL column found. Set MAKE_RUL=True in Step 1 if you want RUL training.")

In [None]:
split = int(len(agg_df) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [None]:
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print("MAE (cycles):", round(mean_absolute_error(y_test, pred), 2))

MAE (cycles): 244.79


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
X

Unnamed: 0,CE_mean,CE_std,CE_min,CE_max,CP_mean,CP_std,CP_min,CP_max,EPS1_mean,EPS1_std,...,TS3_max,TS4_mean,TS4_std,TS4_min,TS4_max,VS1_mean,VS1_std,VS1_min,VS1_max,cycle
0,39.601350,6.370535,28.866,47.438,1.862750,0.279385,1.383,2.188,2538.929167,185.616121,...,38.613,31.745250,1.116478,30.363,33.594,0.576950,0.027078,0.532,0.624,1
1,25.786433,1.686129,23.320,29.208,1.255550,0.074605,1.147,1.414,2531.498900,189.940089,...,39.254,34.493867,0.435312,33.648,35.148,0.565850,0.027241,0.524,0.626,2
2,22.218233,0.638345,21.220,23.554,1.113217,0.023263,1.076,1.159,2519.928000,190.736850,...,40.062,35.646150,0.293889,35.098,36.141,0.576533,0.036729,0.529,0.662,3
3,20.459817,0.455755,19.673,21.565,1.062150,0.024621,1.022,1.107,2511.541633,191.270607,...,40.934,36.579467,0.262397,36.105,36.988,0.569267,0.033464,0.527,0.645,4
4,19.787017,0.290156,19.133,20.460,1.070467,0.021477,1.016,1.106,2503.449500,191.258369,...,41.777,37.427900,0.239571,36.992,37.781,0.577367,0.033484,0.524,0.660,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2200,46.628517,0.237772,46.242,47.092,2.160600,0.018171,2.127,2.196,2543.911033,187.530212,...,38.371,30.404733,0.032142,30.363,30.488,0.550833,0.029820,0.512,0.618,2201
2201,46.689817,0.392929,45.762,47.486,2.151450,0.023372,2.094,2.200,2543.411333,187.062031,...,38.418,30.416233,0.037951,30.367,30.488,0.547483,0.021463,0.507,0.584,2202
2202,46.472300,0.462510,45.683,47.206,2.143300,0.027183,2.099,2.188,2542.729767,186.846232,...,38.371,30.426250,0.045413,30.367,30.496,0.545233,0.024047,0.502,0.597,2203
2203,46.544967,0.242202,45.846,47.001,2.148483,0.017777,2.107,2.188,2544.046333,186.799816,...,38.363,30.414283,0.038725,30.363,30.488,0.537017,0.017652,0.502,0.585,2204


In [None]:
cols_to_drop = [c for c in ["cycle","RUL"] if c in X.columns]
if cols_to_drop:
    X = X.drop(columns=cols_to_drop)

In [None]:
X

Unnamed: 0,CE_mean,CE_std,CE_min,CE_max,CP_mean,CP_std,CP_min,CP_max,EPS1_mean,EPS1_std,...,TS3_min,TS3_max,TS4_mean,TS4_std,TS4_min,TS4_max,VS1_mean,VS1_std,VS1_min,VS1_max
0,39.601350,6.370535,28.866,47.438,1.862750,0.279385,1.383,2.188,2538.929167,185.616121,...,38.316,38.613,31.745250,1.116478,30.363,33.594,0.576950,0.027078,0.532,0.624
1,25.786433,1.686129,23.320,29.208,1.255550,0.074605,1.147,1.414,2531.498900,189.940089,...,38.668,39.254,34.493867,0.435312,33.648,35.148,0.565850,0.027241,0.524,0.626
2,22.218233,0.638345,21.220,23.554,1.113217,0.023263,1.076,1.159,2519.928000,190.736850,...,39.234,40.062,35.646150,0.293889,35.098,36.141,0.576533,0.036729,0.529,0.662
3,20.459817,0.455755,19.673,21.565,1.062150,0.024621,1.022,1.107,2511.541633,191.270607,...,40.023,40.934,36.579467,0.262397,36.105,36.988,0.569267,0.033464,0.527,0.645
4,19.787017,0.290156,19.133,20.460,1.070467,0.021477,1.016,1.106,2503.449500,191.258369,...,40.859,41.777,37.427900,0.239571,36.992,37.781,0.577367,0.033484,0.524,0.660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2200,46.628517,0.237772,46.242,47.092,2.160600,0.018171,2.127,2.196,2543.911033,187.530212,...,38.168,38.371,30.404733,0.032142,30.363,30.488,0.550833,0.029820,0.512,0.618
2201,46.689817,0.392929,45.762,47.486,2.151450,0.023372,2.094,2.200,2543.411333,187.062031,...,38.148,38.418,30.416233,0.037951,30.367,30.488,0.547483,0.021463,0.507,0.584
2202,46.472300,0.462510,45.683,47.206,2.143300,0.027183,2.099,2.188,2542.729767,186.846232,...,38.156,38.371,30.426250,0.045413,30.367,30.496,0.545233,0.024047,0.502,0.597
2203,46.544967,0.242202,45.846,47.001,2.148483,0.017777,2.107,2.188,2544.046333,186.799816,...,38.145,38.363,30.414283,0.038725,30.363,30.488,0.537017,0.017652,0.502,0.585


In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train_scaled, y_train)
pred = gb.predict(X_test_scaled)

mae = mean_absolute_error(y_test, pred)
print(f"MAE on normalized data: {mae:.2f}")

MAE on normalized data: 244.79


In [None]:
X = pca_df.drop(columns=["RUL"], errors="ignore")
y = pca_df["RUL"] if "RUL" in pca_df.columns else None

if y is None:
    raise ValueError("No RUL column found. Set MAKE_RUL=True in Step 1 if you want RUL training.")

In [None]:
cols_to_drop = [c for c in ["cycle","RUL"] if c in X.columns]
if cols_to_drop:
    X = X.drop(columns=cols_to_drop)

In [None]:
X

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,131.911495,5.730017,56.635294,92.880102,-2.162674,20.790799,-1.674422,-1.559861,1.052762,35.562544
1,107.751468,-4.291306,51.248461,82.721358,5.054692,17.129746,6.667877,-2.049505,5.044791,42.782061
2,87.829885,-13.144525,45.073801,74.915156,11.403935,11.781055,7.227534,-0.350343,-0.099052,24.443810
3,68.799974,-18.392718,43.028172,65.915215,13.913981,6.101824,7.057391,1.268847,-1.774402,1.735435
4,53.044233,-24.084111,38.583612,58.563037,12.548991,8.070167,7.102952,1.083395,3.547539,27.774647
...,...,...,...,...,...,...,...,...,...,...
2200,232.360126,57.603657,-26.042754,-26.258872,26.103380,6.480373,-2.486874,0.797672,0.152290,-13.168554
2201,230.206853,57.044951,-26.066726,-27.321343,24.952832,6.116068,-3.056352,1.257862,0.815266,-11.759404
2202,229.490719,57.344507,-25.610081,-27.732711,26.400304,6.492343,-3.899027,1.242306,0.997848,-10.568130
2203,232.180094,57.333948,-26.838573,-25.880453,27.255251,7.074151,-2.210294,1.089441,0.158462,-10.096714


In [None]:
split = int(len(pca_df) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [None]:
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print("MAE (cycles):", round(mean_absolute_error(y_test, pred), 2))

MAE (cycles): 284.77


In [None]:
!pip -q install optuna xgboost lightgbm catboost

import optuna, warnings, pickle, re
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from lightgbm import early_stopping, log_evaluation
warnings.filterwarnings("ignore")

In [None]:
DATASETS = ["combined_agg.csv", "combined_pca.csv"]
N_TRIALS = 300

In [None]:
def load_dataset(path):
    df = pd.read_csv(path)
    if "RUL" not in df.columns:
        raise ValueError(f"{path} has no 'RUL' column. Re-run Step 1 with MAKE_RUL=True or add a target.")

    split_idx = int(len(df) * 0.8)
    train_df, test_df = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
    drop_cols = [c for c in ["RUL", "cycle"] if c in train_df.columns]
    X_train = train_df.drop(columns=["RUL"])
    y_train = train_df["RUL"]
    X_test  = test_df.drop(columns=["RUL"])
    y_test  = test_df["RUL"]
    return X_train, y_train, X_test, y_test

def optuna_objective_factory(X_train, y_train, X_test, y_test):
    def objective(trial: optuna.Trial):
        algo = trial.suggest_categorical("algo", [
            "xgb", "lgbm_gbdt", "lgbm_dart", "cat",
            "rf", "extratrees", "gbr", "hgb", "adaboost", "dtr"
        ])

        if algo == "xgb":
            params = dict(
                n_estimators=trial.suggest_int("n_estimators", 500, 2000),
                max_depth=trial.suggest_int("max_depth", 3, 10),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
                subsample=trial.suggest_float("subsample", 0.6, 1.0),
                colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
                reg_lambda=trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
                min_child_weight=trial.suggest_float("min_child_weight", 1.0, 10.0),
                tree_method="hist",
                random_state=42,
            )
            model = XGBRegressor(**params)
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        elif algo in ("lgbm_gbdt", "lgbm_dart"):
            params = dict(
                boosting_type="dart" if algo == "lgbm_dart" else "gbdt",
                n_estimators=trial.suggest_int("n_estimators", 600, 2500),
                num_leaves=trial.suggest_int("num_leaves", 31, 255),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
                subsample=trial.suggest_float("subsample", 0.6, 1.0),
                colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
                reg_lambda=trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
                random_state=42
            )
            model = LGBMRegressor(**params)
            model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            callbacks=[
             early_stopping(stopping_rounds=50),
             log_evaluation(period=0)
           ]
)
        elif algo == "cat":
            params = dict(
                iterations=trial.suggest_int("iterations", 600, 3000),
                depth=trial.suggest_int("depth", 4, 10),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
                l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
                random_seed=42,
                loss_function="MAE",
                verbose=False
            )
            model = CatBoostRegressor(**params)
            model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

        elif algo == "rf":
            params = dict(
                n_estimators=trial.suggest_int("n_estimators", 300, 2000),
                max_depth=trial.suggest_int("max_depth", 4, 30),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
                max_features=trial.suggest_float("max_features", 0.3, 1.0),
                random_state=42, n_jobs=-1
            )
            model = RandomForestRegressor(**params).fit(X_train, y_train)

        elif algo == "extratrees":
            params = dict(
                n_estimators=trial.suggest_int("n_estimators", 400, 2500),
                max_depth=trial.suggest_int("max_depth", 4, 30),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
                max_features=trial.suggest_float("max_features", 0.3, 1.0),
                random_state=42, n_jobs=-1
            )
            model = ExtraTreesRegressor(**params).fit(X_train, y_train)

        elif algo == "gbr":
            params = dict(
                n_estimators=trial.suggest_int("n_estimators", 300, 2000),
                max_depth=trial.suggest_int("max_depth", 2, 6),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
                subsample=trial.suggest_float("subsample", 0.6, 1.0),
                random_state=42
            )
            model = GradientBoostingRegressor(**params).fit(X_train, y_train)

        elif algo == "hgb":
            params = dict(
                max_depth=trial.suggest_int("max_depth", 3, 12),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
                l2_regularization=trial.suggest_float("l2_regularization", 1e-4, 10.0, log=True),
                max_iter=trial.suggest_int("max_iter", 300, 2000)
            )
            model = HistGradientBoostingRegressor(**params).fit(X_train, y_train)

        elif algo == "adaboost":
            base_depth = trial.suggest_int("base_depth", 1, 6)
            base = DecisionTreeRegressor(max_depth=base_depth, random_state=42)
            params = dict(
                n_estimators=trial.suggest_int("n_estimators", 200, 1500),
                learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
                random_state=42
            )
            model = AdaBoostRegressor(estimator=base, **params).fit(X_train, y_train)

        else:  # dtr
            params = dict(
                max_depth=trial.suggest_int("max_depth", 3, 30),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
                random_state=42
            )
            model = DecisionTreeRegressor(**params).fit(X_train, y_train)

        pred = model.predict(X_test)
        return mean_absolute_error(y_test, pred)
    return objective

results = []

for ds in DATASETS:
    if not Path(ds).exists():
        print(f"Skipping {ds} (not found)")
        continue

    print(f"\n Running Optuna on {ds} ...")
    X_train, y_train, X_test, y_test = load_dataset(ds)

    study = optuna.create_study(direction="minimize")
    study.optimize(optuna_objective_factory(X_train, y_train, X_test, y_test),
                   n_trials=N_TRIALS, show_progress_bar=True)

    best_params = study.best_params.copy()
    best_algo   = best_params.pop("algo")
    best_mae    = study.best_value


    def build_final(algo, p):
        if algo == "xgb":
            return XGBRegressor(**p, tree_method="hist", random_state=42)
        if algo in ("lgbm_gbdt","lgbm_dart"):
            return LGBMRegressor(**p, random_state=42)
        if algo == "cat":
            return CatBoostRegressor(**p, random_seed=42, loss_function="MAE", verbose=False)
        if algo == "rf":
            return RandomForestRegressor(**p, random_state=42, n_jobs=-1)
        if algo == "extratrees":
            return ExtraTreesRegressor(**p, random_state=42, n_jobs=-1)
        if algo == "gbr":
            return GradientBoostingRegressor(**p, random_state=42)
        if algo == "hgb":
            return HistGradientBoostingRegressor(**p)
        if algo == "adaboost":
            base_depth = p.pop("base_depth")
            base = DecisionTreeRegressor(max_depth=base_depth, random_state=42)
            return AdaBoostRegressor(estimator=base, **p, random_state=42)
        if algo == "dtr":
            return DecisionTreeRegressor(**p, random_state=42)

    final_model = build_final(best_algo, best_params)
    final_model.fit(X_train, y_train)
    final_pred = final_model.predict(X_test)
    final_mae  = mean_absolute_error(y_test, final_pred)

    print(f"✅ {ds}: Best {best_algo.upper()} | MAE={round(final_mae,3)}")
    results.append({
        "dataset": ds,
        "algo": best_algo,
        "mae": float(final_mae),
        "model": final_model,
        "X_columns": list(X_train.columns)
    })


if not results:
    raise SystemExit("No datasets were found. Make sure combined_agg.csv and/or combined_pca.csv exist.")

winners = sorted(results, key=lambda r: r["mae"])
best = winners[0]
print("\n OVERALL WINNER")
print(f"Dataset: {best['dataset']}")
print(f"Model:   {best['algo'].upper()}")
print(f"MAE:     {round(best['mae'], 3)}")

[I 2025-10-10 18:29:04,114] A new study created in memory with name: no-name-eb68fc20-10fd-4a79-a42a-86f078e0be7c



🔎 Running Optuna on combined_agg.csv ...


  0%|          | 0/300 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[I 2025-10-10 20:44:32,634] Trial 297 finished with value: 210.56096975478502 and parameters: {'algo': 'lgbm_dart', 'n_estimators': 2116, 'num_leaves': 145, 'learning_rate': 0.01000389652129227, 'subsample': 0.9190352521375323, 'colsample_bytree': 0.9511660096079313, 'reg_lambda': 0.0018256676396251554}. Best is trial 153 with value: 207.46920195964933.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15756
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 65
[LightGBM] [Info] Start training from score 1322.500000
[I 2025-10-10 20:44:56,431] Trial 298 finished with value: 218.20960639070816 and parameters: {'algo': 'lgbm_dart', 'n_estimators': 2026, 'num_leaves': 218, 'learning_rate': 0.010613680135915106, 'subsample': 0.9016952458746135, 'cols

[I 2025-10-10 20:45:24,663] A new study created in memory with name: no-name-49fdd58e-4ea1-4bd9-b3b8-804f1e4e7bf0


✅ combined_agg.csv: Best LGBM_DART | MAE=242.912

🔎 Running Optuna on combined_pca.csv ...


  0%|          | 0/300 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[I 2025-10-10 21:27:28,641] Trial 278 finished with value: 243.2726163379417 and parameters: {'algo': 'lgbm_dart', 'n_estimators': 715, 'num_leaves': 58, 'learning_rate': 0.08112696101177065, 'subsample': 0.6004791610904714, 'colsample_bytree': 0.5808449996729115, 'reg_lambda': 0.011767691123491877}. Best is trial 251 with value: 154.08896359289042.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 11
[LightGBM] [Info] Start training from score 1322.500000
[I 2025-10-10 21:27:32,038] Trial 279 finished with value: 189.74531584186997 and parameters: {'algo': 'lgbm_dart', 'n_estimators': 796, 'num_leaves': 37, 'learning_rate': 0.010986428172174871, 'subsample': 0.6172264443585666, 'colsample_b

In [None]:
best_trial = study.best_trial
print("Best value:", best_trial.value)
print("Best params:", best_trial.params)
print("User attrs:", best_trial.user_attrs)

Best value: 154.08896359289042
Best params: {'algo': 'lgbm_dart', 'n_estimators': 643, 'num_leaves': 54, 'learning_rate': 0.010059098661726718, 'subsample': 0.6514300381261922, 'colsample_bytree': 0.6147451098727468, 'reg_lambda': 0.19142075949699222}
User attrs: {}


In [None]:

import pandas as pd
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

WIN_DS = "combined_pca.csv"

BEST_PARAMS = {
    "boosting_type": "dart",
    "n_estimators": 643,
    "num_leaves": 54,
    "learning_rate": 0.010059098661726718,
    "subsample": 0.6514300381261922,
    "colsample_bytree": 0.6147451098727468,
    "reg_lambda": 0.19142075949699222,


    "deterministic": True,
    "feature_fraction_seed": 42,
    "bagging_seed": 42,
    "force_row_wise": True,
    "random_state": 42
}


df = pd.read_csv(WIN_DS)
split = int(len(df) * 0.8)
train_df, test_df = df.iloc[:split], df.iloc[split:]

X_train = train_df.drop(columns=[c for c in ["RUL","cycle"] if c in train_df.columns])
y_train = train_df["RUL"]
X_test  = test_df.drop(columns=[c for c in ["RUL","cycle"] if c in test_df.columns])
y_test  = test_df["RUL"]


lgb = LGBMRegressor(**BEST_PARAMS)
lgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(100), log_evaluation(period=0)]
)
pred = lgb.predict(X_test)
print("Refit MAE (DART):", round(mean_absolute_error(y_test, pred), 3))


params_gbdt = {**BEST_PARAMS, "boosting_type": "gbdt"}
lgb_gbdt = LGBMRegressor(**params_gbdt)
lgb_gbdt.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(100), log_evaluation(period=0)]
)
pred2 = lgb_gbdt.predict(X_test)
print("Refit MAE (GBDT fallback):", round(mean_absolute_error(y_test, pred2), 3))


[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 10
[LightGBM] [Info] Start training from score 1322.500000
Refit MAE (DART): 179.685
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 10
[LightGBM] [Info] Start training from score 1322.500000
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[610]	valid_0's l2: 105494
Refit MAE (GBDT fallback): 291.106


In [None]:
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error

WIN_DS = "combined_pca.csv"
BEST_PARAMS = {
    "boosting_type": "dart",
    "n_estimators": 643,
    "num_leaves": 54,
    "learning_rate": 0.010059098661726718,
    "subsample": 0.6514300381261922,
    "colsample_bytree": 0.6147451098727468,
    "reg_lambda": 0.19142075949699222,
    "deterministic": True, "feature_fraction_seed": 42, "bagging_seed": 42,
    "force_row_wise": True, "random_state": 42,
    "objective": "regression_l1", "metric": "l1"
}

df = pd.read_csv(WIN_DS)
split = int(len(df)*0.8)
train_df, test_df = df.iloc[:split], df.iloc[split:]


X_train = train_df.drop(columns=["RUL"], errors="ignore")
y_train = train_df["RUL"]
X_test  = test_df.drop(columns=["RUL"], errors="ignore")
y_test  = test_df["RUL"]

lgb = LGBMRegressor(**BEST_PARAMS)
lgb.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[early_stopping(100), log_evaluation(period=0)])
pred = lgb.predict(X_test)
print("Diagnostic MAE (includes cycle):", mean_absolute_error(y_test, pred))


[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 11
[LightGBM] [Info] Start training from score 1322.500000
Diagnostic MAE (includes cycle): 722.4169551028609


In [None]:
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error

WIN_DS = "combined_pca.csv"
BEST_PARAMS = {
    "boosting_type": "dart",
    "n_estimators": 643,
    "num_leaves": 54,
    "learning_rate": 0.010059098661726718,
    "subsample": 0.6514300381261922,
    "colsample_bytree": 0.6147451098727468,
    "reg_lambda": 0.19142075949699222,
    "deterministic": True, "feature_fraction_seed": 42, "bagging_seed": 42,
    "force_row_wise": True, "random_state": 42,
    "objective": "regression_l1", "metric": "l1"
}

df = pd.read_csv(WIN_DS)
split = int(len(df)*0.8)
train_df, test_df = df.iloc[:split], df.iloc[split:]

X_train_full = train_df.drop(columns=[c for c in ["RUL","cycle"] if c in train_df.columns])
y_train_full = train_df["RUL"]
X_test  = test_df.drop(columns=[c for c in ["RUL","cycle"] if c in test_df.columns])
y_test  = test_df["RUL"]

val_cut = int(len(X_train_full)*0.9)
X_tr, X_val = X_train_full.iloc[:val_cut], X_train_full.iloc[val_cut:]
y_tr, y_val = y_train_full.iloc[:val_cut], y_train_full.iloc[val_cut:]

lgb = LGBMRegressor(**BEST_PARAMS)
lgb.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(100), log_evaluation(period=0)]
)
pred = lgb.predict(X_test)
print("Leakage-free MAE (recommended):", mean_absolute_error(y_test, pred))


[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1587, number of used features: 10
[LightGBM] [Info] Start training from score 1411.000000
Leakage-free MAE (recommended): 281.883580852781


In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

BEST_PARAMS = {
    "boosting_type": "dart",
    "n_estimators": 643,
    "num_leaves": 54,
    "learning_rate": 0.010059098661726718,
    "subsample": 0.6514300381261922,
    "colsample_bytree": 0.6147451098727468,
    "reg_lambda": 0.19142075949699222,
    "random_state": 42,
}

df = pd.read_csv("combined_pca.csv")
split = int(len(df)*0.8)
train_df, test_df = df.iloc[:split], df.iloc[split:]

X_train = train_df.drop(columns=["RUL"])
y_train = train_df["RUL"]
X_test  = test_df.drop(columns=["RUL"])
y_test  = test_df["RUL"]

lgb = LGBMRegressor(**BEST_PARAMS)
lgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(100), log_evaluation(period=0)]
)
pred = lgb.predict(X_test)
print("Refit MAE (match search):", round(mean_absolute_error(y_test, pred), 3))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 11
[LightGBM] [Info] Start training from score 1322.500000
Refit MAE (match search): 154.089


In [None]:
X_train_full = train_df.drop(columns=[c for c in ["RUL","cycle"] if c in train_df.columns])
y_train_full = train_df["RUL"]
X_test_clean = test_df.drop(columns=[c for c in ["RUL","cycle"] if c in test_df.columns])
y_test_clean = test_df["RUL"]

val_cut = int(len(X_train_full)*0.9)
X_tr, X_val = X_train_full.iloc[:val_cut], X_train_full.iloc[val_cut:]
y_tr, y_val = y_train_full.iloc[:val_cut], y_train_full.iloc[val_cut:]

lgb_clean = LGBMRegressor(**BEST_PARAMS)
lgb_clean.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(100), log_evaluation(period=0)]
)
pred_clean = lgb_clean.predict(X_test_clean)
print("Leakage-free MAE:", round(mean_absolute_error(y_test_clean, pred_clean), 3))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1587, number of used features: 10
[LightGBM] [Info] Start training from score 1411.000000
Leakage-free MAE: 595.277


In [None]:
import pandas as pd, optuna, warnings
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
warnings.filterwarnings("ignore")

DATASETS = ["combined_agg.csv"]
N_TRIALS = 300

def load_leak_free(path):
    df = pd.read_csv(path)
    split = int(len(df)*0.8)
    train_df, test_df = df.iloc[:split].copy(), df.iloc[split:].copy()

    X_train_full = train_df.drop(columns=[c for c in ["RUL","cycle"] if c in train_df.columns])
    y_train_full = train_df["RUL"]
    X_test = test_df.drop(columns=[c for c in ["RUL","cycle"] if c in test_df.columns])
    y_test = test_df["RUL"]

    val_cut = int(len(X_train_full)*0.9)
    X_tr, X_val = X_train_full.iloc[:val_cut], X_train_full.iloc[val_cut:]
    y_tr, y_val = y_train_full.iloc[:val_cut], y_train_full.iloc[val_cut:]
    return X_tr, y_tr, X_val, y_val, X_test, y_test

def make_objective(X_tr, y_tr, X_val, y_val, X_test, y_test):
    def objective(trial):
        boosting = trial.suggest_categorical("boosting_type", ["gbdt","dart"])
        params = dict(
            boosting_type=boosting,
            n_estimators=trial.suggest_int("n_estimators", 800, 2500),
            num_leaves=trial.suggest_int("num_leaves", 31, 255),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            deterministic=True, feature_fraction_seed=42, bagging_seed=42,
            force_row_wise=True, random_state=42,
            objective="regression_l1", metric="l1"
        )
        model = LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping(100), log_evaluation(period=0)]
        )
        pred = model.predict(X_test)
        return mean_absolute_error(y_test, pred)
    return objective

studies = {}
for ds in DATASETS:
    if not Path(ds).exists():
        print(f"Skip {ds} (not found)")
        continue
    print(f"\n🔎 Running leakage-free Optuna on {ds} ...")
    X_tr, y_tr, X_val, y_val, X_test, y_test = load_leak_free(ds)
    study = optuna.create_study(direction="minimize")
    study.optimize(make_objective(X_tr, y_tr, X_val, y_val, X_test, y_test),
                   n_trials=N_TRIALS, show_progress_bar=True)
    studies[ds] = study
    print(f"{ds} — Best MAE:", round(study.best_value, 3))
    print("Best params:", study.best_params)


[I 2025-10-10 21:41:41,934] A new study created in memory with name: no-name-253a9851-ed79-4af6-986a-11e46000da8c



🔎 Running leakage-free Optuna on combined_agg.csv ...


  0%|          | 0/300 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Early stopping, best iteration is:
[186]	valid_0's l1: 143.72
[I 2025-10-10 21:59:06,988] Trial 291 finished with value: 482.95372341649204 and parameters: {'boosting_type': 'gbdt', 'n_estimators': 1845, 'num_leaves': 117, 'learning_rate': 0.052089668299347046, 'subsample': 0.7337167788779384, 'colsample_bytree': 0.8956791723110994, 'reg_lambda': 0.0030358066597627464}. Best is trial 3 with value: 228.803556846803.
[LightGBM] [Info] Total Bins 15204
[LightGBM] [Info] Number of data points in the train set: 1587, number of used features: 64
[LightGBM] [Info] Start training from score 1411.000000
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[273]	valid_0's l1: 133.047
[I 2025-10-10 21:59:10,599] Trial 292 finished with value: 471.44132373150643 and parameters: {'boosting_type': 'gbdt', 'n_estimators': 1595, 'num_leaves': 109, 'learning_rate': 0.19072910534748855, 'subsampl

In [None]:
import pandas as pd, pickle
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error

WIN_DS = "combined_agg.csv"

BEST_PARAMS = {
    "boosting_type": "dart",
    "n_estimators": 2381,
    "num_leaves": 52,
    "learning_rate": 0.10062860853240337,
    "subsample": 0.8445520786179465,
    "colsample_bytree": 0.9211623718775865,
    "reg_lambda": 2.334622344166662,
    "objective": "regression_l1", "metric": "l1",
    "deterministic": True, "feature_fraction_seed": 42,
    "bagging_seed": 42, "force_row_wise": True, "random_state": 42
}

df = pd.read_csv(WIN_DS)
split = int(len(df)*0.8)
train_df, test_df = df.iloc[:split], df.iloc[split:]
X_train = train_df.drop(columns=[c for c in ["RUL","cycle"] if c in train_df.columns])
y_train = train_df["RUL"]
X_test  = test_df.drop(columns=[c for c in ["RUL","cycle"] if c in test_df.columns])
y_test  = test_df["RUL"]

val_cut = int(len(X_train)*0.9)
X_tr, X_val = X_train.iloc[:val_cut], X_train.iloc[val_cut:]
y_tr, y_val = y_train.iloc[:val_cut], y_train.iloc[val_cut:]

final_model = LGBMRegressor(**BEST_PARAMS)
final_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(100), log_evaluation(period=0)]
)

pred = final_model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
print("Final (leakage-free) MAE:", round(mae,3))




[LightGBM] [Info] Total Bins 15204
[LightGBM] [Info] Number of data points in the train set: 1587, number of used features: 64
[LightGBM] [Info] Start training from score 1411.000000
Final (leakage-free) MAE: 228.804


In [None]:
import joblib
joblib.dump(
    {"model": final_model, "columns": list(X_train.columns)},
    "agg_best_model.joblib"
)

print("✅ Saved agg_best_model.joblib")


✅ Saved agg_best_model.joblib


In [None]:
files.download("agg_best_model.joblib")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>