In [None]:

#Imports
# =========================
import json
import time
import numpy as np
import joblib
import optuna
import mlflow
from pathlib import Path
import sqlite3
import pandas as pd

REPO_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_DIR = REPO_DIR / "data"
DATA_DIR.mkdir(exist_ok=True)

DB_PATH = DATA_DIR / "classification.db"

print("‚úÖ DB_PATH:", DB_PATH)




from pathlib import Path
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# =========================
#Paths
# =========================
REPO_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_DIR = REPO_DIR / "data"
MODELS_DIR = REPO_DIR / "models"
METRICS_DIR = REPO_DIR / "metrics"

MODELS_DIR.mkdir(exist_ok=True)
METRICS_DIR.mkdir(exist_ok=True)


# =========================
#Preprocessing
# =========================
preprocess = Pipeline([
    ("scaler", StandardScaler())
])


# =========================
#Model factory
# =========================
def make_model(model_name: str):
    name = model_name.lower().strip()

    if name in ["ridge", "logreg", "logistic"]:
        return LogisticRegression(
            max_iter=2000,
            penalty="l2",
            solver="lbfgs"
        )

    if name in ["histgradientboosting", "hgb"]:
        return HistGradientBoostingClassifier(random_state=42)

    if name in ["xgboost", "xgb"]:
        return XGBClassifier(
            random_state=42,
            eval_metric="logloss",
            use_label_encoder=False
        )

    if name in ["lightgbm", "lgbm"]:
        return LGBMClassifier(random_state=42)

    raise ValueError(f"Unknown model_name: {model_name}")


# =========================
#Pipeline builder
# =========================
def build_pipeline(model_name: str, use_pca: bool, tuned_params=None):
    model = make_model(model_name)

    if tuned_params:
        model.set_params(**tuned_params)

    steps = [("preprocess", preprocess)]

    if use_pca:
        steps.append(("pca", PCA(n_components=0.95, random_state=42)))

    steps.append(("model", model))

    return Pipeline(steps)


# =========================
#Hyperparameter suggestions
# =========================
def suggest_params(trial, model_name):
    name = model_name.lower()

    if name in ["ridge", "logreg"]:
        return {
            "model__C": trial.suggest_float("C", 0.01, 10.0, log=True)
        }

    if name in ["xgboost", "xgb"]:
        return {
            "model__n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "model__max_depth": trial.suggest_int("max_depth", 3, 8)
        }

    if name in ["lightgbm", "lgbm"]:
        return {
            "model__n_estimators": trial.suggest_int("n_estimators", 100, 500)
        }

    if name in ["histgradientboosting", "hgb"]:
        return {
            "model__max_depth": trial.suggest_int("max_depth", 3, 10)
        }

    return {}


# =========================
#Single experiment runner
# =========================
def run_one_experiment(model_name, use_pca, use_optuna, n_trials=20):
    run_name = f"{model_name}__{'pca' if use_pca else 'no_pca'}__{'optuna' if use_optuna else 'no_optuna'}"
    print("\n" + "="*80)
    print("RUN:", run_name)
    print("="*80)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    best_params = None

    #Optuna 
    if use_optuna:
        def objective(trial):
            params = suggest_params(trial, model_name)
            pipe = build_pipeline(model_name, use_pca, params)
            scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1)
            return float(np.mean(scores))

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        best_params = study.best_params
        best_cv_f1 = float(study.best_value)

    else:
        pipe_tmp = build_pipeline(model_name, use_pca, None)
        scores = cross_val_score(pipe_tmp, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1)
        best_cv_f1 = float(np.mean(scores))

    #Train final
    pipe = build_pipeline(model_name, use_pca, best_params)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    test_f1 = float(f1_score(y_test, y_pred))

    print("cv_f1 :", round(best_cv_f1, 4))
    print("test_f1:", round(test_f1, 4))

    model_path = MODELS_DIR / f"{run_name}.joblib"
    metrics_path = METRICS_DIR / f"{run_name}.json"

    joblib.dump(pipe, model_path)

    payload = {
        "run_name": run_name,
        "model_family": model_name,
        "uses_pca": use_pca,
        "uses_optuna": use_optuna,
        "cv_f1": best_cv_f1,
        "test_f1": test_f1,
        "best_params": best_params,
        "model_path": str(model_path),
        "metrics_path": str(metrics_path),
    }

    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    #Log to MLflow (Dagshub)
    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("model_family", model_name)
        mlflow.log_param("uses_pca", use_pca)
        mlflow.log_param("uses_optuna", use_optuna)

        if best_params:
            mlflow.log_params(best_params)

        mlflow.log_metric("cv_f1", best_cv_f1)
        mlflow.log_metric("test_f1", test_f1)

        mlflow.log_artifact(str(model_path), artifact_path="models")
        mlflow.log_artifact(str(metrics_path), artifact_path="metrics")

    return payload


‚úÖ DB_PATH: c:\Users\preon\OneDrive\Desktop\final project\sql-to-ml-pipeline\data\classification.db


In [12]:
# =============================================================================
# FULL PIPELINE:
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test MAE
# - Save, load, and compare the global best model
# =============================================================================


conn = sqlite3.connect(str(DB_PATH))

df = pd.read_sql("""
    SELECT f.*, l.label
    FROM features f
    JOIN labels l ON f.row_id = l.row_id
""", conn)

conn.close()

print("‚úÖ Joined df:", df.shape)
print(df["label"].value_counts())
df.head()



‚úÖ Joined df: (569, 32)
label
1    357
0    212
Name: count, dtype: int64


Unnamed: 0,row_id,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,2,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,3,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,4,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,5,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [13]:
TARGET_COL = "label"
ID_COL = "row_id"

X = df.drop(columns=[TARGET_COL, ID_COL])
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

print("‚úÖ Split done:")
print("Train:", X_train.shape, "Test:", X_test.shape)
print("Train label dist:\n", y_train.value_counts(normalize=True))
print("Test  label dist:\n", y_test.value_counts(normalize=True))


‚úÖ Split done:
Train: (455, 30) Test: (114, 30)
Train label dist:
 label
1    0.626374
0    0.373626
Name: proportion, dtype: float64
Test  label dist:
 label
1    0.631579
0    0.368421
Name: proportion, dtype: float64


In [14]:
preprocess = Pipeline(steps=[
    ("scaler", StandardScaler())
])

def make_model(model_name: str):
    name = model_name.lower().strip()

    if name in ["logreg", "logistic"]:
        return LogisticRegression(
            max_iter=2000,
            solver="lbfgs"
        )

    if name in ["histgradientboosting", "hgb"]:
        return HistGradientBoostingClassifier(random_state=42)

    if name in ["xgboost", "xgb"]:
        return XGBClassifier(
            random_state=42,
            eval_metric="logloss",
            use_label_encoder=False
        )

    if name in ["lightgbm", "lgbm"]:
        return LGBMClassifier(random_state=42)

    raise ValueError(f"Unknown model_name: {model_name}")


In [15]:
def suggest_params(trial, model_name):
    name = model_name.lower()

    if name in ["logreg", "logistic"]:
        return {
            "model__C": trial.suggest_float("C", 0.01, 10.0, log=True)
        }

    if name in ["xgboost", "xgb"]:
        return {
            "model__n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "model__max_depth": trial.suggest_int("max_depth", 3, 8)
        }

    if name in ["lightgbm", "lgbm"]:
        return {
            "model__n_estimators": trial.suggest_int("n_estimators", 100, 500)
        }

    if name in ["histgradientboosting", "hgb"]:
        return {
            "model__max_depth": trial.suggest_int("max_depth", 3, 10)
        }

    return {}



In [None]:
# =========================
# Imports (safe to repeat)
# =========================
from pathlib import Path
import json
import numpy as np

import joblib
import optuna
import mlflow
import dagshub

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score


# =========================
# Paths
# =========================
REPO_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
MODELS_DIR = REPO_DIR / "models"
METRICS_DIR = REPO_DIR / "metrics"
MODELS_DIR.mkdir(exist_ok=True)
METRICS_DIR.mkdir(exist_ok=True)


# =========================
# Dagshub MLflow init
# =========================
dagshub.init(repo_owner="RamishaPrionti", repo_name="sql-to-ml-pipeline", mlflow=True)
mlflow.set_experiment("sql_to_ml_pipeline")


# =========================
# REQUIRED: preprocess + model factory

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def make_model(model_name: str):
    name = model_name.lower()
    if name in ["logreg", "logistic", "logistic_regression"]:
        return LogisticRegression(max_iter=2000)
    if name in ["histgradientboosting", "hgb", "hist_gb"]:
        return HistGradientBoostingClassifier(random_state=42)
    if name in ["xgboost", "xgb"]:
        return XGBClassifier(
            random_state=42,
            eval_metric="logloss",
            use_label_encoder=False
        )
    if name in ["lightgbm", "lgbm"]:
        return LGBMClassifier(random_state=42)
    raise ValueError(f"Unknown model_name: {model_name}")


def suggest_params(trial, model_name: str):
    name = model_name.lower()

    if name in ["logreg", "logistic", "logistic_regression"]:
        return {
            "C": trial.suggest_float("C", 1e-3, 10.0, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }

    if name in ["histgradientboosting", "hgb", "hist_gb"]:
        return {
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_iter": trial.suggest_int("max_iter", 100, 400),
        }

    if name in ["xgboost", "xgb"]:
        return {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", 2, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }

    if name in ["lightgbm", "lgbm"]:
        return {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", -1, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        }

    raise ValueError(f"No suggest_params configured for: {model_name}")



def build_pipeline(model_name: str, use_pca: bool, tuned_params: dict | None):
    est = make_model(model_name)
    if tuned_params:
        est.set_params(**tuned_params)

    steps = [("preprocess", preprocess)]
    if use_pca:
        steps.append(("pca", PCA(n_components=0.95, random_state=42)))
    steps.append(("model", est))
    return Pipeline(steps)


def run_one_experiment(model_name: str, use_pca: bool, use_optuna: bool, n_trials: int = 20):
    run_name = f"{model_name}__{'pca' if use_pca else 'no_pca'}__{'optuna' if use_optuna else 'no_optuna'}"
    print("\n" + "=" * 80)
    print("RUN:", run_name)
    print("=" * 80)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    best_params = None

    #Optuna tuning-
    if use_optuna:
        def objective(trial):
            params = suggest_params(trial, model_name)
            pipe = build_pipeline(model_name, use_pca, params)
            scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1)
            return float(np.mean(scores))

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        best_params = study.best_params
        best_cv_f1 = float(study.best_value)

    #No optuna:
    else:
        pipe_tmp = build_pipeline(model_name, use_pca, tuned_params=None)
        scores = cross_val_score(pipe_tmp, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1)
        best_cv_f1 = float(np.mean(scores))

   
    pipe = build_pipeline(model_name, use_pca, best_params)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    test_f1 = float(f1_score(y_test, y_pred))

    print("cv_f1 :", round(best_cv_f1, 4))
    print("test_f1:", round(test_f1, 4))
    if best_params:
        print("best_params:", best_params)

    #Save locally 
    model_path = MODELS_DIR / f"{run_name}.joblib"
    metrics_path = METRICS_DIR / f"{run_name}.json"
    joblib.dump(pipe, model_path)

    payload = {
        "run_name": run_name,
        "model_family": model_name,
        "uses_pca": use_pca,
        "uses_optuna": use_optuna,
        "cv_f1": best_cv_f1,
        "test_f1": test_f1,
        "best_params": best_params,
        "model_path": str(model_path),
        "metrics_path": str(metrics_path),
    }

    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    
    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("model_family", model_name)
        mlflow.log_param("uses_pca", use_pca)
        mlflow.log_param("uses_optuna", use_optuna)
        if best_params:
            mlflow.log_params(best_params)

        mlflow.log_metric("cv_f1", best_cv_f1)
        mlflow.log_metric("test_f1", test_f1)

        mlflow.log_artifact(str(model_path), artifact_path="models")
        mlflow.log_artifact(str(metrics_path), artifact_path="metrics")

    return payload 


def run_all_16():
    model_list = ["logreg", "histgradientboosting", "xgboost", "lightgbm"]
    all_payloads = []
    for model_name in model_list:
        for use_pca in [False, True]:
            for use_optuna in [False, True]:
                all_payloads.append(run_one_experiment(model_name, use_pca, use_optuna, n_trials=20))
    return all_payloads

    

In [17]:
MODEL_NAMES = ["logreg", "hgb", "xgb", "lgbm"]
all_runs = []
start = time.time()

for model_name in MODEL_NAMES:
    all_runs.append(run_one_experiment(model_name, use_pca=False, use_optuna=False))
    all_runs.append(run_one_experiment(model_name, use_pca=True,  use_optuna=False))
    all_runs.append(run_one_experiment(model_name, use_pca=False, use_optuna=True, n_trials=20))
    all_runs.append(run_one_experiment(model_name, use_pca=True,  use_optuna=True, n_trials=20))

elapsed = time.time() - start
print("\n‚úÖ DONE. Total runs:", len(all_runs))
print("Elapsed minutes:", round(elapsed/60, 2))



RUN: logreg__no_pca__no_optuna
cv_f1 : 0.9808
test_f1: 0.9861
üèÉ View run logreg__no_pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/28efeb3a8b0546bfab2e579bfed7d242
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: logreg__pca__no_optuna
cv_f1 : 0.9861
test_f1: 0.979
üèÉ View run logreg__pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/f326a0a5b315437b8781be4bcd70c959
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:30:27,376] A new study created in memory with name: no-name-04bad6be-7179-441e-865e-ff9651289441



RUN: logreg__no_pca__optuna


[I 2025-12-18 12:30:29,162] Trial 0 finished with value: 0.9790558722655804 and parameters: {'C': 1.7479665886752973, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.9790558722655804.
[I 2025-12-18 12:30:30,832] Trial 1 finished with value: 0.9665244700662009 and parameters: {'C': 0.0034499339772257496, 'solver': 'liblinear'}. Best is trial 0 with value: 0.9790558722655804.
[I 2025-12-18 12:30:32,636] Trial 2 finished with value: 0.9755471003357559 and parameters: {'C': 3.813275454197846, 'solver': 'liblinear'}. Best is trial 0 with value: 0.9790558722655804.
[I 2025-12-18 12:30:34,399] Trial 3 finished with value: 0.9794508439114478 and parameters: {'C': 0.0394443224261552, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.9794508439114478.
[I 2025-12-18 12:30:36,087] Trial 4 finished with value: 0.9755471003357559 and parameters: {'C': 2.394260443576521, 'solver': 'liblinear'}. Best is trial 3 with value: 0.9794508439114478.
[I 2025-12-18 12:30:36,112] Trial 5 finished with value: 

cv_f1 : 0.9827
test_f1: 0.9793
best_params: {'C': 0.12822475443271025, 'solver': 'lbfgs'}
üèÉ View run logreg__no_pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/aa7d15d64cb944f1ba764c095f79c4d5
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:30:39,561] A new study created in memory with name: no-name-a8036137-338c-4919-9ce2-dc1aa82144af
[I 2025-12-18 12:30:39,597] Trial 0 finished with value: 0.9598377059516956 and parameters: {'C': 0.001198324109281906, 'solver': 'liblinear'}. Best is trial 0 with value: 0.9598377059516956.
[I 2025-12-18 12:30:39,632] Trial 1 finished with value: 0.9665244700662009 and parameters: {'C': 0.002527557239748438, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9665244700662009.
[I 2025-12-18 12:30:39,668] Trial 2 finished with value: 0.9826559435778414 and parameters: {'C': 2.1780447644018492, 'solver': 'liblinear'}. Best is trial 2 with value: 0.9826559435778414.
[I 2025-12-18 12:30:39,703] Trial 3 finished with value: 0.9827463684844776 and parameters: {'C': 0.06649813172177312, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.9827463684844776.
[I 2025-12-18 12:30:39,739] Trial 4 finished with value: 0.986146344974002 and parameters: {'C': 0.5626873815011803, 'solver'


RUN: logreg__pca__optuna


[I 2025-12-18 12:30:39,775] Trial 5 finished with value: 0.9826559435778414 and parameters: {'C': 2.2630995839454204, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.986146344974002.
[I 2025-12-18 12:30:39,811] Trial 6 finished with value: 0.9827463684844776 and parameters: {'C': 0.07843156347234295, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.986146344974002.
[I 2025-12-18 12:30:39,825] Trial 7 finished with value: 0.9598377059516956 and parameters: {'C': 0.0011863525933701014, 'solver': 'liblinear'}. Best is trial 4 with value: 0.986146344974002.
[I 2025-12-18 12:30:39,840] Trial 8 finished with value: 0.9562950808464089 and parameters: {'C': 0.006494709822326531, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.986146344974002.
[I 2025-12-18 12:30:39,854] Trial 9 finished with value: 0.986146344974002 and parameters: {'C': 0.49765268989001354, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.986146344974002.
[I 2025-12-18 12:30:39,880] Trial 10 finished with value: 0.98614

cv_f1 : 0.9861
test_f1: 0.9861
best_params: {'C': 0.5626873815011803, 'solver': 'lbfgs'}
üèÉ View run logreg__pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/aa245060b9984e7585d66045b93cd6f0
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: hgb__no_pca__no_optuna
cv_f1 : 0.9757
test_f1: 0.9796
üèÉ View run hgb__no_pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/f96b5c386dee4fc2966c3cb2821f7b92
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: hgb__pca__no_optuna
cv_f1 : 0.9703
test_f1: 0.9726
üèÉ View run hgb__pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/b6b91fdc123c4669a44314a0c4c257b3
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:31:13,362] A new study created in memory with name: no-name-72fe842a-6d00-497e-8e9e-69f06f99bb8f



RUN: hgb__no_pca__optuna


[I 2025-12-18 12:31:13,586] Trial 0 finished with value: 0.9722222222222223 and parameters: {'max_depth': 5, 'learning_rate': 0.025233258312933292, 'max_iter': 233}. Best is trial 0 with value: 0.9722222222222223.
[I 2025-12-18 12:31:13,808] Trial 1 finished with value: 0.9773669284467714 and parameters: {'max_depth': 10, 'learning_rate': 0.089889237870699, 'max_iter': 304}. Best is trial 1 with value: 0.9773669284467714.
[I 2025-12-18 12:31:13,967] Trial 2 finished with value: 0.9756944444444445 and parameters: {'max_depth': 6, 'learning_rate': 0.17187427908158276, 'max_iter': 265}. Best is trial 1 with value: 0.9773669284467714.
[I 2025-12-18 12:31:14,127] Trial 3 finished with value: 0.9739492437463642 and parameters: {'max_depth': 8, 'learning_rate': 0.04661460903686876, 'max_iter': 110}. Best is trial 1 with value: 0.9773669284467714.
[I 2025-12-18 12:31:14,338] Trial 4 finished with value: 0.96875 and parameters: {'max_depth': 5, 'learning_rate': 0.024518774749504025, 'max_iter':

cv_f1 : 0.9775
test_f1: 0.966
best_params: {'max_depth': 9, 'learning_rate': 0.19238671419468334, 'max_iter': 142}
üèÉ View run hgb__no_pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/be71c54027c44b67a4c34f0db6b9e2f6
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:31:25,365] A new study created in memory with name: no-name-28abe956-045a-4b48-94ff-fc3a22197386
[I 2025-12-18 12:31:25,442] Trial 0 finished with value: 0.9440773920617146 and parameters: {'max_depth': 8, 'learning_rate': 0.014436443989050446, 'max_iter': 115}. Best is trial 0 with value: 0.9440773920617146.



RUN: hgb__pca__optuna


[I 2025-12-18 12:31:25,571] Trial 1 finished with value: 0.9685672514619883 and parameters: {'max_depth': 6, 'learning_rate': 0.041846122157524986, 'max_iter': 257}. Best is trial 1 with value: 0.9685672514619883.
[I 2025-12-18 12:31:25,648] Trial 2 finished with value: 0.966909057437408 and parameters: {'max_depth': 6, 'learning_rate': 0.2739420508491107, 'max_iter': 266}. Best is trial 1 with value: 0.9685672514619883.
[I 2025-12-18 12:31:25,725] Trial 3 finished with value: 0.9651087421008797 and parameters: {'max_depth': 9, 'learning_rate': 0.2145870578457951, 'max_iter': 228}. Best is trial 1 with value: 0.9685672514619883.
[I 2025-12-18 12:31:25,781] Trial 4 finished with value: 0.9634398766778777 and parameters: {'max_depth': 2, 'learning_rate': 0.03623113414786745, 'max_iter': 184}. Best is trial 1 with value: 0.9685672514619883.
[I 2025-12-18 12:31:25,878] Trial 5 finished with value: 0.9509134480568301 and parameters: {'max_depth': 3, 'learning_rate': 0.01138586325258387, 'ma

cv_f1 : 0.9703
test_f1: 0.9722
best_params: {'max_depth': 4, 'learning_rate': 0.07418190862950527, 'max_iter': 285}
üèÉ View run hgb__pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/60e5cd9391ae4e3cb3f41b2fe3e28683
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: xgb__no_pca__no_optuna


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


cv_f1 : 0.972
test_f1: 0.966
üèÉ View run xgb__no_pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/75ec9023d2f44c72bb9955c7836ac4ac
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: xgb__pca__no_optuna


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


cv_f1 : 0.9581
test_f1: 0.9583
üèÉ View run xgb__pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/ed516ab561f94348bfd5f7dfde8259ea
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:31:59,442] A new study created in memory with name: no-name-58df846a-221f-4ee7-a724-214ee208a382



RUN: xgb__no_pca__optuna


[I 2025-12-18 12:32:00,163] Trial 0 finished with value: 0.9737665863320313 and parameters: {'n_estimators': 389, 'max_depth': 5, 'learning_rate': 0.08946172648484138, 'subsample': 0.9803170072110121, 'colsample_bytree': 0.6985202156915373}. Best is trial 0 with value: 0.9737665863320313.
[I 2025-12-18 12:32:00,600] Trial 1 finished with value: 0.9753829016986911 and parameters: {'n_estimators': 106, 'max_depth': 6, 'learning_rate': 0.09068251864886465, 'subsample': 0.6089919198760088, 'colsample_bytree': 0.7787887593784977}. Best is trial 1 with value: 0.9753829016986911.
[I 2025-12-18 12:32:01,058] Trial 2 finished with value: 0.9755116959064326 and parameters: {'n_estimators': 327, 'max_depth': 2, 'learning_rate': 0.028448822504454303, 'subsample': 0.9547566055339936, 'colsample_bytree': 0.6212421804729961}. Best is trial 2 with value: 0.9755116959064326.
[I 2025-12-18 12:32:01,530] Trial 3 finished with value: 0.973729462545252 and parameters: {'n_estimators': 243, 'max_depth': 4, 

cv_f1 : 0.9861
test_f1: 0.966
best_params: {'n_estimators': 197, 'max_depth': 3, 'learning_rate': 0.17767326614393697, 'subsample': 0.6564908050297785, 'colsample_bytree': 0.904573744078116}
üèÉ View run xgb__no_pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/7cba0763cccc421da4623076ae816611
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:32:11,364] A new study created in memory with name: no-name-0e99a870-665c-4d0e-aec9-e3b588c6195d
[I 2025-12-18 12:32:11,477] Trial 0 finished with value: 0.9474018172777062 and parameters: {'n_estimators': 105, 'max_depth': 8, 'learning_rate': 0.01052219845864629, 'subsample': 0.6962206835554969, 'colsample_bytree': 0.6027872264354704}. Best is trial 0 with value: 0.9474018172777062.



RUN: xgb__pca__optuna


[I 2025-12-18 12:32:11,673] Trial 1 finished with value: 0.9667646365509626 and parameters: {'n_estimators': 358, 'max_depth': 7, 'learning_rate': 0.02141372392940984, 'subsample': 0.7325119181819089, 'colsample_bytree': 0.603297229312715}. Best is trial 1 with value: 0.9667646365509626.
[I 2025-12-18 12:32:11,777] Trial 2 finished with value: 0.9631613720158257 and parameters: {'n_estimators': 113, 'max_depth': 5, 'learning_rate': 0.061740789098894376, 'subsample': 0.73922358642428, 'colsample_bytree': 0.943294157278555}. Best is trial 1 with value: 0.9667646365509626.
[I 2025-12-18 12:32:11,877] Trial 3 finished with value: 0.9581747847538201 and parameters: {'n_estimators': 210, 'max_depth': 8, 'learning_rate': 0.12217737610432076, 'subsample': 0.8018438917993561, 'colsample_bytree': 0.7750915896420594}. Best is trial 1 with value: 0.9667646365509626.
[I 2025-12-18 12:32:11,995] Trial 4 finished with value: 0.9648883935399676 and parameters: {'n_estimators': 294, 'max_depth': 3, 'le

cv_f1 : 0.9668
test_f1: 0.9655
best_params: {'n_estimators': 358, 'max_depth': 7, 'learning_rate': 0.02141372392940984, 'subsample': 0.7325119181819089, 'colsample_bytree': 0.603297229312715}
üèÉ View run xgb__pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/410865e7982b49ba9b3b1a17d5f724ee
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: lgbm__no_pca__no_optuna
[LightGBM] [Info] Number of positive: 285, number of negative: 170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000469 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4545
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691
cv_f1 : 0.9757
test_f1: 0.966




üèÉ View run lgbm__no_pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/3bc4dbbfa6cf490baa72078955dd5dee
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: lgbm__pca__no_optuna
[LightGBM] [Info] Number of positive: 285, number of negative: 170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691
cv_f1 : 0.9669
test_f1: 0.9793




üèÉ View run lgbm__pca__no_optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/61fdefbc29d64148bf8f47b304559fff
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0


[I 2025-12-18 12:32:45,391] A new study created in memory with name: no-name-3f8eb5db-ecfd-4003-84ec-2dfce2bdacd1



RUN: lgbm__no_pca__optuna


[I 2025-12-18 12:32:47,241] Trial 0 finished with value: 0.9757287333453909 and parameters: {'n_estimators': 320, 'max_depth': 1, 'learning_rate': 0.019243902032649505, 'num_leaves': 115}. Best is trial 0 with value: 0.9757287333453909.
[I 2025-12-18 12:32:49,101] Trial 1 finished with value: 0.980857329842932 and parameters: {'n_estimators': 454, 'max_depth': 8, 'learning_rate': 0.2320859975356283, 'num_leaves': 22}. Best is trial 1 with value: 0.980857329842932.
[I 2025-12-18 12:32:50,876] Trial 2 finished with value: 0.9808755090168703 and parameters: {'n_estimators': 255, 'max_depth': 0, 'learning_rate': 0.2791025290956269, 'num_leaves': 65}. Best is trial 2 with value: 0.9808755090168703.
[I 2025-12-18 12:32:52,763] Trial 3 finished with value: 0.9791121291448516 and parameters: {'n_estimators': 380, 'max_depth': 6, 'learning_rate': 0.14836494551426224, 'num_leaves': 87}. Best is trial 2 with value: 0.9808755090168703.
[I 2025-12-18 12:32:54,688] Trial 4 finished with value: 0.973

[LightGBM] [Info] Number of positive: 285, number of negative: 170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4545
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691
cv_f1 : 0.9843
test_f1: 0.9726
best_params: {'n_estimators': 172, 'max_depth': 4, 'learning_rate': 0.26576135368282805, 'num_leaves': 123}


[I 2025-12-18 12:33:00,985] A new study created in memory with name: no-name-718472f9-eb9f-4a97-9e86-60d8fa27cc99


üèÉ View run lgbm__no_pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/d13f835d8b97423eb6896ce45ce651df
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

RUN: lgbm__pca__optuna


[I 2025-12-18 12:33:01,064] Trial 0 finished with value: 0.9599714008361504 and parameters: {'n_estimators': 184, 'max_depth': 2, 'learning_rate': 0.26357932650032845, 'num_leaves': 23}. Best is trial 0 with value: 0.9599714008361504.
[I 2025-12-18 12:33:01,256] Trial 1 finished with value: 0.9597071405823198 and parameters: {'n_estimators': 166, 'max_depth': 6, 'learning_rate': 0.026204883481624665, 'num_leaves': 41}. Best is trial 0 with value: 0.9599714008361504.
[I 2025-12-18 12:33:01,572] Trial 2 finished with value: 0.95453216374269 and parameters: {'n_estimators': 269, 'max_depth': 9, 'learning_rate': 0.012089509782912766, 'num_leaves': 54}. Best is trial 0 with value: 0.9599714008361504.
[I 2025-12-18 12:33:02,004] Trial 3 finished with value: 0.9653292678558637 and parameters: {'n_estimators': 327, 'max_depth': 8, 'learning_rate': 0.03918882962106877, 'num_leaves': 81}. Best is trial 3 with value: 0.9653292678558637.
[I 2025-12-18 12:33:02,155] Trial 4 finished with value: 0.9

[LightGBM] [Info] Number of positive: 285, number of negative: 170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000331 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691
cv_f1 : 0.9688
test_f1: 0.9863
best_params: {'n_estimators': 368, 'max_depth': 11, 'learning_rate': 0.03532542019021317, 'num_leaves': 30}
üèÉ View run lgbm__pca__optuna at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0/runs/8f2c5157da3e4772b370fd93211c9dfd
üß™ View experiment at: https://dagshub.com/RamishaPrionti/sql-to-ml-pipeline.mlflow/#/experiments/0

‚úÖ DONE. Total runs: 16
Elapsed minutes: 2.97


In [21]:
best = results_df.iloc[0].to_dict()

print("‚úÖ BEST RUN:", best["run_name"])
print("Best test_f1:", best["test_f1"])

best_model = joblib.load(best["model_path"])

FINAL_MODEL_PATH = MODELS_DIR / "final_model.joblib"
joblib.dump(best_model, FINAL_MODEL_PATH)

print("‚úÖ Saved FINAL model to:", FINAL_MODEL_PATH)



‚úÖ BEST RUN: lgbm__pca__optuna
Best test_f1: 0.9863013698630136
‚úÖ Saved FINAL model to: c:\Users\preon\OneDrive\Desktop\final project\sql-to-ml-pipeline\models\final_model.joblib
