In [None]:
# train_models.py (or inside your Jupyter Notebook)
import os
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# -----------------------------
# 1. Load dataset
# -----------------------------
# Read the precomputed feature table (note: the path is machine-specific).
df = pd.read_csv(r"D:/MBP_PREDICTOR/data/features_with_embeddings.csv")
print("Dataset shape:", df.shape)

# Target vector first, then the feature matrix: every column except the
# protein identifier and the target itself.
y = df["Label"]
X = df.drop(["Protein_ID", "Label"], axis="columns")

# -----------------------------
# 2. Split train/test
# -----------------------------
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# 3. Scale features
# -----------------------------
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so the test rows never influence it.
# NOTE(review): the scaler is fitted before any cross-validation that is
# run on X_train, so CV folds share scaling statistics — confirm this is
# acceptable for the reported CV scores.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------
# 4. Define models + hyperparameters
# -----------------------------
# Each entry pairs an unfitted estimator ("model") with the grid of
# hyperparameter values ("params") to search for it.
#
# Every estimator is seeded with the same seed as the train/test split so
# that repeated runs produce the same tuned models, scores and winner.
param_grids = {
    "logreg": {
        # random_state matters for the stochastic "saga" solver in the grid.
        "model": LogisticRegression(max_iter=500, random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l2"],
            "solver": ["lbfgs", "saga"],
        },
    },
    "rf": {
        # Seeds the bootstrap sampling and per-split feature sampling.
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200, 500],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
        },
    },
    "xgb": {
        # Seeded because the grid includes subsample < 1.0 (row sampling).
        # NOTE(review): use_label_encoder is deprecated in recent XGBoost
        # releases — confirm against the installed version before removing.
        "model": XGBClassifier(
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
        ),
        "params": {
            "n_estimators": [100, 200, 500],
            "max_depth": [3, 6, 10],
            "learning_rate": [0.01, 0.1, 0.2],
            "subsample": [0.8, 1.0],
        },
    },
}

# -----------------------------
# 5. Train & Hyperparameter Tuning
# -----------------------------
# For every model family: grid-search its hyperparameters with 3-fold
# cross-validation on the training set (optimising F1), then score the
# best estimator once on the held-out test set.
best_models = {}
for name, cfg in param_grids.items():
    print(f"\n🔍 Tuning {name}...")
    grid = GridSearchCV(cfg["model"], cfg["params"], cv=3, scoring="f1", n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)

    print(f"Best params for {name}: {grid.best_params_}")

    # With the default refit=True, predict() uses the best estimator
    # retrained on the whole training set.
    y_pred = grid.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))

    best_models[name] = {
        "model": grid.best_estimator_,
        "f1": f1,  # held-out test F1 — reported, not used for selection
        "cv_f1": grid.best_score_,  # mean cross-validated F1 — drives selection
    }

# -----------------------------
# 6. Select best model
# -----------------------------
# Choose the winner by cross-validated F1 rather than test F1: picking the
# model with the highest test score would let the test set take part in
# model selection and make the reported test F1 optimistically biased.
best_model_name = max(best_models, key=lambda k: best_models[k]["cv_f1"])
best_model = best_models[best_model_name]["model"]
print(f"\n✅ Best model: {best_model_name} with F1 = {best_models[best_model_name]['f1']:.4f}")
print(f"Selected by cross-validated F1 = {best_models[best_model_name]['cv_f1']:.4f}")

# -----------------------------
# 7. Save model + scaler
# -----------------------------
# Persist the selected estimator together with the fitted scaler; both
# are written under ./models, which is created if it does not exist.
os.makedirs("models", exist_ok=True)
for artifact, target in (
    (best_model, f"models/{best_model_name}_best.pkl"),
    (scaler, "models/scaler.pkl"),
):
    joblib.dump(artifact, target)

print("\nModels and scaler saved successfully!")


Dataset shape: (1640, 1050)

🔍 Tuning logreg...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params for logreg: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
              precision    recall  f1-score   support

           0       0.44      0.37      0.40       158
           1       0.49      0.55      0.52       170

    accuracy                           0.47       328
   macro avg       0.46      0.46      0.46       328
weighted avg       0.46      0.47      0.46       328


🔍 Tuning rf...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best params for rf: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.53      0.44      0.48       158
           1       0.55      0.64      0.59       170

    accuracy                           0.54       328
   macro avg       0.54      0.54      0.54       328
weighted avg       0.54      0.54    