# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, matthews_corrcoef
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import warnings
import joblib
from sklearn.ensemble import GradientBoostingClassifier

# Suppress every warning process-wide: the 'ignore' action is installed with
# no category, message or module restriction.
warnings.filterwarnings('ignore')

def load_data(path):
    """Read a CSV dataset and return standardized features with their labels.

    The last column of the file is treated as the target; every preceding
    column is treated as a feature.

    Missing feature values are replaced by the per-column median. Rows whose
    target is missing are dropped rather than imputed: a median-filled class
    label is not a real observation and may not even be a valid class
    (e.g. 0.5 for a 0/1 target).

    Args:
        path: Location of the CSV file, handed directly to ``pd.read_csv``.

    Returns:
        A tuple ``(X_scaled, y, scaler, feature_names)``: the standardized
        feature matrix, the target values, the fitted ``StandardScaler`` and
        the feature column labels (``df.columns[:-1]``).

    Raises:
        ValueError: If the file has fewer than two columns (no feature/target
            split is possible) or no row carries a target value.
    """
    df = pd.read_csv(path)
    if df.shape[1] < 2:
        raise ValueError(
            "Expected at least one feature column followed by a target column."
        )

    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    # Keep only rows that actually have a label; never impute the target.
    labelled = target.notna()
    if not labelled.all():
        features = features.loc[labelled]
        target = target.loc[labelled]
    if target.empty:
        raise ValueError("No rows with a target value were found.")

    if features.isnull().values.any():
        # numeric_only=True keeps a non-numeric column from making the median
        # computation itself fail on recent pandas versions.
        features = features.fillna(features.median(numeric_only=True))

    X = features.values
    y = target.values

    # NOTE(review): the scaler is fitted on every row returned here. If the
    # result is later split for cross-validation, validation-fold statistics
    # have already influenced the scaling - confirm this is acceptable.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y, scaler, df.columns[:-1]

def select_features_lasso(X, y, feature_names):
    """Keep the columns of ``X`` that a cross-validated Lasso weights non-zero.

    Args:
        X: Feature matrix whose columns are the candidate features.
        y: Target values the ``LassoCV`` model is fitted against.
        feature_names: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(X_reduced, kept)`` where ``kept`` holds the positions of
        the non-zero coefficients and ``X_reduced`` is ``X`` restricted to
        those columns.

    Raises:
        ValueError: If every Lasso coefficient is zero.
    """
    selector = LassoCV(cv=5, random_state=28, max_iter=10000).fit(X, y)

    # Positions of the coefficients the L1 penalty did not shrink to zero.
    kept = np.nonzero(selector.coef_)[0]
    if kept.size == 0:
        raise ValueError("Lasso did not select any features.")

    X_reduced = X[:, kept]
    return X_reduced, kept

def define_models():
    """Build the base learners used by the stacking ensemble.

    Returns:
        A dict mapping a short name (``'xgb'``, ``'et'``, ``'rf'``,
        ``'gbdt'``, ``'mlp'``) to a freshly constructed, unfitted classifier.
        The insertion order is the order listed above.
    """
    xgb_clf = xgb.XGBClassifier(
        learning_rate=0.017965145536083,
        max_depth=11,
        min_child_weight=1,
        gamma=0.17426232112256684,
        colsample_bytree=0.1916286359142644,
        n_estimators=625,
        seed=0,
    )

    extra_trees = ExtraTreesClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=4,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
        random_state=0,
    )

    random_forest = RandomForestClassifier(
        max_depth=80,
        bootstrap=False,
        max_features='sqrt',
        n_estimators=400,
        min_samples_split=3,
        random_state=0,
    )

    gradient_boosting = GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=300,
        max_depth=5,
        subsample=0.8,
        random_state=0,
    )

    perceptron = MLPClassifier(
        hidden_layer_sizes=(210,),
        activation='logistic',
        solver='adam',
        random_state=10,
    )

    # Order matters to callers that turn this dict into an estimator list.
    return {
        'xgb': xgb_clf,
        'et': extra_trees,
        'rf': random_forest,
        'gbdt': gradient_boosting,
        'mlp': perceptron,
    }

def run_cross_validation(X, y, n_splits=10, random_state=42):
    """Evaluate the stacking ensemble with stratified k-fold cross-validation.

    For every fold a new ``StackingClassifier`` (base learners from
    ``define_models``, logistic-regression meta learner) is fitted on the
    training part and scored on the held-out part. Per-fold scores and the
    mean and standard deviation over folds are printed.

    Sensitivity and specificity are counted against the literal labels ``1``
    (positive) and ``0`` (negative), so a binary 0/1 target is assumed.

    Args:
        X: Feature matrix, indexable by integer index arrays (``X[idx]``).
        y: Target array, indexable the same way.
        n_splits: Number of outer folds. Defaults to 10.
        random_state: Seed used for both the fold shuffling and the meta
            learner. Defaults to 42.

    Returns:
        A dict with keys ``'Sn'``, ``'Sp'``, ``'Acc'``, ``'MCC'`` and
        ``'AUC'``, each mapping to the list of per-fold values.
    """
    metrics = {k: [] for k in ['Sn', 'Sp', 'Acc', 'MCC', 'AUC']}

    # The stacking classifier receives (name, estimator) pairs.
    base_estimators = list(define_models().items())

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        # A fresh ensemble per fold so nothing fitted carries over.
        stacking_model = StackingClassifier(
            estimators=base_estimators,
            final_estimator=LogisticRegression(max_iter=1000, random_state=random_state),
            stack_method='auto',
            cv=5,
            n_jobs=-1
        )

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        stacking_model.fit(X_train, y_train)
        y_pred = stacking_model.predict(X_val)
        # Second probability column is used as the positive-class score.
        y_proba = stacking_model.predict_proba(X_val)[:, 1]

        TP = np.sum((y_val == 1) & (y_pred == 1))
        TN = np.sum((y_val == 0) & (y_pred == 0))
        FP = np.sum((y_val == 0) & (y_pred == 1))
        FN = np.sum((y_val == 1) & (y_pred == 0))

        # Fall back to 0 when a fold contains no positives / no negatives.
        fold_scores = {
            'Sn': TP / (TP + FN) if TP + FN > 0 else 0,
            'Sp': TN / (TN + FP) if TN + FP > 0 else 0,
            'Acc': accuracy_score(y_val, y_pred),
            'MCC': matthews_corrcoef(y_val, y_pred),
            'AUC': roc_auc_score(y_val, y_proba),
        }
        for name, value in fold_scores.items():
            metrics[name].append(value)

        print(
            f"Fold {fold} | Acc: {fold_scores['Acc']:.4f} | Sn: {fold_scores['Sn']:.4f} | "
            f"Sp: {fold_scores['Sp']:.4f} | MCC: {fold_scores['MCC']:.4f} | "
            f"AUC: {fold_scores['AUC']:.4f}"
        )

    print("\nResult:")
    for metric, values in metrics.items():
        print(f"{metric:<6} | {np.mean(values):.4f} ± {np.std(values):.4f}")

    return metrics

if __name__ == "__main__":
    # Path of the input CSV (features first, target in the last column).
    # It is a placeholder and must be filled in before running.
    DATA_PATH = ""
    if not DATA_PATH:
        # Fail with an actionable message instead of an opaque read error.
        raise ValueError("DATA_PATH is empty; set it to the CSV file to load.")

    X, y, scaler, feature_names = load_data(DATA_PATH)

    # Reduce the feature matrix to the Lasso-selected columns.
    X_selected, selected_indices = select_features_lasso(X, y, feature_names)
    # Not used below; kept as a module-level name for interactive inspection.
    selected_feature_names = feature_names[selected_indices]

    metrics = run_cross_validation(X_selected, y)

    # Persist the preprocessing artifacts needed to transform new data the
    # same way: the fitted scaler and the indices of the kept columns.
    joblib.dump(scaler, 'feature_scaler.pkl')
    joblib.dump(selected_indices, 'lasso_selected_indices.pkl')
