In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [3]:
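# Precondition: `stored_df` must already exist in the kernel as a pandas DataFrame.
# It is not created in the cells shown here — TODO: add the cell that loads or restores it.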

In [None]:
# Single seed shared by every stochastic component (CV shuffling and the
# estimators) so that a fresh "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42

# Features are every column except the target column 'isfaller'.
X = stored_df.drop(columns='isfaller')
y = stored_df['isfaller']

# Candidate models, each wrapped in a Pipeline so the scaler is fitted on the
# training portion of a fold only (the pipeline is re-fitted per fold).
# NOTE(review): the RF / logistic-regression hyperparameters look like the
# output of an earlier tuning run — TODO record where they came from.
pipelines = {
    'rf': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(
            n_estimators=1858,
            max_depth=53,
            min_samples_split=16,
            min_samples_leaf=3,
            max_features='log2',
            random_state=RANDOM_STATE,
        )),
    ]),
    'log_reg': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            C=0.7744747828794467,
            max_iter=2142,
            solver='liblinear',
            random_state=RANDOM_STATE,
        )),
    ]),
    'xgb': Pipeline([
        ('scaler', StandardScaler()),
        # NOTE(review): `use_label_encoder` is reportedly deprecated in newer
        # XGBoost releases and may only emit a warning there — confirm against
        # the installed version before removing it.
        ('clf', XGBClassifier(
            eval_metric='auc',
            use_label_encoder=False,
            random_state=RANDOM_STATE,
        )),
    ]),
}

# Stratified 5-fold split, shuffled with the shared seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [None]:
def _take_rows(data, idx):
    """Select rows of ``data`` by integer position, for pandas or array input."""
    # pandas objects need .iloc for positional selection (plain [] is
    # label/column based); NumPy-style arrays accept the index array directly.
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


def train_and_evaluate(pipelines, X, y, cv):
    """Cross-validate every pipeline and collect per-fold score and ROC AUC.

    Parameters
    ----------
    pipelines : dict
        Maps a display name to an estimator exposing ``fit``, ``score`` and
        ``predict_proba``. Each estimator is re-fitted in place on every fold,
        so after the call it holds the fit from the last fold.
    X, y : pandas DataFrame / Series or NumPy-style arrays
        Features and labels. Rows are selected by position using the index
        arrays produced by ``cv.split(X, y)``.
    cv : object
        Splitter whose ``split(X, y)`` yields ``(train_idx, test_idx)`` pairs.

    Returns
    -------
    dict
        ``{name: metrics}`` where ``metrics`` has the keys ``fold_results``
        (values of ``pipeline.score`` per fold, printed as accuracy),
        ``mean_test_score``, ``std_test_score``, ``auc_scores``,
        ``mean_auc_score`` and ``std_auc_score``. Standard deviations are
        computed with ``pandas.Series.std``.

    Notes
    -----
    Column 1 of ``predict_proba`` is used as the score passed to
    ``roc_auc_score``. Progress and summary lines are printed as a side effect.
    """
    results = {}
    for pipeline_name, pipeline in pipelines.items():
        print(f'Training {pipeline_name}...')
        fold_results = []
        auc_scores = []

        for train_idx, test_idx in cv.split(X, y):
            X_train, X_test = _take_rows(X, train_idx), _take_rows(X, test_idx)
            y_train, y_test = _take_rows(y, train_idx), _take_rows(y, test_idx)

            pipeline.fit(X_train, y_train)

            test_score = pipeline.score(X_test, y_test)
            # Second column of predict_proba is taken as the positive-class score.
            y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
            auc_score = roc_auc_score(y_test, y_pred_prob)

            fold_results.append(test_score)
            auc_scores.append(auc_score)

            print(f'Fold Test Accuracy: {test_score:.4f}, AUC: {auc_score:.4f}')

        summary = {
            'fold_results': fold_results,
            'mean_test_score': sum(fold_results) / len(fold_results),
            'std_test_score': pd.Series(fold_results).std(),
            'auc_scores': auc_scores,
            'mean_auc_score': sum(auc_scores) / len(auc_scores),
            'std_auc_score': pd.Series(auc_scores).std()
        }
        results[pipeline_name] = summary

        print(f'{pipeline_name} - Mean Test Accuracy: {summary["mean_test_score"]:.4f}, Std Test Accuracy: {summary["std_test_score"]:.4f}')
        print(f'{pipeline_name} - Mean AUC: {summary["mean_auc_score"]:.4f}, Std AUC: {summary["std_auc_score"]:.4f}')

    return results

# Run the cross-validated comparison for every configured pipeline.
results = train_and_evaluate(
    pipelines=pipelines,
    X=X,
    y=y,
    cv=cv,
)