In [1]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Константы
DATA_PATH = 'final_pipline.csv'
SIGNIFICANT_VARS_PATH = 'significant_vars.txt'
STACKING_MODEL_PATH = 'best_stacking_model.joblib'
RANDOM_STATE = 0
TEST_SIZE = 0.3
CV_FOLDS = 5

# === Загрузка и подготовка данных ===
def load_data(data_path: str, vars_path: str):
    df = pd.read_csv(data_path)
    df['target'] = df['group'].apply(lambda x: 1 if x == 'patient' else 0)
    df.drop(columns=['filepath', 'group'], inplace=True)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    with open(vars_path, 'r') as file:
        significant_vars = [line.strip() for line in file]

    return df, significant_vars

def fill_missing_by_group(data: pd.DataFrame, group_col: str, feature_cols: list):
    data_filled = data.copy()
    for col in feature_cols:
        group_means = data_filled.groupby(group_col)[col].transform('mean')
        data_filled[col] = data_filled[col].fillna(group_means)
    return data_filled

# === Создание пайплайна ===
def make_pipeline(model):
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('clf', model)
    ])

# === Кросс-валидация модели ===
def evaluate_model(pipeline, X, y, cv=CV_FOLDS):
    scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring_metrics)
    return {
        metric: (cv_results[f'test_{metric}'].mean(), cv_results[f'test_{metric}'].std())
        for metric in scoring_metrics
    }

# === Вывод результатов ===
def print_scores(model_name, scores_dict):
    print(f"\nМодель: {model_name}")
    for metric, (mean, std) in scores_dict.items():
        print(f"  {metric}: {mean:.3f} ± {std:.3f}")

# === Основной блок ===
def main():
    # Загрузка данных
    df, significant_vars = load_data(DATA_PATH, SIGNIFICANT_VARS_PATH)
    df_filled = fill_missing_by_group(df, 'target', significant_vars)

    X = df_filled[significant_vars]
    y = df_filled['target']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )
    
    # Определение моделей
    models = {
        'random_forest': RandomForestClassifier(random_state=RANDOM_STATE),
        'logistic_regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        'knn': KNeighborsClassifier(),
        'gradient_boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
        #'xgboost': XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE, scale_pos_weight=scale_pos_weight)
    }

    results = {}

    # Кросс-валидация и обучение базовых моделей
    for name, model in models.items():
        pipe = make_pipeline(model)
        scores = evaluate_model(pipe, X_train, y_train)
        results[name] = scores
        pipe.fit(X_train, y_train)
        #print_scores(name, scores)

    # Стекинг моделей
    estimators = [(name, make_pipeline(model)) for name, model in models.items()]

    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=RandomForestClassifier(random_state=RANDOM_STATE),
        passthrough=True,
        cv=CV_FOLDS
    )

    param_grid = {
        'final_estimator__n_estimators': [50, 100, 200],
        'final_estimator__max_depth': [None, 10, 20, 30],
    }

    grid_search = GridSearchCV(
        estimator=stacking_model,
        param_grid=param_grid,
        cv=CV_FOLDS,
        scoring='recall',
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    best_stacking_model = grid_search.best_estimator_
    joblib.dump(best_stacking_model, STACKING_MODEL_PATH)

    # Оценка стекинг-модели
    stacking_scores = evaluate_model(best_stacking_model, X_train, y_train)
    results['stacking'] = stacking_scores

    print_scores("stacking", stacking_scores)

    # Итоговые результаты
    print("\n=== Сводка всех моделей ===")
    for model_name, metrics in results.items():
        print_scores(model_name, metrics)

    print(f"\nСтекинг-модель успешно сохранена в файл: '{STACKING_MODEL_PATH}'.")

if __name__ == "__main__":
    main()



Модель: stacking
  accuracy: 0.957 ± 0.017
  precision: 0.958 ± 0.030
  recall: 0.940 ± 0.027
  f1: 0.949 ± 0.019
  roc_auc: 0.987 ± 0.008

=== Сводка всех моделей ===

Модель: random_forest
  accuracy: 0.929 ± 0.020
  precision: 0.955 ± 0.032
  recall: 0.874 ± 0.028
  f1: 0.912 ± 0.025
  roc_auc: 0.981 ± 0.009

Модель: logistic_regression
  accuracy: 0.927 ± 0.027
  precision: 0.915 ± 0.057
  recall: 0.916 ± 0.023
  f1: 0.914 ± 0.028
  roc_auc: 0.971 ± 0.014

Модель: knn
  accuracy: 0.867 ± 0.054
  precision: 0.844 ± 0.064
  recall: 0.843 ± 0.111
  f1: 0.838 ± 0.072
  roc_auc: 0.930 ± 0.036

Модель: gradient_boosting
  accuracy: 0.934 ± 0.034
  precision: 0.949 ± 0.032
  recall: 0.892 ± 0.057
  f1: 0.919 ± 0.043
  roc_auc: 0.986 ± 0.007

Модель: stacking
  accuracy: 0.957 ± 0.017
  precision: 0.958 ± 0.030
  recall: 0.940 ± 0.027
  f1: 0.949 ± 0.019
  roc_auc: 0.987 ± 0.008

Стекинг-модель успешно сохранена в файл: 'best_stacking_model.joblib'.
