In [None]:
# 1: Instalacja i import niezbędnych bibliotek

# Import bibliotek
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import mlflow
import mlflow.sklearn
import shap
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, confusion_matrix, classification_report
)
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tqdm import tqdm
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mlflow.models.signature import infer_signature
import optuna
import joblib
import warnings

# Silence every warning emitted by the libraries used below
warnings.filterwarnings("ignore")

# Make sure the output directories exist before anything is written to them
for output_dir in ("../results", "../reports", "../models"):
    os.makedirs(output_dir, exist_ok=True)

# Default look and size for all plots
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# MLflow configuration: tracking server and experiment
experiment_name = "Heart_Disease_Classification_test_6"  # experiment name
mlflow.set_tracking_uri("http://localhost:5000")  # change to your MLflow server address
mlflow.set_experiment(experiment_name)

print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")
print(f"Experiment ID: {mlflow.get_experiment_by_name(experiment_name).experiment_id}")

In [None]:
# 2: Wczytywanie danych przygotowanych wcześniej

# Load the previously prepared training and test sets
train_data = pd.read_csv("../data/processed/heart_train.csv")
test_data = pd.read_csv("../data/processed/heart_test.csv")

# Split each set into features and the target, encoding the target as 0/1
label_encoding = {"No": 0, "Yes": 1}

X_train = train_data.drop(columns="HeartDisease")
y_train = train_data["HeartDisease"].map(label_encoding)

X_test = test_data.drop(columns="HeartDisease")
y_test = test_data["HeartDisease"].map(label_encoding)

In [None]:
# 3: Przygotowanie danych – identyfikacja kolumn

# Split the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical
categorical_cols = list(X_train.select_dtypes(include="object").columns)
numerical_cols = list(X_train.select_dtypes(include=["int64", "float64"]).columns)

print("Kolumny kategoryczne:", categorical_cols)
print("Kolumny numeryczne:", numerical_cols)
print(f"Wymiary zbioru treningowego: {X_train.shape}")
print(f"Wymiary zbioru testowego: {X_test.shape}")

In [None]:
# 4: Przygotowanie pipeline do przetwarzania danych

# Numerical columns: fill gaps with the median, then standardise
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Combine both transformers into a single preprocessor
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Funkcja do tworzenia pipeline'u modelu
def create_model_pipeline(classifier):
    """Build a pipeline made of the preprocessing step and the given classifier.

    Args:
        classifier: Estimator instance placed in the final ``'classifier'`` step.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    # Local import: the file's import block does not bring ``clone`` in.
    from sklearn.base import clone

    # Each pipeline gets its own copy of the module-level ``preprocessor``.
    # Sharing the single instance would mean that fitting one pipeline also
    # refits the transformer embedded in every pipeline created earlier.
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', classifier)
    ])

In [None]:
# 5: Trenowanie modeli i logowanie wyników do MLflow

# Classifiers to evaluate, keyed by the name used for MLflow runs and
# artifact file names. Commented-out entries are currently disabled.
classifiers = {
    "LinearSVC": LinearSVC(random_state=42, max_iter=20000),
    "KNN": KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', n_jobs=-1),
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1),
    #"RandomForestClassifier": RandomForestClassifier(random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42, n_jobs=-1),
    # Seeded like the other estimators (and like the tuned GradientBoosting
    # variant) so repeated runs are comparable.
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    #"LGBMClassifier": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "RidgeClassifier": RidgeClassifier(),
    #"NaiveBayes": GaussianNB(),
    "LDA": LinearDiscriminantAnalysis()
}

# Per-model metric dictionaries, filled in by the training loop
results = {}

# Funkcja do logowania modelu i metryk do MLflow
def _save_confusion_matrix_plot(y_true, y_hat, model_name):
    """Draw the confusion-matrix heatmap, save it under ../results and return its path."""
    matrix = confusion_matrix(y_true, y_hat)
    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    out_path = f"../results/confusion_matrix_{model_name}.png"
    plt.savefig(out_path)
    plt.close()
    return out_path


def _save_roc_curve_plot(y_true, y_score, model_name):
    """Draw the ROC curve, save it under ../results and return ``(auc, path)``."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc="lower right")
    out_path = f"../results/roc_curve_{model_name}.png"
    plt.savefig(out_path)
    plt.close()
    return roc_auc, out_path


def log_model_to_mlflow(model, model_name, pipeline, X_test, y_test):
    """Evaluate a fitted pipeline on the test data and record everything in MLflow.

    Inside one MLflow run named ``model_name`` this logs the estimator's
    parameters, the weighted accuracy/precision/recall/F1 metrics, the
    confusion-matrix and ROC plots, the pickled pipeline, the MLflow model
    (best effort) and the classification report.

    Args:
        model: Estimator whose ``get_params()`` are logged as run parameters.
        model_name: Name used for the run, the registry entry and file names.
        pipeline: Fitted pipeline used for predictions and persisted to disk.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``f1_score`` and ``roc_auc`` (``None`` when the pipeline exposes
        neither ``predict_proba`` nor ``decision_function``).
    """
    y_pred = pipeline.predict(X_test)

    # Continuous scores for the ROC curve: positive-class probabilities when
    # available, otherwise the raw decision-function values.
    y_score = None
    if hasattr(pipeline, "predict_proba"):
        y_score = pipeline.predict_proba(X_test)[:, 1]
    elif hasattr(pipeline, "decision_function"):
        y_score = pipeline.decision_function(X_test)

    scores = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted'),
        "recall": recall_score(y_test, y_pred, average='weighted'),
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
    }

    with mlflow.start_run(run_name=model_name, nested=True):
        for param_name, param_value in model.get_params().items():
            mlflow.log_param(param_name, param_value)

        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # Confusion matrix
        mlflow.log_artifact(_save_confusion_matrix_plot(y_test, y_pred, model_name))

        # ROC curve (only when continuous scores are available)
        if y_score is None:
            roc_auc = None
        else:
            roc_auc, roc_path = _save_roc_curve_plot(y_test, y_score, model_name)
            mlflow.log_artifact(roc_path)
            mlflow.log_metric("roc_auc", roc_auc)

        signature = infer_signature(X_test, y_pred)

        # Persist the pipeline locally and attach the file as a plain
        # artifact (this does not go through the model-registry API).
        local_model_path = f"../models/{model_name}_model.pkl"
        joblib.dump(pipeline, local_model_path)
        mlflow.log_artifact(local_model_path)

        # Best effort: log and register the model in MLflow; if that fails
        # (e.g. the server closes the connection) report it and carry on.
        try:
            mlflow.sklearn.log_model(
                pipeline,
                f"{model_name}_model",
                input_example=X_test.iloc[:1],
                signature=signature
            )
            mlflow.register_model(
                model_uri=f"runs:/{mlflow.active_run().info.run_id}/{model_name}_model",
                name=model_name
            )
        except Exception as e:
            print(f"⚠️ Nie udało się zarejestrować modelu {model_name} w MLflow Model Registry: {e}")

        # Classification report as a CSV artifact
        report_frame = pd.DataFrame(
            classification_report(y_test, y_pred, output_dict=True)
        ).transpose()
        clf_path = f"../reports/classification_report_{model_name}.csv"
        report_frame.to_csv(clf_path)
        mlflow.log_artifact(clf_path)

    return {**scores, "roc_auc": roc_auc}

# Fit every candidate classifier, log it to MLflow and keep its metrics
for name, classifier in tqdm(classifiers.items(), desc="Trenowanie modeli"):
    print(f"\nTrenowanie modelu: {name}")

    pipeline = create_model_pipeline(classifier)
    pipeline.fit(X_train, y_train)

    result = results[name] = log_model_to_mlflow(classifier, name, pipeline, X_test, y_test)
    print(f"Model: {name}, Accuracy: {result['accuracy']:.4f}, F1 Score: {result['f1_score']:.4f}")

In [None]:
# 6: Porównanie wyników modeli

# Collect the metrics of all models: one row per model, one column per metric
results_df = pd.DataFrame(results).T
print("\nPodsumowanie wyników wszystkich modeli:")
print(results_df)

# Save the summary to CSV and attach it to MLflow as an artifact
results_csv_path = "../reports/model_results_summary.csv"
results_df.to_csv(results_csv_path)
with mlflow.start_run(run_name="Model_Comparison_Summary", nested=True):
    mlflow.log_artifact(results_csv_path)

# Bar chart comparing the metrics.
# The axes are created explicitly and handed to pandas: DataFrame.plot()
# without ``ax`` opens its own figure, which would leave a separately created
# plt.figure() behind as an empty plot.
fig, ax = plt.subplots(figsize=(12, 8))
results_df[['accuracy', 'precision', 'recall', 'f1_score']].plot(kind='bar', ax=ax, rot=45)
ax.set_title('Porównanie wyników modeli')
ax.set_xlabel('Model')
ax.set_ylabel('Wartość metryki')
fig.tight_layout()
fig.savefig("../results/model_comparison.png")
plt.show()

In [None]:
# 7: Tuning hiperparametrów najlepszego modelu (Optuna)

# Pick the model with the highest test accuracy
accuracy_by_model = results_df['accuracy']
best_model_name = accuracy_by_model.idxmax()
print(f"\nNajlepszy model: {best_model_name} z accuracy: {results_df.loc[best_model_name, 'accuracy']:.4f}")

# Funkcja celu Optuny
def optuna_objective(trial):
    """Optuna objective: cross-validated weighted F1 of one candidate model.

    The search space is chosen from the module-level ``best_model_name``.
    The candidate is wrapped in the preprocessing pipeline and scored with
    3-fold cross-validation on the training data only, so the test set is
    never used to choose hyperparameters and remains a fair final check.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean weighted F1 score across the cross-validation folds.

    Raises:
        ValueError: If no search space is defined for ``best_model_name``.
    """
    if best_model_name == "RandomForestClassifier":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 5, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
        }
        model = RandomForestClassifier(random_state=42, **params)

    elif best_model_name == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0)
        }
        model = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42, **params)

    elif best_model_name == "LGBMClassifier":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3)
        }
        model = LGBMClassifier(random_state=42, **params)

    elif best_model_name == "CatBoost":
        params = {
            "iterations": trial.suggest_int("iterations", 100, 500),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3)
        }
        model = CatBoostClassifier(verbose=0, random_state=42, **params)

    elif best_model_name == "GradientBoosting":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0)
        }
        model = GradientBoostingClassifier(random_state=42, **params)
    else:
        raise ValueError("Model nieobsługiwany przez Optuna.")

    # Score on the training data via cross-validation. Fitting on X_train and
    # scoring on X_test here would tune the hyperparameters to the test set
    # and inflate the final test metrics.
    pipeline = create_model_pipeline(model)
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, cv=3, scoring="f1_weighted"
    )

    # Value maximised by Optuna: mean weighted F1 over the folds
    return fold_scores.mean()

# Models for which optuna_objective defines a search space. For any other
# name the objective raises ValueError, so those models skip tuning and are
# rebuilt with their baseline parameters instead of crashing the study.
TUNABLE_MODELS = (
    "RandomForestClassifier",
    "XGBoost",
    "LGBMClassifier",
    "CatBoost",
    "GradientBoosting",
)

if best_model_name in TUNABLE_MODELS:
    # Run the Optuna search, maximising the objective's score
    study = optuna.create_study(direction="maximize")
    study.optimize(optuna_objective, n_trials=30)

    # Best parameters found by the search
    best_params = study.best_trial.params
    print(f"\nNajlepsze parametry znalezione przez Optunę: {best_params}")
else:
    study = None
    best_params = {}
    print(f"\nModel {best_model_name} nie ma przestrzeni przeszukiwania Optuny – używane są parametry bazowe.")

# Factories rebuilding the selected model. Tunable models receive the tuned
# parameters (including GradientBoosting, which must not fall back to its
# defaults); the remaining models repeat their baseline configuration.
final_model_factories = {
    "RandomForestClassifier": lambda: RandomForestClassifier(random_state=42, **best_params),
    "XGBoost": lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **best_params),
    "LGBMClassifier": lambda: LGBMClassifier(random_state=42, **best_params),
    "CatBoost": lambda: CatBoostClassifier(verbose=0, random_state=42, **best_params),
    "GradientBoosting": lambda: GradientBoostingClassifier(random_state=42, **best_params),
    "KNN": lambda: KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', n_jobs=-1),
    "LinearSVC": lambda: LinearSVC(random_state=42, max_iter=20000),
    "LogisticRegression": lambda: LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1),
    "RidgeClassifier": lambda: RidgeClassifier(),
    "NaiveBayes": lambda: GaussianNB(),
    "LDA": lambda: LinearDiscriminantAnalysis(),
}

if best_model_name not in final_model_factories:
    raise ValueError("Brak implementacji final_model dla wybranego modelu.")
final_model = final_model_factories[best_model_name]()

# Train the final pipeline and predict on the test set
final_pipeline = create_model_pipeline(final_model)
final_pipeline.fit(X_train, y_train)
y_pred = final_pipeline.predict(X_test)

# Final test-set metrics
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred, average='weighted')
test_recall = recall_score(y_test, y_pred, average='weighted')
test_f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nMetryki modelu po tuningu Optuna:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# SHAP summary plot, only for the tree-based models listed here
if best_model_name in ["XGBoost", "LGBMClassifier", "CatBoost", "RandomForestClassifier"]:
    fitted_preprocessor = final_pipeline.named_steps['preprocessor']
    X_transformed = fitted_preprocessor.transform(X_test)
    # The transformer output may be a SciPy sparse matrix (one-hot encoded
    # columns); densify it so SHAP and the plot receive a plain array.
    if hasattr(X_transformed, "toarray"):
        X_transformed = X_transformed.toarray()

    explainer = shap.Explainer(final_pipeline.named_steps['classifier'], X_transformed)
    shap_values = explainer(X_transformed, check_additivity=False)

    plt.figure()
    shap.summary_plot(
        shap_values,
        features=X_transformed,
        feature_names=fitted_preprocessor.get_feature_names_out(),
        show=False
    )
    shap_path = f"../results/shap_summary_{best_model_name}.png"
    plt.savefig(shap_path, bbox_inches="tight")
    plt.close()

    # Attach the SHAP plot to MLflow as an artifact
    with mlflow.start_run(run_name=f"SHAP_{best_model_name}", nested=True):
        mlflow.log_artifact(shap_path)

In [None]:
# 8: Zapis najlepszego modelu do pliku

# Persist the final fitted pipeline (preprocessing + classifier) to disk;
# the file name is derived from the selected model's name
model_path = f"../models/{best_model_name}_best_model.pkl"
joblib.dump(final_pipeline, model_path)

# Report where the model was written
print(f"\nNajlepszy model zapisany do: {model_path}")

In [None]:
# 9: Predict pipeline – generowanie predykcji

def predict_pipeline(data_to_predict, model_path):
    """
    Load a saved model and generate predictions for new data.

    Args:
        data_to_predict: DataFrame with the new data
        model_path: Path to the saved model file

    Returns:
        DataFrame with a 'prediction' column and, when the loaded model
        exposes predict_proba, a 'probability' column holding the second
        column of its output.
    """
    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    estimator = joblib.load(model_path)

    output = {'prediction': estimator.predict(data_to_predict)}

    # Probabilities are added only when the model can provide them
    if hasattr(estimator, "predict_proba"):
        output['probability'] = estimator.predict_proba(data_to_predict)[:, 1]

    return pd.DataFrame(output)

# Example usage on the first 5 test samples
print("\nDemonstracja działania predict pipeline:")
sample_data = X_test.head(5)
predictions = predict_pipeline(sample_data, model_path)
print(predictions)

In [None]:
# 10: Podsumowanie projektu

# Summary of the selected model, its parameters and its test metrics
print("\nPodsumowanie projektu:")
summary_lines = (
    f"1. Najlepszy model: {best_model_name}",
    f"2. Najlepsze parametry (Optuna): {best_params}",
    f"3. Accuracy najlepszego modelu: {test_accuracy:.4f}",
    f"4. F1 Score najlepszego modelu: {test_f1:.4f}",
    f"5. Precision najlepszego modelu: {test_precision:.4f}",
    f"6. Recall najlepszego modelu: {test_recall:.4f}",
)
for summary_line in summary_lines:
    print(summary_line)

# Where to find the logged experiments
print("\nWszystkie eksperymenty i wyniki są dostępne w MLflow pod adresem:")
print(f"MLflow UI: {mlflow.get_tracking_uri()}")
print(f"Experiment ID: {mlflow.get_experiment_by_name(experiment_name).experiment_id}")

In [None]:
# 11: Wrap-up - print the final status message
print("\nProjekt zakończony sukcesem!")