Data set from Kaggle: 

https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease

In [None]:
# Install the libraries this notebook imports (modelling, tuning, plotting, Kaggle client, SHAP).
# NOTE(review): versions are unpinned, so a fresh install may resolve to different releases — consider pinning.
%pip install catboost matplotlib pandas scikit-learn kaggle optuna ipywidgets shap jupyterlab-rise

In [None]:
# Install plotly for the interactive charts below.
# NOTE(review): kaleido is presumably what fig.write_image needs for static PNG export — confirm.
%pip install plotly kaleido

In [None]:
from pathlib import Path

# Show the working directory: the relative paths used later in the notebook
# (e.g. "data/heart-disease", "results") are resolved against it.
cwd = Path.cwd()
print("Aktualna ścieżka:", cwd)

In [None]:
import os
import shutil
from pathlib import Path


def install_kaggle_credentials(src, config_dir):
    """Copy a Kaggle API token into ``config_dir`` as ``kaggle.json``.

    Args:
        src: Path of the token file to copy.
        config_dir: Directory that receives ``kaggle.json``; created if missing.

    Returns:
        Path of the installed ``kaggle.json``.

    Raises:
        FileNotFoundError: If ``src`` is not an existing file. The check runs
            before the destination is touched, so a token that is already
            installed is never truncated by a failed copy.
    """
    src = Path(src)
    if not src.is_file():
        raise FileNotFoundError(f"Kaggle token not found: {src}")

    config_dir = Path(config_dir)
    config_dir.mkdir(parents=True, exist_ok=True)

    dst = config_dir / "kaggle.json"
    shutil.copyfile(src, dst)
    # The file holds an API key: make it readable/writable by the owner only.
    dst.chmod(0o600)
    return dst


# Only when the `iscontainer` environment variable is "y" is the token copied
# from the working directory into the fixed config location below.
# NOTE(review): the /home/vscode path suggests a dev container — confirm.
container_check = os.getenv("iscontainer")
if container_check == "y":
    config_dir = Path("/home/vscode/.config/kaggle")
    install_kaggle_credentials("./kaggle.json", config_dir)

In [None]:
import os
import zipfile
import shutil
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()

dataset_name = "kamilpytlak/personal-key-indicators-of-heart-disease"
download_folder = Path("data/heart-disease")
download_folder.mkdir(parents=True, exist_ok=True)

# One definition of the data directory: `base_dir` is an alias of
# `download_folder`, so the two can no longer drift apart.
base_dir = download_folder
file_to_move = base_dir / "2020" / "heart_2020_cleaned.csv"
target_location = base_dir / "heart_2020_cleaned.csv"

if target_location.exists():
    # Cache guard: the CSV is already in place, so skip authentication and the
    # download. This keeps "Restart & Run All" fast and usable offline.
    print(f"Dane już pobrane, pomijam pobieranie: {target_location}")
else:
    api.authenticate()
    api.dataset_download_files(dataset_name, path=str(download_folder), unzip=True)

    # The CSV is expected under the "2020" sub-folder of the unzipped dataset;
    # move it up so later cells can read it directly from `download_folder`.
    if file_to_move.exists():
        shutil.move(str(file_to_move), str(target_location))
        print(f"Przeniesiono: {file_to_move.name} → {target_location}")
    else:
        print("Plik nie istnieje:", file_to_move)

# Remove the per-year folders ("2020", "2022") that are no longer needed.
for folder in ["2020", "2022"]:
    folder_path = base_dir / folder
    if folder_path.exists() and folder_path.is_dir():
        shutil.rmtree(folder_path)
        print(f"Usunięto folder: {folder_path}")
    else:
        print(f"Folder nie istnieje: {folder_path}")


In [None]:
from pathlib import Path

# Print the name and size of every regular file in the data directory;
# sub-directories are skipped.
folder = Path("./data/heart-disease")
for item in folder.iterdir():
    if not item.is_file():
        continue
    size_bytes = item.stat().st_size
    print(f"{item.name}: {size_bytes:,} bajtów")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV and encode the target as an integer: 1 where the value is "Yes", 0 otherwise.
df = pd.read_csv(download_folder / "heart_2020_cleaned.csv")
df["HeartDisease"] = df["HeartDisease"].eq("Yes").astype(int)

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["HeartDisease"],
)


In [None]:
# Summary (columns, dtypes, non-null counts) of the held-out test split.
df_test.info()

In [None]:
# Summary (columns, dtypes, non-null counts) of the training split.
df_train.info()

In [None]:
# Separate the label from the training features. `pop` removes the column in
# place, so X_train is the same object as df_train, now without "HeartDisease".
# NOTE: not idempotent — running this cell twice raises KeyError.
X_train = df_train
y_train = X_train.pop("HeartDisease")

In [None]:
# Separate the label from the test features. `pop` removes the column in
# place, so X_test is the same object as df_test, now without "HeartDisease".
# NOTE: not idempotent — running this cell twice raises KeyError.
X_test = df_test
y_test = X_test.pop("HeartDisease")

In [None]:
# Summary of the training features after the target column has been popped off.
X_train.info()

In [None]:
import joblib
import optuna
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv

outfolder = Path("results")
outfolder.mkdir(parents=True, exist_ok=True)

# Tuned parameters are cached here so the search only runs when the file is absent.
best_params_path = outfolder / "best_params.pkl"

# Columns handed to CatBoost as categorical features (by positional index).
categorical_features = [
    "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
    "Sex", "AgeCategory", "Race", "Diabetic", "PhysicalActivity",
    "GenHealth", "Asthma", "KidneyDisease", "SkinCancer"
]
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_features if col in X_train.columns]

if not best_params_path.is_file():
    # Carve a stratified validation part out of the training data; the test
    # split is never touched during tuning.
    X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
    )

    def objective(trial):
        """Train one CatBoost candidate and return its best validation Logloss.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            The best "Logloss" score recorded on the validation set.
        """
        params = {
            "depth": trial.suggest_int("depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3),
            "iterations": trial.suggest_int("iterations", 100, 300),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 100.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 1),
            "random_strength": trial.suggest_float("random_strength", 1e-5, 100.0, log=True),
            # Fixed (not sampled): same class weighting as the final model.
            "auto_class_weights": "Balanced"
        }
        model = CatBoostClassifier(
            **params,
            verbose=0,
            cat_features=categorical_indices,
            random_seed=42
        )
        model.fit(
            X_train_opt,
            y_train_opt,
            eval_set=(X_val_opt, y_val_opt),
            early_stopping_rounds=50
        )
        return model.get_best_score()["validation"]["Logloss"]

    # Seed the sampler: without it every fresh run explores different trials
    # and can cache different "best" parameters, even though the data split
    # and the models are all seeded with 42.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=50)

    joblib.dump(study.best_params, best_params_path)
    params = study.best_params
else:
    # Reuse the cached result of an earlier search (a local file written by this notebook).
    params = joblib.load(best_params_path)

print("Best Parameters:", params)

In [None]:
# Settings added on top of the tuned parameters; the same dict is used for the
# final model and for the cross-validation run below.
params.update(
    eval_metric="F1",
    loss_function="Logloss",
    auto_class_weights="Balanced",
)

# Final model (fitted in a later cell).
model = CatBoostClassifier(
    cat_features=categorical_indices,
    random_seed=42,
    verbose=True,
    **params,
)

# The Pool carries the categorical column indices for the CV run.
data = Pool(data=X_train, label=y_train, cat_features=categorical_indices)

# 5-fold stratified, shuffled cross-validation with a fixed partition seed.
cv_results = cv(
    pool=data,
    params=params,
    fold_count=5,
    shuffle=True,
    stratified=True,
    partition_random_seed=42,
)

cv_results.to_csv(outfolder / "cv_results.csv", index=False)

In [None]:
import plotly.graph_objects as go
import numpy as np

# Per-iteration F1 statistics from the CV table, as plain numpy arrays.
iterations = cv_results["iterations"].values
mean_f1 = cv_results["test-F1-mean"].values
std_f1 = cv_results["test-F1-std"].values

# The ±1 std band is one closed polygon: the upper edge left-to-right,
# followed by the lower edge right-to-left.
band_trace = go.Scatter(
    x=np.hstack((iterations, iterations[::-1])),
    y=np.hstack((mean_f1 + std_f1, (mean_f1 - std_f1)[::-1])),
    fill="toself",
    fillcolor="rgba(0, 0, 255, 0.2)",
    line={"width": 0},
    showlegend=False,
    hoverinfo='skip',
    name='±1 std',
)
mean_trace = go.Scatter(
    x=iterations,
    y=mean_f1,
    mode="lines",
    name="Mean F1 Score",
    line={"color": "blue", "width": 2},
)

# Add the band first so the mean line is drawn on top of it.
fig = go.Figure()
fig.add_trace(band_trace)
fig.add_trace(mean_trace)

fig.update_layout(
    title="Cross-Validation (N=5) Mean F1 score with Error Bands",
    xaxis_title="Training Steps",
    yaxis_title="Performance Score",
    template="plotly_white",
    yaxis={"range": [0.5, 1]},
    hovermode='x unified',
)

fig.show()

# Save a static PNG; if plotly's image export fails for any reason, draw the
# same chart with matplotlib and save that instead.
try:
    fig.write_image(outfolder / "test_f1.png")
    print("Wykres zapisany przez plotly")
except Exception as err:
    print(f"Błąd plotly: {err}")
    print("Używam matplotlib...")

    import matplotlib.pyplot as plt

    lower_f1 = mean_f1 - std_f1
    upper_f1 = mean_f1 + std_f1

    plt.figure(figsize=(10, 6))
    plt.fill_between(iterations, lower_f1, upper_f1,
                     alpha=0.2, color='blue', label='±1 std')
    plt.plot(iterations, mean_f1, 'b-', linewidth=2, label='Mean F1 Score')

    plt.title("Cross-Validation (N=5) Mean F1 score with Error Bands")
    plt.xlabel("Training Steps")
    plt.ylabel("Performance Score")
    plt.ylim(0.5, 1)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.savefig(outfolder / "test_f1.png", dpi=300, bbox_inches='tight')
    plt.show()
    print("Wykres zapisany przez matplotlib")

In [None]:
import plotly.graph_objects as go

# Mean test Logloss per iteration and the bounds of its ±1 std band.
logloss_mean = cv_results["test-Logloss-mean"]
logloss_upper = logloss_mean + cv_results["test-Logloss-std"]
logloss_lower = logloss_mean - cv_results["test-Logloss-std"]

# Create figure
fig = go.Figure()

# Add mean performance line
fig.add_trace(
    go.Scatter(
        x=cv_results["iterations"], y=logloss_mean, mode="lines", name="Mean logloss", line=dict(color="blue")
    )
)

# Add shaded error region.
# With fill="toself" the points describe one closed polygon, so the second half
# of BOTH x and y must run right-to-left: upper edge forward, lower edge
# reversed. Reversing only x makes the outline cross itself and the shaded
# area no longer matches the ±1 std band.
fig.add_trace(
    go.Scatter(
        x=pd.concat([cv_results["iterations"], cv_results["iterations"][::-1]]),
        y=pd.concat([logloss_upper, logloss_lower[::-1]]),
        fill="toself",
        fillcolor="rgba(0, 0, 255, 0.2)",
        line=dict(color="rgba(255, 255, 255, 0)"),
        showlegend=False
    )
)

# Customize layout
fig.update_layout(
    title="Cross-Validation (N=5) Mean Logloss with Error Bands",
    xaxis_title="Training Steps",
    yaxis_title="Logloss",
    template="plotly_white"
)

fig.show()

fig.write_image(outfolder / "test_logloss.png")

In [None]:
# Fit the final model on the whole training split, logging every 50 iterations
# and showing CatBoost's training plot.
# NOTE(review): no eval_set is passed here, so early_stopping_rounds presumably
# has nothing to monitor — confirm whether a validation set was intended.
fit_options = {
    "verbose_eval": 50,
    "early_stopping_rounds": 50,
    "use_best_model": False,
    "plot": True,
}
model.fit(X_train, y_train, **fit_options)

# Persist the trained model together with the parameters it was built from.
model.save_model(outfolder / 'catboost_model_HeartDisease.cbm')
joblib.dump(params, outfolder / 'model_params.pkl')

In [None]:
# Preview the model's predictions for the test split ("HeartDisease" was popped
# from df_test earlier, so the frame holds only feature columns at this point).
model.predict(df_test)

In [None]:
# Keep the predictions; the training columns are selected explicitly so the
# model receives exactly the features (and order) of X_train.
preds = model.predict(df_test[X_train.columns])

In [None]:
import shap
import matplotlib.pyplot as plt

# Explain the model on exactly the columns it was trained on (same selection
# as the prediction cell), so any extra column present in df_test is never
# handed to the explainer.
X_explain = df_test[X_train.columns]

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_explain)

# show=False keeps the figure open so it can be saved below.
shap.summary_plot(shap_values, X_explain, show=False)
# bbox_inches="tight" (as in the matplotlib F1 export) makes the saved PNG
# include the axis labels instead of cropping to the default figure box.
plt.savefig(outfolder / "test_shap_overall.png", bbox_inches="tight")

In [None]:
# Build the export as a NEW frame instead of adding a column to df_test.
# df_test is the feature matrix passed to model.predict and to SHAP; writing
# "target" into it would hand the model an extra column if those cells are
# run again. The CSV content is unchanged: all df_test columns plus "target".
predictions = df_test.assign(target=preds)
predictions.to_csv(outfolder / "predictions.csv", index=False)