In [None]:
# Install the cookiecutter-data-science package into the kernel's environment.
# NOTE(review): the version is not pinned, and the third-party packages imported
# by later cells (kaggle, catboost, optuna, shap, plotly, ...) are not installed
# here -- confirm they are provided by the environment.
%pip install cookiecutter-data-science

In [None]:
import os
import zipfile
import shutil
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

# Download the Kaggle heart-disease dataset and keep only the cleaned 2020 CSV
# directly inside `download_folder`.
api = KaggleApi()
api.authenticate()

dataset_name = "kamilpytlak/personal-key-indicators-of-heart-disease"
download_folder = Path("data/heart-disease")
download_folder.mkdir(parents=True, exist_ok=True)

# Single source of truth for the data directory (was a second, duplicated literal).
base_dir = download_folder
file_to_move = base_dir / "2020" / "heart_2020_cleaned.csv"
target_location = base_dir / "heart_2020_cleaned.csv"

if target_location.exists():
    # The CSV is already in place from an earlier run: skip the network
    # download so a full re-run of the notebook stays cheap.
    print(f"Plik już istnieje, pomijam pobieranie: {target_location}")
else:
    api.dataset_download_files(dataset_name, path=str(download_folder), unzip=True)

    # Move the file out of its year sub-folder into the dataset root.
    if file_to_move.exists():
        shutil.move(str(file_to_move), str(target_location))
        print(f"Przeniesiono: {file_to_move.name} → {target_location}")
    else:
        print("Plik nie istnieje:", file_to_move)

# Remove the 2020 and 2022 sub-folders left behind by the unzip step.
for folder in ["2020", "2022"]:
    folder_path = base_dir / folder
    if folder_path.exists() and folder_path.is_dir():
        shutil.rmtree(folder_path)
        print(f"Usunięto folder: {folder_path}")
    else:
        print(f"Folder nie istnieje: {folder_path}")

In [None]:
#!dir "./data/heart-disease"

from pathlib import Path

# Print the name and size (in bytes) of every regular file that sits directly
# inside the dataset folder; sub-directories are skipped.
folder = Path("./data/heart-disease")
regular_files = (entry for entry in folder.iterdir() if entry.is_file())
for item in regular_files:
    size_in_bytes = item.stat().st_size
    print(f"{item.name}: {size_in_bytes:,} bajtów")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned CSV from the download folder.
df = pd.read_csv(download_folder / "heart_2020_cleaned.csv")

# Quick overview: shape, column names, dtypes and the first rows.
print(f"Kształt danych: {df.shape}")
print(f"\nKolumny:\n{df.columns.tolist()}")
print(f"\nTypy danych:\n{df.dtypes}")
print("\nPierwsze wiersze:")
print(df.head())

# 80/20 train/test split, stratified on the target column.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["HeartDisease"],
)

# Split features from the target. `pop` removes the column in place, so
# X_train / X_test are the very same objects as df_train / df_test.
y_train, y_test = (part.pop("HeartDisease") for part in (df_train, df_test))
X_train, X_test = df_train, df_test

# Summary of the split and the class balance in the training part.
print("\n" + "=" * 50)
print(f"Rozmiar train: {X_train.shape}")
print(f"Rozmiar test: {X_test.shape}")
print("\nRozkład target w train:")
print(y_train.value_counts())
print("\nRozkład procentowy:")
print(y_train.value_counts(normalize=True))

In [None]:
import joblib
import optuna
from pathlib import Path
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Hyperparameter search for CatBoost with Optuna. The best parameters are
# cached on disk so the (slow) search only runs when no cache file exists.
outfolder = Path("results")
outfolder.mkdir(parents=True, exist_ok=True)

best_params_path = outfolder / "best_params.pkl"

# Columns passed to CatBoost as categorical features.
cat_features = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex',
                'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity',
                'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']

if not best_params_path.is_file():
    # Carve a validation part out of the training data for the search only;
    # the test split is not touched here.
    X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
    )

    def objective(trial):
        """Train one CatBoost model for `trial` and return its best validation Logloss.

        The tuned values are drawn from `trial`; class weighting, seed and
        verbosity are fixed. Lower return values are better.
        """
        params = {
            "depth": trial.suggest_int("depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
            "iterations": trial.suggest_int("iterations", 100, 500),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 100.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 1),
            "random_strength": trial.suggest_float("random_strength", 1e-5, 100.0, log=True),
            "auto_class_weights": "Balanced",  # important: the classes are imbalanced
            "random_seed": 42,
            "verbose": 0
        }

        model = CatBoostClassifier(**params)
        model.fit(
            X_train_opt, y_train_opt,
            cat_features=cat_features,
            eval_set=(X_val_opt, y_val_opt),
            early_stopping_rounds=50
        )

        return model.get_best_score()["validation"]["Logloss"]

    # Seed the sampler: without it every run of the search proposes different
    # trials, so the cached "best" parameters could not be reproduced even
    # though the data splits and the models are seeded with 42.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=50, show_progress_bar=True)

    # Only the values suggested through `trial` end up in best_params; the
    # fixed keys of the objective (class weights, seed, verbosity) are not stored.
    joblib.dump(study.best_params, best_params_path)
    params = study.best_params
else:
    # NOTE: joblib.load unpickles the file -- only load caches written by this notebook.
    params = joblib.load(best_params_path)

print("Best Parameters:", params)

In [None]:
from catboost import CatBoostClassifier, Pool, cv

# Extend the tuned parameters (in place) with the metric, loss and class
# weighting used for cross-validation and the final fit.
params.update(
    eval_metric="F1",
    loss_function="Logloss",
    auto_class_weights="Balanced",
)

model = CatBoostClassifier(verbose=True, **params)

# Training data wrapped in a Pool so the categorical columns are declared once.
data = Pool(data=X_train, label=y_train, cat_features=cat_features)

# 5-fold, shuffled, stratified cross-validation with a fixed partition seed.
cv_results = cv(
    pool=data,
    params=params,
    fold_count=5,
    shuffle=True,
    stratified=True,
    partition_random_seed=42,
)

cv_results_path = outfolder / "cv_results_v2.csv"
cv_results.to_csv(cv_results_path, index=False)
print("\nWyniki CV:")
print(cv_results.describe())

In [None]:
import plotly.graph_objects as go

# Plot the cross-validated mean test F1 per iteration with a +/- 1 std band.
fig = go.Figure()

# Upper and lower edge of the error band (mean +/- one standard deviation).
f1_upper = cv_results["test-F1-mean"] + cv_results["test-F1-std"]
f1_lower = cv_results["test-F1-mean"] - cv_results["test-F1-std"]

# Add mean performance line
fig.add_trace(
    go.Scatter(
        x=cv_results["iterations"], y=cv_results["test-F1-mean"], mode="lines", name="Mean F1 Score", line=dict(color="blue")
    )
)

# Add shaded error region. With fill="toself" the trace is a closed polygon:
# walk the upper edge left-to-right, then the lower edge right-to-left. The
# lower edge must be reversed together with x, otherwise each lower value is
# drawn at the mirrored iteration and the band is distorted.
fig.add_trace(
    go.Scatter(
        x=pd.concat([cv_results["iterations"], cv_results["iterations"][::-1]]),
        y=pd.concat([f1_upper, f1_lower[::-1]]),
        fill="toself",
        fillcolor="rgba(0, 0, 255, 0.2)",
        line=dict(color="rgba(255, 255, 255, 0)"),
        showlegend=False
    )
)

# Customize layout
fig.update_layout(
    title="Cross-Validation (N=5) Mean F1 score with Error Bands",
    xaxis_title="Training Steps",
    yaxis_title="Performance Score",
    template="plotly_white",
    yaxis=dict(range=[0.5, 1])
)

fig.show()

fig.write_image(outfolder / "test_f1_v2.png")

In [None]:
import plotly.graph_objects as go

# Plot the cross-validated mean test Logloss per iteration with a +/- 1 std band.
fig = go.Figure()

# Upper and lower edge of the error band (mean +/- one standard deviation).
logloss_upper = cv_results["test-Logloss-mean"] + cv_results["test-Logloss-std"]
logloss_lower = cv_results["test-Logloss-mean"] - cv_results["test-Logloss-std"]

# Add mean performance line
fig.add_trace(
    go.Scatter(
        x=cv_results["iterations"], y=cv_results["test-Logloss-mean"], mode="lines", name="Mean logloss", line=dict(color="blue")
    )
)

# Add shaded error region. With fill="toself" the trace is a closed polygon:
# walk the upper edge left-to-right, then the lower edge right-to-left. The
# lower edge must be reversed together with x, otherwise each lower value is
# drawn at the mirrored iteration and the band is distorted.
fig.add_trace(
    go.Scatter(
        x=pd.concat([cv_results["iterations"], cv_results["iterations"][::-1]]),
        y=pd.concat([logloss_upper, logloss_lower[::-1]]),
        fill="toself",
        fillcolor="rgba(0, 0, 255, 0.2)",
        line=dict(color="rgba(255, 255, 255, 0)"),
        showlegend=False
    )
)

# Customize layout
fig.update_layout(
    title="Cross-Validation (N=5) Mean Logloss with Error Bands",
    xaxis_title="Training Steps",
    yaxis_title="Logloss",
    template="plotly_white"
)

fig.show()

fig.write_image(outfolder / "test_logloss_v2.png")

In [None]:
# Build the final classifier from the current `params` dict and fit it on the
# whole training split, then persist both the model and the parameters.
model = CatBoostClassifier(verbose=True, **params)

# NOTE(review): early_stopping_rounds is passed without an eval_set --
# confirm whether it has any effect in this call.
fit_options = dict(
    cat_features=cat_features,
    verbose_eval=50,
    early_stopping_rounds=50,
    use_best_model=False,
    plot=True,
)
model.fit(X_train, y_train, **fit_options)

model_path = outfolder / "catboost_model_HeartDisease_v2.cbm"
model.save_model(model_path)
joblib.dump(params, outfolder / "model_params_v2.pkl")

In [None]:
# Predict on the test split; as the last expression of the cell the result is
# shown as the cell output. `df_test` no longer contains "HeartDisease" here
# because the split cell popped that column out of it.
model.predict(df_test)

In [None]:
# Store the test predictions. Selecting X_train.columns restricts df_test to
# the columns (and column order) of the frame the model was fitted on.
preds = model.predict(df_test[X_train.columns])

In [None]:
import shap
import matplotlib.pyplot as plt
# Compute SHAP values for the test split with a tree explainer built on the
# fitted model.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_test)

# Draw the summary plot without showing it immediately (show=False), then
# save the current matplotlib figure to the results folder.
shap_plot_path = outfolder / "test_shap_overall_v2.png"
shap.summary_plot(shap_values, df_test, show=False)
plt.savefig(shap_plot_path)

In [None]:
# Write the test features together with the predicted label to CSV.
# The "target" column is added on a copy (DataFrame.assign) rather than in
# place: df_test is the same object as X_test, so an in-place column would
# also appear in X_test and would make the earlier predict / SHAP cells see
# an unexpected extra column when they are re-run.
predictions = df_test.assign(target=preds)
predictions.to_csv(outfolder / "predictions_v2.csv", index=False)