In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# 1. Read the data set from the CSV file in the current working directory
df = pd.read_csv("heart.csv")

# 2. Separate the target column (y) from the remaining predictor columns (X)
y = df["HeartDisease"]
X = df.drop(columns=[y.name])

# 3. Partition the predictors by dtype: columns with a numeric dtype are the
#    numeric features; every other column is treated as categorical.
#    Column order of X is preserved in both lists.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
numeric_lookup = set(numeric_features)
categorical_features = [col for col in X.columns if col not in numeric_lookup]

print("Numerische Features:", numeric_features)
print("Kategoriale Features:", categorical_features)

# 4. Preprocessing shared by both models: numeric columns are standardised,
#    categorical columns are one-hot encoded. handle_unknown="ignore" makes
#    the encoder tolerate categories at transform time that it did not see
#    during fit instead of raising an error.
column_steps = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
]
preprocess = ColumnTransformer(transformers=column_steps)

# 5. Train/test split: 20 % of the rows are held out for testing. The split is
#    stratified on y so the class distribution is preserved in both parts, and
#    the fixed random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 6. Decision-tree model: the shared preprocessing step followed by a single
#    tree with a fixed seed
tree_clf = DecisionTreeClassifier(random_state=42)
tree_pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", tree_clf)])

# Fit on the training split (fit returns the pipeline itself), then predict
# the held-out test rows
y_pred_tree = tree_pipeline.fit(X_train, y_train).predict(X_test)

# Accuracy on the test split, followed by the per-class metrics report
acc_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
print(f"Accuracy Entscheidungsbaum: {acc_tree:.3f}")
print("\nClassification Report – Entscheidungsbaum:")
print(classification_report(y_true=y_test, y_pred=y_pred_tree))

# 7. Random-forest model: 300 trees, fixed seed, n_jobs=-1 to train in parallel
rf_clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# Pipeline.fit fits its steps in place (no copy is made when `memory` is not
# set). `preprocess` is already a step of tree_pipeline, so passing the same
# object here would refit -- and thereby overwrite -- the transformer that
# tree_pipeline relies on whenever this pipeline is fitted. clone() returns an
# unfitted copy with identical parameters, which keeps the two pipelines
# independent of each other.
rf_pipeline = Pipeline([
    ("preprocess", clone(preprocess)),
    ("model", rf_clf)
])

# Fit on the training split and predict the held-out test rows
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Accuracy on the test split, followed by the per-class metrics report
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"\nAccuracy Random Forest: {acc_rf:.3f}")
print("\nClassification Report – Random Forest:")
print(classification_report(y_test, y_pred_rf))


Numerische Features: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
Kategoriale Features: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
Accuracy Entscheidungsbaum: 0.793

Classification Report – Entscheidungsbaum:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77        82
           1       0.81      0.81      0.81       102

    accuracy                           0.79       184
   macro avg       0.79      0.79      0.79       184
weighted avg       0.79      0.79      0.79       184


Accuracy Random Forest: 0.908

Classification Report – Random Forest:
              precision    recall  f1-score   support

           0       0.92      0.87      0.89        82
           1       0.90      0.94      0.92       102

    accuracy                           0.91       184
   macro avg       0.91      0.90      0.91       184
weighted avg       0.91      0.91      0.91       184

