In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("./heart.csv")

# Tampilkan info dasar
print(df.info()) 
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB
None
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina    

In [6]:
# --- PREPROCESSING ---

# Hapus kolom yang tidak relevan
df = df.drop(columns=["id", "dataset"])

# Ubah target jadi biner (0 = sehat, >0 = sakit)
df["target"] = (df["num"] > 0).astype(int)
df = df.drop(columns=["num"])

# Tangani missing value:
# - Numerik: isi dengan median
# - Kategorikal: isi dengan modus
for col in df.columns:
    if df[col].dtype == "object":
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

# One-hot encode kolom kategorikal
df = pd.get_dummies(df, drop_first=True)

# Split data
X = df.drop(columns=["target"])
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standarisasi fitur
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [7]:
from imblearn.over_sampling import SMOTE

# Inisialisasi SMOTE
smote = SMOTE(random_state=42)

# Terapkan hanya pada data training (bukan test!)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# Cek hasil balancing
print(pd.Series(y_train_bal).value_counts(normalize=True) * 100)


target
1    50.0
0    50.0
Name: proportion, dtype: float64


In [8]:
# Cek distribusi target (balance check)
print(pd.Series(y_train_bal).value_counts(normalize=True) * 100)

target
1    50.0
0    50.0
Name: proportion, dtype: float64


In [11]:
# --- TRAINING MODEL ---

# Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train_bal, y_train_bal)
y_pred_rf = rf.predict(X_test_scaled)

# KNN
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_bal, y_train_bal)
y_pred_knn = knn.predict(X_test_scaled)

# --- EVALUASI ---
def evaluate_model(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    return {
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1,
        "Confusion_Matrix": cm
    }

results = [
    evaluate_model("Random Forest", y_test, y_pred_rf),
    evaluate_model("K-Nearest Neighbors", y_test, y_pred_knn)
]

print(results)


[{'Model': 'Random Forest', 'Accuracy': 0.8586956521739131, 'Precision': 0.8518518518518519, 'Recall': 0.9019607843137255, 'F1-Score': 0.8761904761904762, 'Confusion_Matrix': array([[66, 16],
       [10, 92]])}, {'Model': 'K-Nearest Neighbors', 'Accuracy': 0.8260869565217391, 'Precision': 0.8571428571428571, 'Recall': 0.8235294117647058, 'F1-Score': 0.84, 'Confusion_Matrix': array([[68, 14],
       [18, 84]])}]
