# Show Data

In [28]:
import pandas as pd

# Load the heart-disease CSV into a DataFrame.
data = pd.read_csv('heart_disease_uci.csv')

# Preview the first rows; print() keeps the output as plain text.
first_rows = data.head()
print(first_rows)


   id age     sex    dataset               cp  trestbps   chol    fbs  \
0   1  63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2  67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3  67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4  37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5  41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0  


# Preprocessing: Missing Values


In [29]:
# Count the missing entries in every column, then list each column's dtype.
missing_per_column = data.isnull().sum()
print("Missing values per column:\n", missing_per_column)

column_dtypes = data.dtypes
print(column_dtypes)

Missing values per column:
 id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64
id            int64
age          object
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object


In [30]:
import pandas as pd
import numpy as np

# Reload the CSV so this cell starts from unmodified data on every run.
data = pd.read_csv('heart_disease_uci.csv')

# 'age' is read as an object column; these are the textual spellings handled
# here, each mapped to the digits it stands for.
text_to_number = {
    "four -eight": "48",
    "four eight": "48",
}

# Replacing the text alone leaves 'age' as a column of strings. Convert it to a
# real numeric dtype so later numeric steps (scaling, statistics) do not depend
# on implicit string-to-float coercion. pd.to_numeric raises on any value that
# still is not a number, which surfaces unmapped spellings right here.
data['age'] = pd.to_numeric(data['age'].replace(text_to_number))

In [31]:
# Columns to impute, grouped by kind: numeric features first, then categorical.
numerical_cols = [
    'trestbps',
    'chol',
    'thalch',
    'oldpeak',
    'ca',
]
categorical_cols = [
    'fbs',
    'restecg',
    'slope',
    'thal',
    'exang',
]

In [32]:
from sklearn.impute import SimpleImputer

# Fill gaps per column group: numeric columns get the column mean, categorical
# columns get their most frequent value.
# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split done later in the notebook, so test rows influence the fill
# values. Consider fitting the imputers on the training portion only.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

for imputer, columns in ((num_imputer, numerical_cols), (cat_imputer, categorical_cols)):
    data[columns] = imputer.fit_transform(data[columns])

# Re-count missing entries per column to confirm the imputation.
remaining_missing = data.isnull().sum()
print(remaining_missing)

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


# Encode + Standardize + Split Data


In [33]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Feature groups: numeric columns are standardized, categorical ones one-hot encoded.
numerical_columns = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'dataset']

# Target is 'num'; 'id' is dropped as a non-feature, 'num' because it is the target.
y = data['num']
X = data.drop(columns=['id', 'num'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y` so
# both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column-wise preprocessing. handle_unknown='ignore' lets the encoder accept
# categories at transform time that were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ]
)

# Fit on the training part only, then apply the fitted transform to the test part.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Report the shape of each transformed matrix.
print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)


Training data shape: (736, 29)
Test data shape: (184, 29)


# Handle Class Imbalance

In [34]:

from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (Synthetic Minority Oversampling
# Technique); the test split is not touched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

import numpy as np

# Tally samples per class label in the resampled training set.
unique, counts = np.unique(y_train_balanced, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution after SMOTE:", class_distribution)


Class distribution after SMOTE: {np.int64(0): np.int64(336), np.int64(1): np.int64(336), np.int64(2): np.int64(336)}




ANN using scikit-learn's MLPClassifier: https://scikit-learn.org/1.5/api/sklearn.neural_network.html

In [36]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with three hidden layers (128, 64, 32 units), ReLU
# activations and the Adam solver. With early_stopping=True scikit-learn sets
# aside part of the training data as a validation set and stops training when
# the validation score stops improving.
ann_settings = dict(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=500,
    random_state=42,
    early_stopping=True,
)
ann = MLPClassifier(**ann_settings)


In [37]:
# Train the network on the SMOTE-balanced training set.
ann.fit(X=X_train_balanced, y=y_train_balanced)


In [38]:
from sklearn.metrics import classification_report
# Predict labels for the test split and print per-class precision/recall/F1.
y_pred = ann.predict(X_test)
ann_report = classification_report(y_test, y_pred)
print(ann_report)


              precision    recall  f1-score   support

           0       0.80      0.80      0.80        75
           1       0.57      0.52      0.54        54
           2       0.70      0.76      0.73        55

    accuracy                           0.71       184
   macro avg       0.69      0.69      0.69       184
weighted avg       0.70      0.71      0.70       184



# SVM

In [39]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Build an RBF-kernel support vector classifier and train it on the
# SMOTE-balanced training set in one step (fit returns the estimator itself).
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train_balanced, y_train_balanced)

# Echo the fitted estimator as the cell's output, as the bare `.fit(...)` call did.
svm_model


In [40]:
# Predict test labels with the SVM. This rebinds `y_pred`, so the ANN
# predictions stored under the same name are no longer available afterwards.
y_pred = svm_model.predict(X=X_test)


In [41]:
# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("SVM Accuracy:", accuracy)


SVM Accuracy: 0.7228260869565217


In [42]:
# Per-class precision, recall and F1 for the current `y_pred` (the SVM output).
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83        75
           1       0.60      0.48      0.54        54
           2       0.71      0.76      0.74        55

    accuracy                           0.72       184
   macro avg       0.70      0.70      0.70       184
weighted avg       0.71      0.72      0.72       184

