<a href="https://colab.research.google.com/github/SasinduShanaka/heart_disease_predict_model/blob/main/Heart_Disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd

# Raw dataset file, resolved relative to the notebook's working directory.
DATA_PATH = "heart_disease.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check column names and value formats.
df.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140.0,289.0,0.0,Normal,172.0,N,0.0,Up,0
1,49,F,NAP,160.0,180.0,0.0,Normal,156.0,N,1.0,Flat,1
2,37,M,ATA,130.0,283.0,0.0,ST,98.0,N,0.0,Up,0
3,48,F,ASY,138.0,214.0,0.0,Normal,108.0,Y,1.5,Flat,1
4,54,M,NAP,150.0,195.0,0.0,Normal,122.0,N,0.0,Up,0


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(922, 12)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# The non-null counts differ between columns, which is the first hint of missing values.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,922.0,920.0,918.0,921.0,919.0,920.0,922.0
mean,54.244035,134.192391,202.845316,0.233442,136.769314,0.888587,0.555315
std,21.265503,48.054115,150.0378,0.423251,25.467932,1.067112,0.497201
min,3.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,175.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,155.5,1.5,1.0
max,622.0,1300.0,3331.0,1.0,202.0,6.2,1.0


In [6]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,4
RestingBP,2
Cholesterol,4
FastingBS,1
RestingECG,0
MaxHR,3
ExerciseAngina,0
Oldpeak,2


In [8]:
from sklearn.impute import SimpleImputer

# Group the column labels by dtype so each group gets an appropriate fill rule.
# Note: num_cols covers every int64/float64 column of df, and is reused by later cells.
cat_cols = df.select_dtypes(include=["object"]).columns
num_cols = df.select_dtypes(include=["int64","float64"]).columns

# Numeric gaps are filled with the column median, categorical gaps with the
# most frequent label. Both imputers are fitted on the whole frame.
median_imputer = SimpleImputer(strategy="median")
mode_imputer = SimpleImputer(strategy="most_frequent")

df[num_cols] = median_imputer.fit_transform(df[num_cols])
df[cat_cols] = mode_imputer.fit_transform(df[cat_cols])




In [12]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(0)

In [11]:
# Remove repeated rows, modifying df itself (the first occurrence of each row is kept).
df.drop_duplicates(inplace=True)

In [None]:
import numpy as np

# Numeric columns that must not be winsorised: the class label is not a
# measurement, so "outlier" has no meaning for it.
skip_cols = {"HeartDisease"}

# Cap extreme values at the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# The fences are computed from the full frame, i.e. before the train/test
# split that happens later in the notebook.
for col in num_cols:
    if col in skip_cols:
        continue

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    if IQR == 0:
        # Both quartiles coincide (e.g. the 0/1 flag FastingBS, whose 25% and
        # 75% quantiles are both 0). The two fences would collapse onto a
        # single value and clipping would overwrite every row with it,
        # erasing the feature. Leave such columns untouched.
        continue

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower, upper)


In [13]:
# Separate the label from the predictors: y is the HeartDisease column,
# X is every remaining column.
y = df["HeartDisease"]
X = df.drop(columns=["HeartDisease"])


In [14]:
# Feature names grouped by dtype; these decide which preprocessing branch
# each column is routed through.
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns


In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric features are standardised; categorical features are one-hot encoded
# with the first level of each dropped to avoid a redundant indicator column.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fed by the shared preprocessing step.
lr_steps = [
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression()),
]
lr_model = Pipeline(steps=lr_steps)

# Fit on the training split; as the last expression this also displays the pipeline.
lr_model.fit(X_train, y_train)


In [18]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree (seeded for repeatability) behind the shared preprocessing step.
dt_steps = [
    ("preprocessing", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42)),
]
dt_model = Pipeline(steps=dt_steps)

# Fit on the training split; as the last expression this also displays the pipeline.
dt_model.fit(X_train, y_train)


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (seeded for repeatability) behind the shared preprocessing step.
rf_steps = [
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
]
rf_model = Pipeline(steps=rf_steps)

# Fit on the training split; as the last expression this also displays the pipeline.
rf_model.fit(X_train, y_train)


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [21]:
# Fitted pipelines to compare, keyed by the label printed in the report.
models = {
    "Logistic Regression": lr_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model
}

# (printed label, scoring function) pairs, in the order they are reported.
metric_table = [
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
]

# Score every model on the held-out test split.
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(name)
    for label, score_fn in metric_table:
        print(label, score_fn(y_test, y_pred))
    print("-"*30)


Logistic Regression
Accuracy : 0.8858695652173914
Precision: 0.8857142857142857
Recall   : 0.9117647058823529
F1-score : 0.8985507246376812
------------------------------
Decision Tree
Accuracy : 0.782608695652174
Precision: 0.7924528301886793
Recall   : 0.8235294117647058
F1-score : 0.8076923076923077
------------------------------
Random Forest
Accuracy : 0.8695652173913043
Precision: 0.8545454545454545
Recall   : 0.9215686274509803
F1-score : 0.8867924528301887
------------------------------
