In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import joblib


In [14]:

# Load dataset
# The input location lives in one named constant so the notebook can be pointed
# at a different copy of the CSV by editing a single line.
DATA_PATH = '/content/heart.csv'
# pd.read_csv raises if the file is missing or unreadable, so the success
# message below is only printed when the load really worked.
df = pd.read_csv(DATA_PATH)
print("Dataset Loaded Successfully.")

Dataset Loaded Successfully.


In [15]:
# Count the missing entries in every column and report them
null_counts = df.isna().sum()
print("\nNull values in dataset:\n", null_counts)


Null values in dataset:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [16]:
# Label encoding for object columns
# Each object-dtype column is replaced in place by integer codes.
# A separate fitted encoder is kept per column in `label_encoders` so the
# code -> original label mapping (encoder.classes_ / inverse_transform) stays
# available later; refitting one shared encoder would keep only the mapping of
# the last column processed.
label_encoders = {}
# `le` is still bound up front so it exists even when there are no object
# columns; after the loop it refers to the last column's encoder.
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
print("\nLabel Encoding Completed.")


Label Encoding Completed.


In [17]:

# Feature selection: rank every column by the absolute value of its
# correlation with the target column and keep the five strongest.
# NOTE(review): the correlations are computed on the full df, before the
# train/test split performed later in the notebook, so held-out rows influence
# which features are chosen.
corr_matrix = df.corr()

# Use the 'target' column when present, otherwise fall back to the last column
if 'target' in df.columns:
    target = 'target'
else:
    target = df.columns[-1]

corr_target = corr_matrix[target].abs()
ranked_features = corr_target.drop(target).sort_values(ascending=False)
top_features = ranked_features.head(5).index.tolist()
print(f"\nTop 5 Selected Features: {top_features}")


Top 5 Selected Features: ['oldpeak', 'exang', 'cp', 'thalach', 'ca']


In [18]:
# Build the feature matrix and label vector, then hold out 20% of the rows
# for evaluation; the fixed random_state makes the split repeatable.
X, y = df[top_features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:


# Train models
# Fit each candidate classifier on the training split, score it on the
# held-out split, and remember the one with the highest accuracy.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded so the forest (and therefore the model ranking and the saved
    # model) is reproducible; 42 matches the seed used for train_test_split.
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# Initialise all three trackers so later cells never hit an undefined name.
best_model = None
best_model_name = None
best_score = 0

# NOTE(review): the winner is picked using the same test split whose accuracy
# is then reported, so the reported figure is an optimistic estimate; consider
# cross-validation or a separate validation split for model selection.
# NOTE(review): if one model scores far above the others, check
# df.duplicated().sum() — duplicate rows shared between train and test would
# inflate accuracy for models that can memorise samples. TODO confirm.
print("\nModel Evaluation:")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {score:.4f}")

    # `best_model is None` lets the first model be selected even when its
    # score is 0; the strict `>` keeps the earlier model on ties.
    if best_model is None or score > best_score:
        best_score = score
        best_model = model
        best_model_name = name



Model Evaluation:
Logistic Regression: Accuracy = 0.7854
Random Forest: Accuracy = 0.9805
K-Nearest Neighbors: Accuracy = 0.7854
Naive Bayes: Accuracy = 0.7659


In [20]:
# Persist the winning estimator to disk, then summarise what was saved
joblib.dump(value=best_model, filename='best_model.pkl')

summary_lines = (
    f"\nBest Model: {best_model_name} with Accuracy = {best_score:.4f}",
    "Model saved as 'best_model.pkl'.",
    f"\nFeatures used for training: {top_features}",
)
for summary_line in summary_lines:
    print(summary_line)


Best Model: Random Forest with Accuracy = 0.9805
Model saved as 'best_model.pkl'.

Features used for training: ['oldpeak', 'exang', 'cp', 'thalach', 'ca']


In [11]:
# Show the first 10 rows of df (rendered as the cell's output).
# NOTE(review): this cell carries execution count 11, lower than the cells
# above it (13-20) — re-run the notebook top to bottom to confirm it still
# works from a fresh kernel.
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [12]:
# Show the first 20 rows of the held-out features and of the matching labels,
# each preceded by a heading line.
test_previews = (
    ("X_test (features):", X_test),
    ("\ny_test (target variable):", y_test),
)
for heading, held_out in test_previews:
    print(heading)
    display(held_out.head(20))

X_test (features):


Unnamed: 0,oldpeak,exang,cp,thalach,ca
527,0.0,0,0,163,0
359,0.0,0,2,115,0
447,0.8,1,0,145,1
31,1.1,0,1,162,0
621,0.0,1,0,150,2
590,0.2,1,1,121,1
905,2.2,1,0,96,1
737,2.6,1,0,129,2
76,0.0,0,2,175,2
948,2.6,1,0,125,0



y_test (target variable):


Unnamed: 0,target
527,1
359,1
447,0
31,1
621,0
590,1
905,0
737,0
76,1
948,0
