In [None]:
import numpy as np
import pandas as pd
import plotly.express as px 
import plotly.io as pio
pio.renderers.default = 'notebook'
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
# Read the heart-disease records from the CSV file in the working directory
csv_path = 'heart.csv'
heart_df = pd.read_csv(csv_path)

In [None]:
# Preview the first five records
heart_df.head(n=5)

In [None]:
# Dataset summary: prints each column's name, non-null count and dtype, plus memory usage
heart_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
heart_df.describe()

In [None]:
# Descriptive statistics for every column; include='all' also covers the non-numeric ones
heart_df.describe(include='all')

Data preprocessing

In [None]:
# Count the missing entries in each column
heart_df.isna().sum()

In [None]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted)
heart_df.duplicated(keep='first').sum()

In [None]:
# Number of distinct values in each column
heart_df.nunique()

In [None]:
# Show the column labels of the dataset
heart_df.columns

In [None]:
# Collect and display the names of the non-numeric (categorical) columns that
# the encoding step below iterates over.
# Excluding numeric dtypes, rather than including only 'object', also catches
# text columns stored with pandas' dedicated string dtype.
cat_col = heart_df.select_dtypes(exclude='number').columns
cat_col


Converting categorical variables to numeric
1. Sex: M=0, F=1
2. ChestPainType: ATA=0, NAP=1, ASY=2, TA=3
3. RestingECG: Normal=0, ST=1, LVH=2
4. ExerciseAngina: N=0, Y=1
5. ST_Slope: Up=0, Flat=1, Down=2


In [None]:
# Convert categorical variables to numeric codes.
# Each distinct value of a column gets an integer in order of first appearance
# (0 for the first value seen, 1 for the next, ...); the value/code pairs are
# printed so the mapping can be checked.
for col in cat_col:
    categories = heart_df[col].unique()
    codes = list(range(len(categories)))
    print(col)
    print(categories, codes)
    # Assign the encoded column back to the frame. Calling
    # replace(..., inplace=True) on heart_df[col] mutates a temporary Series,
    # which is deprecated and leaves heart_df untouched under Copy-on-Write.
    heart_df[col] = heart_df[col].map(dict(zip(categories, codes)))
    print('*'*90)
    print()

In [None]:
# Display the frame after the encoding loop to inspect the converted categorical columns
heart_df

In [None]:
# Frequency of each Cholesterol value; used to see how many rows carry the placeholder value 0
heart_df['Cholesterol'].value_counts()

Imputing the 0 values in the Cholesterol column with KNNImputer

In [None]:
# Scratch cell: shows NumPy's NaN value, which the next cell uses as the missing-value marker
np.nan

In [None]:
# Mark Cholesterol readings of 0 as missing so the imputer can fill them.
# The result is assigned back to the column: replace(..., inplace=True) on
# heart_df['Cholesterol'] acts on a temporary Series, which is deprecated and
# does not modify heart_df under Copy-on-Write.
heart_df['Cholesterol'] = heart_df['Cholesterol'].replace(0, np.nan)

In [None]:
from sklearn.impute import KNNImputer

# Impute from the predictor columns only. Passing the whole frame would let the
# HeartDisease label take part in the neighbour search, leaking the target into
# the imputed feature values.
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split made later in this notebook, so held-out rows influence the fill values.
target_col = 'HeartDisease'
feature_cols = heart_df.columns.drop(target_col)

imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(heart_df[feature_cols])

# Rebuild the frame with the original index and column order; the label column
# is carried over untouched.
imputed_features = pd.DataFrame(after_impute, columns=feature_cols, index=heart_df.index)
heart_df = imputed_features.assign(**{target_col: heart_df[target_col]})[heart_df.columns]

In [None]:
# Number of missing Cholesterol values remaining after imputation
heart_df['Cholesterol'].isna().sum()

In [None]:
# Count the Cholesterol entries that are still exactly 0 after imputation.
# The vectorised comparison always yields a number (0 when none remain); the
# previous loop printed only inside the `if`, so it showed nothing in that case.
count = (heart_df['Cholesterol'] == 0).sum()
count

Doing the same for Resting Blood Pressure

In [None]:
# RestingBP entries recorded as 0, selected with a single .loc lookup
heart_df.loc[heart_df['RestingBP'] == 0, 'RestingBP']

In [None]:
# Imputing missing values in 'RestingBP' using KNNImputer
from sklearn.impute import KNNImputer

# Work on the predictor columns only, so the HeartDisease label cannot influence
# which neighbours supply the fill value (target leakage).
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split made later in this notebook.
target_col = 'HeartDisease'
feature_cols = heart_df.columns.drop(target_col)

# A RestingBP of 0 is treated as a missing reading
bp_features = heart_df[feature_cols].copy()
bp_features['RestingBP'] = bp_features['RestingBP'].replace(0, np.nan)

imputer = KNNImputer(n_neighbors=3)
bp_imputed = pd.DataFrame(
    imputer.fit_transform(bp_features),
    columns=feature_cols,
    index=heart_df.index,
)

# Re-attach the untouched label and restore the original column order
heart_df = bp_imputed.assign(**{target_col: heart_df[target_col]})[heart_df.columns]

In [None]:
# Distinct RestingBP values after imputation
heart_df['RestingBP'].unique()


In [None]:
# Number of missing RestingBP values remaining after imputation
heart_df['RestingBP'].isna().sum()

change column type to int

In [None]:
#change column type to int
# Every column except Oldpeak is cast to int32.
# Values are rounded first: KNN imputation averages neighbours and can produce
# fractional values, which astype('int32') alone would truncate toward zero.
withoutOldPeak = heart_df.columns.drop('Oldpeak')
heart_df[withoutOldPeak] = heart_df[withoutOldPeak].round().astype('int32')

In [None]:
# Re-check the column dtypes after the integer cast
heart_df.info()

Data visualization

In [None]:
# Correlation of each feature with HeartDisease, sorted ascending.
# The target's own entry is removed by label; the positional [:-1] slice was
# only correct while HeartDisease happened to be the last column.
heart_df.corr()['HeartDisease'].drop('HeartDisease').sort_values()

In [None]:
# Line plot of each feature's correlation with HeartDisease, sorted ascending.
# The target's own entry is removed by label; the positional [:-1] slice was
# only correct while HeartDisease happened to be the last column.
px.line(heart_df.corr()['HeartDisease'].drop('HeartDisease').sort_values())

Age and heart disease distribution

In [None]:
# Sunburst with HeartDisease as the first level and Age as the second
px.sunburst(
    data_frame=heart_df,
    path=['HeartDisease', 'Age'],
)

In [None]:
# Age histogram, coloured by the HeartDisease label
px.histogram(
    data_frame=heart_df,
    x='Age',
    color='HeartDisease',
)

In [None]:
# Share of each HeartDisease class in the dataset
px.pie(
    data_frame=heart_df,
    names='HeartDisease',
    title='Heart Disease Distribution',
)

Sex vs HeartDisease

In [None]:
# Histogram of the encoded Sex column, coloured by the HeartDisease label
px.histogram(
    data_frame=heart_df,
    x='Sex',
    color='HeartDisease',
)

ChestPainType vs HeartDisease

In [None]:
# Histogram of the encoded ChestPainType column, coloured by the HeartDisease label
# chestpaintype: ATA=0, NAP=1, ASY=2, TA=3
px.histogram(
    data_frame=heart_df,
    x='ChestPainType',
    color='HeartDisease',
)

RestingBP vs HeartDisease

In [None]:
# Sunburst with HeartDisease as the first level and RestingBP as the second
px.sunburst(
    data_frame=heart_df,
    path=['HeartDisease', 'RestingBP'],
)

FastingBS vs HeartDisease

In [None]:
# FastingBS histogram, coloured by the HeartDisease label
px.histogram(
    data_frame=heart_df,
    x='FastingBS',
    color='HeartDisease',
)

MaxHR vs HeartDisease

In [None]:
# Sunburst with HeartDisease as the first level and MaxHR as the second
px.sunburst(
    data_frame=heart_df,
    path=['HeartDisease', 'MaxHR'],
)

Oldpeak vs HeartDisease

In [None]:
# Violin plot of Oldpeak, one violin per HeartDisease class
px.violin(
    data_frame=heart_df,
    y='Oldpeak',
    color='HeartDisease',
)

ST Slope vs HeartDisease

In [None]:
# Histogram of the encoded ST_Slope column, coloured by the HeartDisease label
px.histogram(heart_df, x='ST_Slope', color='HeartDisease')
# ST_Slope codes are assigned in order of first appearance by the encoding loop.
# NOTE(review): the mapping listed in the preprocessing section is Up=0, Flat=1, Down=2;
# the earlier comment here had it reversed. Confirm against the encoding cell's printed output.

ExerciseAngina vs HeartDisease

In [None]:
# Histogram of the encoded ExerciseAngina column, coloured by the HeartDisease label
px.histogram(
    data_frame=heart_df,
    x='ExerciseAngina',
    color='HeartDisease',
)

Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the label, then hold out 20% of the rows for
# testing. Stratifying on the label keeps the class proportions equal in both
# partitions; the fixed seed makes the split repeatable.
predictors = heart_df.drop('HeartDisease', axis=1)
labels = heart_df['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.2, random_state=42, stratify=labels
)


Logistic Regression

In [None]:
  # Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scaling the data (important for some solvers). The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Candidate solvers and the iteration budget given to each
solvers = {
    'lbfgs': {'max_iter': 1000},
    'liblinear': {'max_iter': 1000},
    'sag': {'max_iter': 10000},
    'saga': {'max_iter': 10000},
    'newton-cg': {'max_iter': 1000},
    'newton-cholesky': {'max_iter': 1000}
}

# Seed for the solvers that shuffle the data, so repeated runs agree
LR_RANDOM_STATE = 42

# Choose the solver by 5-fold cross-validated accuracy on the TRAINING split.
# Ranking candidates by test accuracy would tune on the test set and make the
# reported test score optimistic. Scaling lives inside the pipeline so every
# fold is standardised from its own training portion.
best_score = 0
best_solver = None

for solver, params in solvers.items():
    try:
        candidate = make_pipeline(
            StandardScaler(),
            LogisticRegression(solver=solver, random_state=LR_RANDOM_STATE, **params),
        )
        score = cross_val_score(candidate, x_train, y_train, cv=5, scoring='accuracy').mean()

        print(f"Solver: {solver:<15} CV accuracy: {score:.4f}")

        if score > best_score:
            best_score = score
            best_solver = solver
    except Exception as e:
        # Best effort: report the solver that could not be evaluated and move on
        print(f"Solver {solver} failed: {str(e)}")

# Final model with best solver: fit on the full training split and evaluate
# once on the held-out test split.
if best_solver:
    lr = LogisticRegression(solver=best_solver,
                            random_state=LR_RANDOM_STATE,
                            **solvers[best_solver])
    lr.fit(x_train_scaled, y_train)
    y_pred = lr.predict(x_test_scaled)
    print(f'\nBest solver: {best_solver}')
    print(f'Final accuracy: {accuracy_score(y_test, y_pred):.4f}')
else:
    print("No suitable solver found")

In [None]:
# Save the fitted logistic-regression model to disk
import pickle

# The context manager closes (and flushes) the file even if dump() raises;
# a bare open() left the handle open for the rest of the session.
with open('logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(lr, file)
# NOTE(review): lr was fitted on StandardScaler-transformed features and the
# scaler is not saved here, so whoever loads this file must apply the same scaling.

Performance metrics for logistic regression


In [None]:
# Performance metrics for Logistic Regression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Scalar metrics, reported in this order against the held-out test predictions
lr_scorers = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)

print('Logistic Regression Performance:')
for metric_name, scorer in lr_scorers:
    print(metric_name, scorer(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Support Vector Machine(SVM)

In [None]:
#support vector machine
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Cross-validated weighted F1 per kernel, filled in by the loop below
kernels = {'linear':0, 'poly':0, 'rbf':0, 'sigmoid':0}

# Compare kernels with 5-fold cross-validation on the TRAINING split; picking
# the kernel by its test-set score would tune on the test data.
# Each candidate standardises the features first, because SVM kernels are
# sensitive to feature scale; the scaler is part of the pipeline, so the fitted
# model still accepts unscaled inputs.
for kernel in kernels:
    candidate = make_pipeline(StandardScaler(), SVC(kernel=kernel))
    kernels[kernel] = cross_val_score(
        candidate, x_train, y_train, cv=5, scoring='f1_weighted'
    ).mean()

best = max(kernels, key=kernels.get)

# Refit the winning kernel on the full training split and score it once on the test split
svm = make_pipeline(StandardScaler(), SVC(kernel=best))
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)
print(f'svm f1_score kernel=({best}) : {f1_score(y_test, svm_pred, average="weighted"):.4f}')


In [None]:
# Save the fitted SVM model to disk
import pickle

# The context manager closes (and flushes) the file even if dump() raises;
# a bare open() left the handle open for the rest of the session.
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(svm, file)

Performance metrics for SVM

In [None]:
# Performance metrics for SVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Scalar metrics, reported in this order against the held-out test predictions
svm_scorers = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)

print('SVM Performance:')
for metric_name, scorer in svm_scorers:
    print(metric_name, scorer(y_test, svm_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, svm_pred))

Decision Tree Classifier

In [None]:
#decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# The seed is fixed on the estimator rather than searched over: random_state
# is not a model hyperparameter, and tuning it only selects a lucky seed.
dtree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
}
grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(x_train, y_train)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the whole training split, so no separate refit is needed.
ctree = grid_search.best_estimator_
dtc_pred = ctree.predict(x_test)
print("Decision Tree's accuracy", accuracy_score(y_test, dtc_pred))

In [None]:
# Save the fitted decision-tree model to disk
import pickle

# The context manager closes (and flushes) the file even if dump() raises;
# a bare open() left the handle open for the rest of the session.
with open('decision_tree_model.pkl', 'wb') as file:
    pickle.dump(ctree, file)

Performance metrics for Decision tree

In [None]:
# Performance metrics for Decision Tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Scalar metrics, reported in this order against the held-out test predictions
dtc_scorers = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)

print('Decision Tree Performance:')
for metric_name, scorer in dtc_scorers:
    print(metric_name, scorer(y_test, dtc_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, dtc_pred))

Random Forest Classifier

In [None]:
#random forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# A fixed seed makes the forests, and therefore the grid-search ranking and the
# final model, reproducible from run to run.
rfc = RandomForestClassifier(random_state=42)

# Search space; max_features is limited to the two string options 'sqrt' and 'log2'
param_grid = {
    'n_estimators': [50, 100, 150, 500],
    'max_depth': [3, 6, 9, 19],
    'max_features': ['sqrt', 'log2'],
    'max_leaf_nodes': [3, 6, 9],
}

grid_search = GridSearchCV(rfc, param_grid, cv=5)
grid_search.fit(x_train, y_train)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the whole training split. Building a new unseeded forest
# from best_params_ would yield a different model from the one that was selected.
rfctree = grid_search.best_estimator_
rfc_pred = rfctree.predict(x_test)

print(f"Random Forest's accuracy: {accuracy_score(y_test, rfc_pred)}")

In [None]:
# Save the fitted random-forest model to disk
import pickle

# The context manager closes (and flushes) the file even if dump() raises;
# a bare open() left the handle open for the rest of the session.
with open('random_forest.pkl', 'wb') as file:
    pickle.dump(rfctree, file)

Performance metrics for Random Forest

In [None]:
# Performance metrics for Random Forest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Scalar metrics, reported in this order against the held-out test predictions
rfc_scorers = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)

print('Random Forest Performance:')
for metric_name, scorer in rfc_scorers:
    print(metric_name, scorer(y_test, rfc_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, rfc_pred))

In [None]:
# Compare the logistic-regression accuracy on the training and test splits;
# a large gap between the two would point to overfitting.
lr_splits = (
    ("Training accuracy:", x_train_scaled, y_train),
    ("Test accuracy:", x_test_scaled, y_test),
)
for split_label, split_x, split_y in lr_splits:
    print(split_label, lr.score(split_x, split_y))