In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
# Load the data
data = pd.read_csv('heart disease prediction.csv')


In [5]:
data.info

<bound method DataFrame.info of      Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0     40   M           ATA        140          289          0     Normal   
1     49   F           NAP        160          180          0     Normal   
2     37   M           ATA        130          283          0         ST   
3     48   F           ASY        138          214          0     Normal   
4     54   M           NAP        150          195          0     Normal   
..   ...  ..           ...        ...          ...        ...        ...   
913   45   M            TA        110          264          0     Normal   
914   68   M           ASY        144          193          1     Normal   
915   57   M           ASY        130          131          0     Normal   
916   57   F           ATA        130          236          0        LVH   
917   38   M           NAP        138          175          0     Normal   

     MaxHR ExerciseAngina  Oldpeak ST_Slope  HeartDisea

In [6]:
data.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [7]:
# Data Preprocessing
# Encoding categorical columns
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)


In [8]:
# Scaling numerical features
scaler = StandardScaler()
num_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
data[num_cols] = scaler.fit_transform(data[num_cols])


In [9]:
# Split the dataset into features (X) and target (y)
X = data.drop(columns=['HeartDisease'])
y = data['HeartDisease']

In [10]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Model Building
model = LogisticRegression()
model.fit(X_train, y_train)

In [12]:
# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [13]:
# Evaluation
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')
print(classification_report(y_test, y_test_pred))

Train Accuracy: 0.8705722070844687
Test Accuracy: 0.8532608695652174
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184



In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [15]:
# Define the model
model = LogisticRegression()

In [16]:
# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],         # Regularization penalty
    'C': [0.01, 0.1, 1, 10, 100],    # Inverse of regularization strength
    'solver': ['liblinear', 'saga']   # Solvers that support l1 and l2 penalties
}


In [17]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

In [18]:
# Fit GridSearchCV
grid_search.fit(X_train, y_train)



In [19]:
# Get the best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [20]:
# Predictions and Evaluation with the tuned model
y_test_pred = best_model.predict(X_test)


In [21]:
# Evaluate the model
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Best Parameters: {best_params}')
print(f'Test Accuracy: {test_accuracy}')
print(classification_report(y_test, y_test_pred))

Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Test Accuracy: 0.8532608695652174
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184



In [22]:
# Save the tuned model to disk
joblib.dump(best_model, 'heart_disease_tuned_model.pkl')


['heart_disease_tuned_model.pkl']