# import  useful libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
df=pd.read_csv('Heart_Disease_Prediction.csv')

# handling missing values

In [3]:
X=df.where(pd.notnull(df),'')

In [4]:
X.head

<bound method NDFrame.head of      Age  Sex  Chest pain type   BP  Cholesterol  FBS over 120  EKG results  \
0     70    1                4  130          322             0            2   
1     67    0                3  115          564             0            2   
2     57    1                2  124          261             0            0   
3     64    1                4  128          263             0            0   
4     74    0                2  120          269             0            2   
..   ...  ...              ...  ...          ...           ...          ...   
265   52    1                3  172          199             1            0   
266   44    1                2  120          263             0            0   
267   56    0                2  140          294             0            2   
268   57    1                4  140          192             0            0   
269   67    1                4  160          286             0            2   

     Max HR  Exercise

# categorize the prediction results

In [5]:
df.loc[df['Heart Disease']=='Presence','Heart Disease',] = 1
df.loc[df['Heart Disease']=='Absence','Heart Disease',] = 0

In [6]:
X=df.drop('Heart Disease',axis=1)

In [7]:
Y=df['Heart Disease']

# create logistic regression model

In [8]:
model = LogisticRegression(solver='liblinear', max_iter=1000)

In [9]:
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

#  extract useful features

In [10]:
from sklearn.feature_selection import RFE

# Create the RFE model and select the number of features to retain
num_features_to_retain = 11  # Adjust this based on your preference
rfe = RFE(estimator=model, n_features_to_select=num_features_to_retain)

# Fit the RFE model to your data
X_rfe = rfe.fit_transform(X, Y)

# Get the selected features
selected_features = X.columns[rfe.support_]

# Print the selected features
print("Selected Features:", selected_features)


Selected Features: Index(['Age', 'Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
       'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST',
       'Number of vessels fluro', 'Thallium'],
      dtype='object')


In [11]:
X_selected=X[selected_features]

In [12]:
X_train,X_test,Y_train,Y_test=train_test_split(X_selected,Y,random_state=3,test_size=0.3)



# scaling the training data

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(189, 11)
(81, 11)
(189,)
(81,)


# performing cross validation for better performance

In [15]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Assuming X_selected is your feature matrix and Y is your target variable
# Update the parameter grid based on your requirements
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization type
}

# Define the scoring metrics you are interested in
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='binary'),
    'recall': make_scorer(recall_score, average='binary'),
    'f1': make_scorer(f1_score, average='binary'),
}


# Create the cross-validation strategy (StratifiedKFold for classification)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, scoring=scoring, cv=cv, refit='f1', verbose=1)

# Fit the model with cross-validated grid search
grid_search.fit(X_selected, Y)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=LogisticRegression(max_iter=1000, solver='liblinear'),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'penalty': ['l1', 'l2']},
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=binary),
                      'precision': make_scorer(precision_score, average=binary),
                      'recall': make_scorer(recall_score, average=binary)},
             verbose=1)

In [16]:
# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 1, 'penalty': 'l1'}


In [17]:
# Get the best model
best_model = grid_search.best_estimator_
print(best_model)

LogisticRegression(C=1, max_iter=1000, penalty='l1', solver='liblinear')


In [18]:
# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)

# calculate the accuracy score

In [19]:
accuracy =accuracy_score(Y_test,y_pred)
confusion=confusion_matrix(Y_test,y_pred)
report=classification_report(Y_test,y_pred)

In [20]:
print(f'Accuracy:{accuracy}')
print(f'Confusion Matrix:{confusion}')
print(f'Classification Report:{report}')

Accuracy:0.8518518518518519
Confusion Matrix:[[45  6]
 [ 6 24]]
Classification Report:              precision    recall  f1-score   support

           0       0.88      0.88      0.88        51
           1       0.80      0.80      0.80        30

    accuracy                           0.85        81
   macro avg       0.84      0.84      0.84        81
weighted avg       0.85      0.85      0.85        81



# prediction on random example

In [21]:
X_new={'Age':50, 'Sex':1.0, 'Chest pain type':5.0, 'FBS over 120':0.0, 'EKG results':1.0,
       'Max HR':140.5, 'Exercise angina':2.0, 'ST depression':3.2, 'Slope of ST':0.0,
       'Number of vessels fluro':1.0, 'Thallium':5.3}

In [22]:
X_new = pd.DataFrame([X_new])
y_pred=best_model.predict(X_new)

In [23]:
print(y_pred)
if y_pred==1:
    print("Heart Disease")
else:
    print("No Heart Disease")
        

[1]
Heart Disease
