In [1]:
#Step 1: Import necessary libraries
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [33]:

# Step 2: Load and view the heart disease dataset
# The CSV location is machine-specific. Set the HEART_CSV environment variable
# to point at your own copy of heart.csv; when it is unset, the original
# author's path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "HEART_CSV",
    r"C:\Users\HP 1040 G7 X360\OneDrive\Desktop\heart.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [41]:
# Step 3: Separate the label (y) from the features (X)
X = data.drop('target', axis=1)  # every column except the label
y = data['target']               # the column the model learns to predict
# Preview the first rows of each to confirm the split looks right.
print(X.head(), y.head(), sep="\n")



   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  
0   0     1  
1   0     2  
2   0     2  
3   0     2  
4   0     2  
0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64


In [43]:
# Step 4: Hold out 20% of the rows as the "test" set; the rest form the "train" set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the same split is produced on every run
)

In [45]:
# Step 5: Standardise the features
# The scaler is fitted on the training rows only and that same fitted transform
# is then applied to both splits, so the test rows never influence the scaling.
scaler = StandardScaler()
scale = scaler.fit(X_train)  # fit() returns the scaler itself; `scale` is an alias
# Both right-hand sides are evaluated before either name is rebound.
X_train, X_test = scale.transform(X_train), scale.transform(X_test)

In [47]:
# Step 6: Fit a logistic regression classifier on the "train" split
# fit() returns the estimator, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)
model  # display the fitted estimator as the cell output

In [49]:
# Step 7: Predict the class label for each patient in the "test" split
# predict() returns hard class labels (0 or 1 here), not probabilities;
# model.predict_proba(X_test) is the call to use when probabilities are needed.
pred = model.predict(X_test)
pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [57]:
# Build the confusion matrix for the test-set predictions.
# scikit-learn puts the true classes on the rows and the predicted classes on
# the columns, so entry [1, 0] counts patients whose true label is 1 but who
# were predicted 0.
confusion_matrix(y_true=y_test, y_pred=pred)

array([[25,  4],
       [ 5, 27]], dtype=int64)

In [51]:
# Step 8: Measure the overall accuracy of the model on the "test" split
# Accuracy is the fraction of test rows whose predicted label equals the true
# label. It does not distinguish false positives from false negatives, so read
# it alongside the confusion matrix rather than on its own.
score = accuracy_score(y_test, pred)
score

0.8524590163934426

In [53]:
# Step 9: Per-class metrics derived from the confusion matrix
# classification_report() returns a formatted text table (precision, recall,
# f1-score and support for each class), so it is printed rather than displayed.
matrix = classification_report(y_true=y_test, y_pred=pred)
print(matrix)

              precision    recall  f1-score   support

           0       0.83      0.86      0.85        29
           1       0.87      0.84      0.86        32

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



In [None]:
Let’s make sense of the model
When building logistic regression classifiers, it is important to think long and hard about which confusion matrix metric is best for determining the reliability of the model. For example, when classifying cancer, it is not good for a model to produce a large number of false negatives. The reason is that you would not want cancer patients to be told they have no cancer and so miss out on cancer treatment. Similar logic applies to the heart disease problem above. The model produces five false negatives out of a possible thirty-two, so I ask myself whether it is OK that five out of thirty-two people with heart disease (about 16%) are diagnosed negative when they are actually positive. My answer surely has to be no. Therefore, more work is needed to produce a model with fewer false negatives.