Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [2]:
# loading the csv data to a Pandas Dataframe
heart_data = pd.read_csv('/content/heart_cleveland_upload.csv')

In [3]:
# number of rows and columns in the dataset
heart_data.shape

(297, 14)

In [4]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [5]:
# print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0
296,35,1,3,126,282,0,2,156,1,0.0,0,0,2,1


In [7]:
# getting some more data from the dataset
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [8]:
# checking for missing values
heart_data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [9]:
# statistical measures of the dataset
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,54.542088,0.676768,2.158249,131.693603,247.350168,0.144781,0.996633,149.599327,0.326599,1.055556,0.602694,0.676768,0.835017,0.461279
std,9.049736,0.4685,0.964859,17.762806,51.997583,0.352474,0.994914,22.941562,0.469761,1.166123,0.618187,0.938965,0.95669,0.49934
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,2.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,2.0,130.0,243.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,0.0,0.0
75%,61.0,1.0,3.0,140.0,276.0,0.0,2.0,166.0,1.0,1.6,1.0,1.0,2.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,3.0,2.0,1.0


In [11]:
# checking the distribution of the target variable
heart_data['condition'].value_counts()

Unnamed: 0_level_0,count
condition,Unnamed: 1_level_1
0,160
1,137


1 --> Defective Heart
0 --> Healthy Heart

splitting the data and label


In [13]:
X = heart_data.drop(columns='condition', axis=1)
Y = heart_data['condition']

In [15]:
print(X)
print(Y)

     age  sex  cp  trestbps  chol  ...  exang  oldpeak  slope  ca  thal
0     69    1   0       160   234  ...      0      0.1      1   1     0
1     69    0   0       140   239  ...      0      1.8      0   2     0
2     66    0   0       150   226  ...      0      2.6      2   0     0
3     65    1   0       138   282  ...      0      1.4      1   1     0
4     64    1   0       110   211  ...      1      1.8      1   0     0
..   ...  ...  ..       ...   ...  ...    ...      ...    ...  ..   ...
292   40    1   3       152   223  ...      0      0.0      0   0     2
293   39    1   3       118   219  ...      0      1.2      1   0     2
294   35    1   3       120   198  ...      1      1.6      1   0     2
295   35    0   3       138   183  ...      0      1.4      0   0     0
296   35    1   3       126   282  ...      1      0.0      0   0     2

[297 rows x 13 columns]
0      0
1      0
2      0
3      1
4      0
      ..
292    1
293    1
294    1
295    0
296    1
Name: condit

Splitting the data into training and test data

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

In [35]:
print(X.shape, X_train.shape, X_test.shape)

(297, 13) (237, 13) (60, 13)


Model Training

Logistic Regression Model

In [36]:
model = LogisticRegression()

In [37]:
# training the LogisticRegression Model with training dataset
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation : Accuracy Score

In [38]:
# accuracy on training data
X_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_pred, Y_train)

In [39]:
print('Accuracy on Training Data: ', training_data_accuracy)

Accuracy on Training Data:  0.8565400843881856


In [40]:
# accuracy on test data
X_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_pred, Y_test)

In [41]:
print('Accuracy on Test Deata: ', test_data_accuracy)

Accuracy on Test Deata:  0.9


Building a Predictive System

In [45]:
input_data = (61,1,0,134,234,0,0,145,0,2.6,1,2,0)

# change the input_data to a numpy array
input_data_as_np_array = np.asarray(input_data)

# reshape the array as we are predicting only for one value
input_data_reshaped = input_data_as_np_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == 1):
  print('Person has defective Heart')
else:
  print('Person has Healthy Heart')


[1]
Person has defective Heart


