In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Data Collection, Formatting and Analysis

In [2]:
# Read and save data
data = pd.read_csv("heart.csv")
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
# Remove duplicate values
data.drop_duplicates(inplace=True)

In [4]:
# Display counts of all target values (0- disease present, 1- disease not present)
data["target"].value_counts()

target
1    164
0    138
Name: count, dtype: int64

In [5]:
data.shape

(302, 14)

In [6]:
# Display positive and negative correlations
data.corr()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,1.0,-0.094962,-0.063107,0.283121,0.207216,0.119492,-0.11159,-0.395235,0.093216,0.20604,-0.164124,0.302261,0.065317,-0.221476
sex,-0.094962,1.0,-0.05174,-0.057647,-0.195571,0.046022,-0.060351,-0.046439,0.14346,0.098322,-0.03299,0.11306,0.211452,-0.283609
cp,-0.063107,-0.05174,1.0,0.046486,-0.072682,0.096018,0.041561,0.293367,-0.392937,-0.146692,0.116854,-0.195356,-0.16037,0.43208
trestbps,0.283121,-0.057647,0.046486,1.0,0.125256,0.178125,-0.115367,-0.048023,0.068526,0.1946,-0.122873,0.099248,0.06287,-0.146269
chol,0.207216,-0.195571,-0.072682,0.125256,1.0,0.011428,-0.147602,-0.005308,0.064099,0.050086,0.000417,0.086878,0.09681,-0.081437
fbs,0.119492,0.046022,0.096018,0.178125,0.011428,1.0,-0.083081,-0.007169,0.024729,0.004514,-0.058654,0.144935,-0.032752,-0.026826
restecg,-0.11159,-0.060351,0.041561,-0.115367,-0.147602,-0.083081,1.0,0.04121,-0.068807,-0.056251,0.090402,-0.083112,-0.010473,0.134874
thalach,-0.395235,-0.046439,0.293367,-0.048023,-0.005308,-0.007169,0.04121,1.0,-0.377411,-0.342201,0.384754,-0.228311,-0.09491,0.419955
exang,0.093216,0.14346,-0.392937,0.068526,0.064099,0.024729,-0.068807,-0.377411,1.0,0.286766,-0.256106,0.125377,0.205826,-0.435601
oldpeak,0.20604,0.098322,-0.146692,0.1946,0.050086,0.004514,-0.056251,-0.342201,0.286766,1.0,-0.576314,0.23656,0.20909,-0.429146


In [7]:
# Display statistics for data
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## Splitting Data into Training and Testing Sets

In [8]:
# Set independent and dependent variables
x = data.drop("target", axis=1)
y = data["target"]

In [9]:
x.shape

(302, 13)

In [10]:
# Split Data into training and testing sets using sklearn.model_selection
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=40)

In [11]:
x_train.shape

(241, 13)

In [12]:
x_test.shape

(61, 13)

## Feature Scaling

In [13]:
sc = StandardScaler()

In [14]:
sc.fit(x_train)

In [15]:
# Set x_train and x_test to have a standardized distribution
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [16]:
x_train

array([[-0.76158321,  0.70490738,  0.96436106, ...,  0.95315324,
        -0.67698227, -0.51037721],
       [-0.76158321,  0.70490738, -0.9723974 , ...,  0.95315324,
        -0.67698227, -0.51037721],
       [-1.73786806,  0.70490738,  0.96436106, ...,  0.95315324,
         3.40183588, -0.51037721],
       ...,
       [-0.8700593 ,  0.70490738, -0.9723974 , ..., -0.71141148,
         1.36242681,  1.17456673],
       [ 0.10622555,  0.70490738, -0.9723974 , ..., -0.71141148,
         0.34272227,  1.17456673],
       [ 0.97403431, -1.4186261 , -0.9723974 , ..., -0.71141148,
         1.36242681, -0.51037721]])

## Creating a Training Model

In [17]:
lr = LogisticRegression()

In [18]:
lr.fit(x_train, y_train)

In [19]:
# Create predictions using logistic regression from sklearn
y_predictions = lr.predict(x_test)

In [20]:
y_predictions


array([1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

In [21]:
# The percent accuracy of the training model
accuracy_score(y_test, y_predictions)

0.9016393442622951

## Generating Results

In [22]:
x_train[1]

array([-0.76158321,  0.70490738, -0.9723974 , -1.08803518, -0.77822762,
       -0.42587856,  0.85167763, -0.2877988 , -0.6917569 , -0.79207469,
        0.95315324, -0.67698227, -0.51037721])

In [23]:
input_values = (52, 1, 0, 125, 212, 0, 1, 168, 0, 1, 2, 2, 3)

df = np.asarray(input_values)
pred = lr.predict(df.reshape(1, -1))

if pred[0] == 1:
    result = True
else:
    result = False

In [24]:
result

True

In [25]:
import pickle
pickle.dump(lr, open('MLmodel.pkl', 'wb'))