In [1]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

from config import username, password

### Prepare the data: import and scale

In [5]:
# Build the connection URL from its components rather than formatting a string,
# so special characters in the credentials (e.g. '@', ':' or '/') are escaped
# instead of corrupting the URL
db_url = URL.create(
    drivername='postgresql',
    username=username,
    password=password,
    host='localhost',
    port=5432,
    database='heart_data',
)

# Create engine connecting to database path
# NOTE(review): the recorded run of this cell failed with
# 'role "postgres" does not exist' -- the username in config.py must name a
# role that exists on this PostgreSQL server.
engine = create_engine(db_url)

# Define query
query_heart = "SELECT * FROM heart_data"

# Heart table to DataFrame
df0 = pd.read_sql(query_heart, engine)
df0.head()

OperationalError: (psycopg2.OperationalError) FATAL:  role "postgres" does not exist

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [2]:
# TODO: this section needs to instead retrieve the data from SQL

# Load the CSV file exported from "Heart data.ipynb" into a Pandas DataFrame
heart_csv_path = Path('Resources/heart_data.csv')
df0 = pd.read_csv(heart_csv_path)
df0.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0
1,67,1,4,160,286,0,2,108,1,1.5,2,0
2,67,1,4,120,229,0,2,129,1,2.6,2,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0


In [3]:
# Continuous features to standardize; listed once so the scaler input and the
# resulting DataFrame's column labels cannot drift apart
continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Scale continuous variables
# NOTE(review): the scaler is fitted on every row of df0. A model evaluated on
# a held-out split should use statistics from the training rows only.
continuous_data_scaled = StandardScaler().fit_transform(df0[continuous_cols])

# Create a DataFrame with the scaled data. Reuse df0's index: a column-wise
# pd.concat aligns on index labels, so a fresh RangeIndex here would misalign
# rows (or introduce NaNs) whenever df0 does not carry the default index.
df0_continuous_scaled = pd.DataFrame(continuous_data_scaled,
                                     columns=continuous_cols,
                                     index=df0.index)
df0_continuous_scaled.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
0,0.967841,0.626472,-0.290663,0.34833,0.92425
1,1.418342,1.458552,0.649048,-1.3481,0.20319
2,1.418342,-0.760328,-0.361584,-0.499885,1.194648
3,-1.960411,-0.205608,0.010754,1.842805,2.005841
4,-1.509911,-0.205608,-0.804844,1.236937,0.113058


In [4]:
# Columns taken from the raw data without scaling: the three categoricals that
# get one-hot encoded below, followed by sex, exang, fbs and the num column
categorical_cols = ['cp', 'restecg', 'slope']
passthrough_cols = categorical_cols + ['sex', 'exang', 'fbs', 'num']

# Put the scaled continuous columns side by side with the unscaled ones
df0_continuous_scaled_cats = pd.concat(
    [df0_continuous_scaled, df0[passthrough_cols]],
    axis=1,
)

# Encode the categoricals as integer dummy columns to complete the transformed data
df = pd.get_dummies(
    df0_continuous_scaled_cats,
    columns=categorical_cols,
    dtype=int,
)
df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,exang,fbs,num,cp_1,cp_2,cp_3,cp_4,restecg_0,restecg_1,restecg_2,slope_1,slope_2,slope_3
0,0.967841,0.626472,-0.290663,0.34833,0.92425,1,0,1,0,1,0,0,0,0,0,1,0,0,1
1,1.418342,1.458552,0.649048,-1.3481,0.20319,1,1,0,0,0,0,0,1,0,0,1,0,1,0
2,1.418342,-0.760328,-0.361584,-0.499885,1.194648,1,1,0,0,0,0,0,1,0,0,1,0,1,0
3,-1.960411,-0.205608,0.010754,1.842805,2.005841,1,0,0,0,0,0,1,0,1,0,0,0,0,1
4,-1.509911,-0.205608,-0.804844,1.236937,0.113058,0,0,0,0,0,1,0,0,0,0,1,1,0,0


In [5]:
# Write the scaled, dummy-encoded DataFrame to CSV without the row index
df.to_csv(path_or_buf="Resources/heart_data_scaled.csv", index=False)

### Prepare the model: Split the data into training and testing data

In [6]:
# Separate the DataFrame into the target (y) and the features (X)
target_col = 'num'
X = df.drop(target_col, axis=1)
y = df[target_col]

In [7]:
# Confirm the target and features variables by showing the first five rows of each
print(y.iloc[:5])
X.iloc[:5]

0    0
1    0
2    0
3    0
4    0
Name: num, dtype: int64


Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,exang,fbs,cp_1,cp_2,cp_3,cp_4,restecg_0,restecg_1,restecg_2,slope_1,slope_2,slope_3
0,0.967841,0.626472,-0.290663,0.34833,0.92425,1,0,1,1,0,0,0,0,0,1,0,0,1
1,1.418342,1.458552,0.649048,-1.3481,0.20319,1,1,0,0,0,0,1,0,0,1,0,1,0
2,1.418342,-0.760328,-0.361584,-0.499885,1.194648,1,1,0,0,0,0,1,0,0,1,0,1,0
3,-1.960411,-0.205608,0.010754,1.842805,2.005841,1,0,0,0,0,1,0,1,0,0,0,0,1
4,-1.509911,-0.205608,-0.804844,1.236937,0.113058,0,0,0,0,1,0,0,0,0,1,1,0,0


In [8]:
# Split the data (stratified on y so both sets keep the same class balance)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)

# The continuous columns in X were standardized with statistics computed from
# ALL rows, which leaks test-set information into training. Re-standardize them
# from the raw values in df0 using a scaler fitted on the training rows only.
# This relies on X sharing df0's index labels.
split_continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
train_scaler = StandardScaler().fit(df0.loc[X_train.index, split_continuous_cols])

# Work on copies so the assignments below do not write into views of X
X_train = X_train.copy()
X_test = X_test.copy()
X_train[split_continuous_cols] = train_scaler.transform(
    df0.loc[X_train.index, split_continuous_cols])
X_test[split_continuous_cols] = train_scaler.transform(
    df0.loc[X_test.index, split_continuous_cols])
X_train.shape

(345, 18)

### Create the model: Logistic regression

In [9]:
# Configure the Logistic Regression model: lbfgs solver, at most 200
# iterations, and a fixed random_state
logreg_settings = {'solver': 'lbfgs', 'max_iter': 200, 'random_state': 1}
classifier = LogisticRegression(**logreg_settings)

# Train on the training split; the fitted estimator is the cell's displayed value
classifier.fit(X_train, y_train)

In [10]:
# Predict a class label for each row of the testing features
predictions = classifier.predict(X=X_test)

### Evaluate the model’s performance

In [11]:
# Confusion matrix of the true test labels against the model's predictions
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[37, 19],
       [22, 37]])

In [12]:
# Build the classification report for the test-set predictions, then print it
testing_report = classification_report(y_true=y_test, y_pred=predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       0.63      0.66      0.64        56
           1       0.66      0.63      0.64        59

    accuracy                           0.64       115
   macro avg       0.64      0.64      0.64       115
weighted avg       0.64      0.64      0.64       115

