In [2]:
from path import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Read the heart dataset from the resources folder (path is relative to the
# notebook's working directory) and preview the first five rows.
data = Path("../resources/heart.csv")
heart_df = pd.read_csv(filepath_or_buffer=data)
heart_df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Replace the CSV's abbreviated headers with descriptive column names.
# The assignment is positional: the list must follow the file's column order
# and contain exactly one name per column.
heart_df.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'serum_cholestoral(mg/dl)',
    'fasting_blood_sugar',
    'resting_ecg_results',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope_elevation',
    'major_vessels',
    'thal',
    'heart_disease',
]
heart_df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral(mg/dl),fasting_blood_sugar,resting_ecg_results,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope_elevation,major_vessels,thal,heart_disease
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [5]:
# List the dtype pandas inferred for each (renamed) column.
heart_df.dtypes

age                           int64
sex                           int64
chest_pain_type               int64
resting_blood_pressure        int64
serum_cholestoral(mg/dl)      int64
fasting_blood_sugar           int64
resting_ecg_results           int64
max_heart_rate_achieved       int64
exercise_induced_angina       int64
st_depression               float64
st_slope_elevation            int64
major_vessels                 int64
thal                          int64
heart_disease                 int64
dtype: object

In [9]:
# Build the first feature set: drop eight columns from the full frame, keeping
# age, sex, resting blood pressure, cholesterol, fasting blood sugar and the
# heart_disease target. heart_df itself is left untouched.
new_heart_df = heart_df.drop(
    columns=[
        "chest_pain_type",
        "resting_ecg_results",
        "max_heart_rate_achieved",
        "exercise_induced_angina",
        "st_depression",
        "st_slope_elevation",
        "major_vessels",
        "thal",
    ]
)
new_heart_df

Unnamed: 0,age,sex,resting_blood_pressure,serum_cholestoral(mg/dl),fasting_blood_sugar,heart_disease
0,52,1,125,212,0,0
1,53,1,140,203,1,0
2,70,1,145,174,0,0
3,61,1,148,203,0,0
4,62,0,138,294,1,0
...,...,...,...,...,...,...
1020,59,1,140,221,0,1
1021,60,1,125,258,0,0
1022,47,1,110,275,0,0
1023,50,0,110,254,0,1


In [10]:
# Report how many missing values each column of the reduced frame contains.
for column, n_missing in new_heart_df.isnull().sum().items():
    print(f"Column {column} has {n_missing} null values")

Column age has 0 null values
Column sex has 0 null values
Column resting_blood_pressure has 0 null values
Column serum_cholestoral(mg/dl) has 0 null values
Column fasting_blood_sugar has 0 null values
Column heart_disease has 0 null values


In [11]:
# Separate the predictors (X) from the label to be predicted (y).
X = new_heart_df.drop(columns=["heart_disease"])
y = new_heart_df["heart_disease"]

In [12]:
# Hold out a test set with train_test_split's default test size.
# stratify=y keeps the class balance of heart_disease the same in both splits;
# random_state=1 makes the split reproducible.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape

(768, 5)

In [13]:
# Logistic regression with the lbfgs solver; max_iter is raised to 500 and
# random_state is fixed so repeated runs build the same model.
# LogisticRegression is imported in the notebook's top import cell.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    random_state=1,
)

In [14]:
# Fit the logistic regression on the training split only; the test split is
# reserved for the evaluation cells that follow.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=1)

In [15]:
# Predict on the held-out test split and line the predictions up against the
# true labels; the original row index is discarded so rows are numbered 0..n-1.
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,1,1
3,0,0
4,1,1
5,1,0
6,0,1
7,1,1
8,1,1
9,0,1


In [16]:
# Confusion matrix for the test split, wrapped in a labelled DataFrame
# (rows are the actual classes, columns the predicted classes).
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,89,36
Actual 1,50,82


In [17]:
# Accuracy on the held-out test split.
# accuracy_score is already imported with the other sklearn.metrics helpers in
# the notebook's first cell, so it is not re-imported here.
print(accuracy_score(y_test, y_pred))

0.6653696498054474


In [18]:
# Test-split summary: confusion matrix, accuracy and per-class
# precision / recall / F1.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report", classification_report(y_test, y_pred), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,89,36
Actual 1,50,82


Accuracy Score : 0.6653696498054474
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.71      0.67       125
           1       0.69      0.62      0.66       132

    accuracy                           0.67       257
   macro avg       0.67      0.67      0.67       257
weighted avg       0.67      0.67      0.66       257



In [19]:
# Predict on the training split as well, so training and test performance can
# be compared; rows are renumbered 0..n-1.
y_pred2 = classifier.predict(X_train)
results = pd.DataFrame({"Prediction": y_pred2, "Actual": y_train})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,1,1
2,1,0
3,0,0
4,1,0
5,0,0
6,1,1
7,0,0
8,0,1
9,0,1


In [20]:
# Confusion matrix for the training split (overwrites the test-split cm / cm_df).
cm = confusion_matrix(y_train, y_pred2)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,243,131
Actual 1,142,252


In [21]:
# Accuracy on the training split.
# accuracy_score is already imported with the other sklearn.metrics helpers in
# the notebook's first cell, so it is not re-imported here.
print(accuracy_score(y_train, y_pred2))

0.64453125


In [22]:
# Training-split summary: confusion matrix, accuracy and per-class
# precision / recall / F1.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {accuracy_score(y_train, y_pred2)}")
print("Classification Report", classification_report(y_train, y_pred2), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,243,131
Actual 1,142,252


Accuracy Score : 0.64453125
Classification Report
              precision    recall  f1-score   support

           0       0.63      0.65      0.64       374
           1       0.66      0.64      0.65       394

    accuracy                           0.64       768
   macro avg       0.64      0.64      0.64       768
weighted avg       0.64      0.64      0.64       768



# RF FEATURE SELECTION

In [24]:
# Second feature set: drop five columns from the full frame, leaving eight
# predictors plus the heart_disease target. heart_df itself is left untouched.
RF_new_heart_df = heart_df.drop(
    columns=[
        "fasting_blood_sugar",
        "resting_ecg_results",
        "sex",
        "exercise_induced_angina",
        "st_slope_elevation",
    ]
)
RF_new_heart_df

Unnamed: 0,age,chest_pain_type,resting_blood_pressure,serum_cholestoral(mg/dl),max_heart_rate_achieved,st_depression,major_vessels,thal,heart_disease
0,52,0,125,212,168,1.0,2,3,0
1,53,0,140,203,155,3.1,0,3,0
2,70,0,145,174,125,2.6,0,3,0
3,61,0,148,203,161,0.0,1,3,0
4,62,0,138,294,106,1.9,3,2,0
...,...,...,...,...,...,...,...,...,...
1020,59,1,140,221,164,0.0,0,2,1
1021,60,0,125,258,141,2.8,1,3,0
1022,47,0,110,275,118,1.0,1,2,0
1023,50,0,110,254,159,0.0,0,2,1


In [25]:
# Separate predictors (X) and label (y) for the second feature set.
# NOTE: this rebinds the X and y used by the first model.
X = RF_new_heart_df.drop(columns=["heart_disease"])
y = RF_new_heart_df["heart_disease"]

In [26]:
# Same stratified, seeded split as for the first model, applied to the second
# feature set (rebinds X_train / X_test / y_train / y_test).
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape

(768, 8)

In [27]:
# Fresh, unfitted logistic regression with the same hyper-parameters as the
# first model, so only the feature set differs between the two runs.
# LogisticRegression is imported in the notebook's top import cell.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    random_state=1,
)

In [28]:
# Fit the new classifier on the training split of the second feature set.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=1)

In [29]:
# Test-split predictions for the second feature set, shown next to the true
# labels with rows renumbered 0..n-1.
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,1,1
2,1,1
3,0,0
4,1,1
5,1,0
6,1,1
7,1,1
8,1,1
9,1,1


In [30]:
# Test-split confusion matrix for the second feature set
# (rows are the actual classes, columns the predicted classes).
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,91,34
Actual 1,13,119


In [31]:
# Test-split accuracy for the second feature set.
# accuracy_score is already imported with the other sklearn.metrics helpers in
# the notebook's first cell, so it is not re-imported here.
print(accuracy_score(y_test, y_pred))

0.8171206225680934


In [32]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,91,34
Actual 1,13,119


Accuracy Score : 0.8171206225680934
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.73      0.79       125
           1       0.78      0.90      0.84       132

    accuracy                           0.82       257
   macro avg       0.83      0.81      0.81       257
weighted avg       0.83      0.82      0.82       257



In [33]:
# Training-split predictions for the second feature set, for comparison with
# the test-split results; rows are renumbered 0..n-1.
y_pred2 = classifier.predict(X_train)
results = pd.DataFrame({"Prediction": y_pred2, "Actual": y_train})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,1,0
4,0,0
5,0,0
6,1,1
7,1,0
8,0,1
9,1,1


In [34]:
# Training-split confusion matrix for the second feature set
# (overwrites the test-split cm / cm_df).
cm = confusion_matrix(y_train, y_pred2)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,298,76
Actual 1,54,340


In [35]:
# Training-split accuracy for the second feature set.
# accuracy_score is already imported with the other sklearn.metrics helpers in
# the notebook's first cell, so it is not re-imported here.
print(accuracy_score(y_train, y_pred2))

0.8307291666666666


In [36]:
# Training-split summary for the second feature set: confusion matrix,
# accuracy and per-class precision / recall / F1.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {accuracy_score(y_train, y_pred2)}")
print("Classification Report", classification_report(y_train, y_pred2), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,298,76
Actual 1,54,340


Accuracy Score : 0.8307291666666666
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.80      0.82       374
           1       0.82      0.86      0.84       394

    accuracy                           0.83       768
   macro avg       0.83      0.83      0.83       768
weighted avg       0.83      0.83      0.83       768



# EXPORT TO DATABASE

In [None]:
from sqlalchemy import create_engine
from config import db_password

In [None]:
# Connection URL for the local PostgreSQL database "heart_data" (user
# "postgres", host 127.0.0.1, port 5433).
# The password is URL-escaped so characters such as "@", "/" or ":" cannot
# break URL parsing.
# NOTE(review): assumes config.db_password holds the raw, un-encoded password.
from urllib.parse import quote_plus

db_string = f"postgresql://postgres:{quote_plus(db_password)}@127.0.0.1:5433/heart_data"

In [None]:
# Build the SQLAlchemy engine from the connection URL; the export cell passes
# it to pandas as `con`.
engine = create_engine(db_string)

In [None]:
# Export the reduced feature table to the "modifiable_risk_factors" table.
# if_exists='replace' keeps this cell re-runnable: with the pandas default
# ('fail') a second run raises ValueError because the table already exists.
# Replacing drops and recreates the table, so any rows added to it outside
# this notebook are lost. The DataFrame index is still written as a column
# (pandas default).
new_heart_df.to_sql(
    name='modifiable_risk_factors',
    con=engine,
    if_exists='replace',
)

In [None]:
# Install the PostgreSQL driver into the environment of the running kernel.
# %pip is used instead of a bare `pip install`, which only works while IPython
# automagic is enabled.
# NOTE(review): the export cells need this driver, so run this cell before
# them on a fresh environment (ideally move it to the top of the notebook and
# pin a version).
%pip install psycopg2-binary