In [3]:
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt 
import time
import random
import os
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, RocCurveDisplay)


In [6]:
# Load the dataset. The CSV is not fetched by this notebook: download it manually
# and place it in the working directory under the file name used below.
heart = pd.read_csv("heart_disease_health_indicators.csv")

# Even from just a cursory info() summary, we can see that the dataset is already
# pre-cleaned for us for the most part.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
heart.info()
print(heart.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253661 entries, 0 to 253660
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   HeartDiseaseorAttack  253661 non-null  int64
 1   HighBP                253661 non-null  int64
 2   HighChol              253661 non-null  int64
 3   CholCheck             253661 non-null  int64
 4   BMI                   253661 non-null  int64
 5   Smoker                253661 non-null  int64
 6   Stroke                253661 non-null  int64
 7   Diabetes              253661 non-null  int64
 8   PhysActivity          253661 non-null  int64
 9   Fruits                253661 non-null  int64
 10  Veggies               253661 non-null  int64
 11  HvyAlcoholConsump     253661 non-null  int64
 12  AnyHealthcare         253661 non-null  int64
 13  NoDocbcCost           253661 non-null  int64
 14  GenHlth               253661 non-null  int64
 15  MentHlth              253661 non-n

In [None]:
# Count how many rows carry each distinct value of the target column.
target_counts = heart['HeartDiseaseorAttack'].value_counts()
print(target_counts)

In [None]:
# Print the frequency table of every column, each followed by a divider line.
# Iterating over a DataFrame yields its column labels.
divider = "______________________________"
for column_name in heart:
    column_counts = heart[column_name].value_counts()
    print(column_counts)
    print(divider)

In [None]:
# Annotated heatmap of the pairwise correlation coefficients between all columns.
corr_matrix = heart.corr()
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, cmap='inferno', ax=ax)
ax.set_title('Correlation Coefficients')
plt.show()

In [None]:
# Train/test split

# Features are every column except the target. The target is kept as a
# one-column DataFrame because later cells flatten it with .values.ravel()
# and index it by column name.
X = heart.drop(columns='HeartDiseaseorAttack')
y = heart[['HeartDiseaseorAttack']]

# Hold out 20% of the rows as the test set and train on the remaining 80%.
# The split is stratified on the target so both parts keep the same class ratio.
# No separate validation set is created in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=451, stratify=y)

# Oversample the minority class in the training data (i.e. HeartDiseaseorAttack == 1).
# Only the training split is resampled; X_test / y_test are left untouched.
from imblearn.over_sampling import RandomOverSampler
rs = RandomOverSampler(random_state=451)
X_train_resampled, y_train_resampled = rs.fit_resample(X_train, y_train)

# Compare original ratio vs. oversample ratio.
# (column == 1).mean() is the share of positive rows; unlike value_counts()[1]
# it does not raise KeyError when class 1 happens to be absent.
print('Our original ratio is a heart attack/disease rate of ')
print((heart['HeartDiseaseorAttack'] == 1).mean())

print('Our new ratio, after oversampling, is ')
print((y_train_resampled['HeartDiseaseorAttack'] == 1).mean())

In [None]:
# X = heart.loc[:,heart.columns != 'HeartDiseaseorAttack']
# y = heart[['HeartDiseaseorAttack']]

# #print(X)
# #print(y)
# #print(np.array(y))
# print(y)
# print(y.values.ravel())
# print(y.ravel())

In [None]:
from datetime import datetime

# Log the wall-clock start time so the cost of the two fits below is visible.
current_time = datetime.now().strftime("%H:%M:%S")
print("Start Time =", current_time)

# Flatten the one-column test target once; both models are scored against it.
y_test_flat = y_test.values.ravel()

# Model 1: logistic regression fitted on the original (imbalanced) training split
# to predict HeartDiseaseorAttack, scored on the held-out test set.
clf = LogisticRegression(max_iter=3000)
clf.fit(X_train, y_train.values.ravel())
score_orig = clf.score(X_test, y_test_flat)

# Model 2: same estimator settings, fitted on the oversampled training split
# and scored on the same untouched test set.
clf2 = LogisticRegression(max_iter=3000)
clf2.fit(X_train_resampled, y_train_resampled.values.ravel())
score_oversample = clf2.score(X_test, y_test_flat)

# Log the wall-clock end time.
current_time = datetime.now().strftime("%H:%M:%S")
print("End Time =", current_time)

print(f'The logistic regression scores {score_orig} before oversampling, and scores {score_oversample} after oversampling.')


In [None]:
# Relies on `datetime` having been imported by an earlier cell.
current_time = datetime.now().strftime("%H:%M:%S")
print("Start Time =", current_time)

# Flatten the one-column test target once; both LASSO fits are scored against it.
y_test_flat = y_test.values.ravel()

# Cross-validated LASSO used for feature selection.
# First fit: original training split.
lasso_model = LassoCV(max_iter=2000, random_state=451)
lasso_model.fit(X_train, y_train.values.ravel())
lasso_orig = lasso_model.score(X_test, y_test_flat)

# Second fit: oversampled training split, same settings.
lasso_model2 = LassoCV(max_iter=2000, random_state=451)
lasso_model2.fit(X_train_resampled, y_train_resampled.values.ravel())
lasso_oversample = lasso_model2.score(X_test, y_test_flat)

current_time = datetime.now().strftime("%H:%M:%S")
print("End Time =", current_time)

# The two scores (lasso_orig, lasso_oversample) are computed but not printed.
# Other fitted attributes that may be worth reading out:
#   lasso_model2.coef_         dense np.array
#   lasso_model2.sparse_coef_  sparse matrix
#   lasso_model2.intercept_    probably also relevant

# Report which feature columns the oversampled-data model zeroed out and which it kept.
feature_names = np.array(X.columns)
is_zeroed = lasso_model2.coef_ == 0

print("The LASSO model set the following columns to 0:")
print(feature_names[is_zeroed])
print('')
print("And the following columns have nonzero coefficients:")
print(feature_names[~is_zeroed])

In [None]:
# Test a logistic regression limited to a subset of the feature columns.
# NOTE(review): the list is hard-coded; presumably it mirrors the nonzero-coefficient
# columns printed by the LASSO cell -- re-check it whenever that output changes.
# It is written once and reused so the train and test frames are always sliced
# with exactly the same columns.
selected_features = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'Diabetes',
                     'HvyAlcoholConsump', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk',
                     'Sex', 'Age', 'Income']
X_train_new = X_train_resampled[selected_features]
X_test_new = X_test[selected_features]

# Fit on the oversampled training split, restricted to the selected columns.
clf3 = LogisticRegression(max_iter = 3000)
clf3.fit(X_train_new, y_train_resampled.values.ravel())

# Predict the test set once and reuse the predictions for both the accuracy and
# the confusion matrix. A classifier's .score() is the accuracy of .predict(),
# so accuracy_score on these predictions gives the same number without a second pass.
clf_predict = clf3.predict(X_test_new)

# NOTE: this reuses the name score_oversample, overwriting the value stored by the
# earlier full-feature logistic regression cell.
score_oversample = accuracy_score(y_test.values.ravel(), clf_predict)
print(score_oversample)


print('\n\nThe confusion matrix is as follows:')
# Confusion matrix. 'metrics' display usage from https://www.w3schools.com/python/python_ml_confusion_matrix.asp
conf_matrix = confusion_matrix(y_test, clf_predict)
conf_matrix_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_matrix)
conf_matrix_display.plot()
plt.show()

