In [14]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


# Load and process data.

In [15]:
# Read the preprocessed dataset and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- confirm upstream).
df = pd.read_csv('processed_data.csv').drop(columns=['Unnamed: 0'])

# Preview the first rows.
df.head()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,Sex_Male,LastCheckupTime_Within past 2 years (1 year but less than 2 years ago),LastCheckupTime_Within past 5 years (2 years but less than 5 years ago),LastCheckupTime_Within past year (anytime less than 12 months ago),PhysicalActivities_Yes,...,AgeCategory_Age 40 to 44,AgeCategory_Age 45 to 49,AgeCategory_Age 50 to 54,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,AlcoholDrinkers_Yes
0,-0.48774,0.113974,-0.025174,-0.061718,1.404549,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,-0.48774,-0.508455,0.674879,1.619866,0.760563,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,-0.48774,-0.508455,-0.725227,-0.528825,0.116577,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,-0.48774,-0.508455,0.674879,0.872496,-0.097769,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,-0.48774,-0.508455,-0.025174,0.685653,1.189729,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [16]:
# Feature matrix: every column except the label. Target vector: HadHeartAttack_Yes.
X = df.drop("HadHeartAttack_Yes", axis="columns").values
y = df["HadHeartAttack_Yes"].values

# Seeded train/test split, stratified on y so both partitions keep the
# same class proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Basic Decision Tree Model

In [17]:
# Create the decision tree classifier.
# random_state is pinned to the same seed used for the train/test split and
# the random forest models: scikit-learn permutes candidate features at each
# split, so an unseeded tree can pick different (equally scored) splits on
# every run and the reported metrics would not be reproducible.
model = tree.DecisionTreeClassifier(random_state=1)

# Fit the model on the training partition.
model = model.fit(X_train, y_train)

# Predict class labels for the held-out test partition.
predictions = model.predict(X_test)

In [18]:
# Overall accuracy of the predictions on the test partition.
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix wrapped in a labelled frame:
# rows are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

# Show the matrix, then the accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5636,282
Actual 1,240,92


Accuracy Score : 0.91648
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      5918
           1       0.25      0.28      0.26       332

    accuracy                           0.92      6250
   macro avg       0.60      0.61      0.61      6250
weighted avg       0.92      0.92      0.92      6250



Observations
- Precision of 0.25 for class 1 indicates that, of all subjects the model flagged as having had a heart attack, only 25% actually had one.
- Recall of 0.28 for class 1 indicates that only 28% of the subjects who had a heart attack were identified by the model.

# Random Forest Model

## n_estimators = 500

In [19]:
# Build a seeded 500-tree random forest and train it in a single chained call
# (fit() returns the fitted estimator itself).
rf_model = RandomForestClassifier(
    random_state=1,
    n_estimators=500,
).fit(X_train, y_train)

# Predict class labels for the test partition.
predictions = rf_model.predict(X_test)

In [20]:
# Accuracy of the forest's predictions on the test partition.
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix as a labelled table (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

# Output: matrix first, followed by accuracy and the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5869,49
Actual 1,283,49


Accuracy Score : 0.94688
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5918
           1       0.50      0.15      0.23       332

    accuracy                           0.95      6250
   macro avg       0.73      0.57      0.60      6250
weighted avg       0.93      0.95      0.93      6250



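## Random Forest Parameter Tuning
The following code block attempts to improve the precision and recall of the random forest model by adjusting `n_estimators`, `max_leaf_nodes`, and `max_features`. The model is still scored on the same test split as above, so if these values were chosen by comparing test scores, the reported improvement is likely optimistic.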

In [45]:
# Seeded random forest with a larger ensemble (1500 trees), each tree capped at
# 250 leaf nodes and considering 50 features per split; trained in one chained
# call since fit() returns the fitted estimator.
rf_model = RandomForestClassifier(
    random_state=1,
    n_estimators=1500,
    max_features=50,
    max_leaf_nodes=250,
).fit(X_train, y_train)

# Predict class labels for the test partition.
predictions = rf_model.predict(X_test)

# Accuracy on the test partition.
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix as a labelled table (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

# Output: matrix, accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5848,70
Actual 1,251,81


Accuracy Score : 0.94864
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5918
           1       0.54      0.24      0.34       332

    accuracy                           0.95      6250
   macro avg       0.75      0.62      0.65      6250
weighted avg       0.94      0.95      0.94      6250



# Random Forest with 100,000 records

In [46]:
# Read the larger preprocessed dataset and discard the 'Unnamed: 0' column.
# NOTE: this rebinds df, X, y and the train/test arrays used by earlier cells.
df = pd.read_csv('processed_data_100k.csv').drop(columns=['Unnamed: 0'])

# Feature matrix: every column except the label. Target vector: HadHeartAttack_Yes.
X = df.drop("HadHeartAttack_Yes", axis="columns").values
y = df["HadHeartAttack_Yes"].values

# Seeded train/test split, stratified on y to keep class proportions equal
# across both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [47]:
# Same forest configuration as the tuned model (1500 seeded trees, at most
# 250 leaf nodes per tree, 50 features per split), trained on the current
# training partition in one chained call.
rf_model = RandomForestClassifier(
    random_state=1,
    n_estimators=1500,
    max_features=50,
    max_leaf_nodes=250,
).fit(X_train, y_train)

# Predict class labels for the test partition.
predictions = rf_model.predict(X_test)

# Accuracy on the test partition.
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix as a labelled table (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

# Output: matrix, accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,23387,245
Actual 1,1076,292


Accuracy Score : 0.94716
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     23632
           1       0.54      0.21      0.31      1368

    accuracy                           0.95     25000
   macro avg       0.75      0.60      0.64     25000
weighted avg       0.93      0.95      0.94     25000

