In [6]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

## Split the Data into Training and Testing Sets

### Step 1: Read the `heart_failure_clinical_records_dataset.csv` data from the `static` folder into a Pandas DataFrame.

In [7]:
# Read the heart-failure clinical records CSV into a DataFrame
data = pd.read_csv(Path('static/heart_failure_clinical_records_dataset.csv'))

# Preview the first ten rows
data.head(n=10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1
7,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,10,1
8,65.0,0,157,0,65,0,263358.03,1.5,138,0,0,10,1
9,80.0,1,123,0,35,1,388000.0,9.4,133,1,1,10,1


### Step 2: Create the labels set (`y`) from the `DEATH_EVENT` column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Target vector (y): the DEATH_EVENT column
y = data['DEATH_EVENT']
# Feature matrix (X): every column except DEATH_EVENT
X = data.drop('DEATH_EVENT', axis='columns')

In [9]:
# Show the first ten target labels
y.head(n=10)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: DEATH_EVENT, dtype: int64

In [10]:
# Show the first ten rows of the feature matrix
X.head(n=10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10
7,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,10
8,65.0,0,157,0,65,0,263358.03,1.5,138,0,0,10
9,80.0,1,123,0,35,1,388000.0,9.4,133,1,1,10


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [11]:
# Tally the samples per label (most frequent first) to see how balanced the classes are
label_balance = y.value_counts(sort=True, ascending=False)
print(label_balance)

0    203
1     96
Name: DEATH_EVENT, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# Note: no `stratify` argument is passed, so the class proportions of the two
# splits are not forced to match those of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [17]:
# Baseline model: logistic regression trained on the original (non-resampled)
# training split. `LogisticRegression` is imported in the notebook's first cell.

# Initialize the logistic regression model with a fixed seed
model_original = LogisticRegression(random_state=1)

# Fit the model using the training data
model_original.fit(X_train, y_train)

# Predict labels for the held-out test features; reused by the evaluation cells
y_pred_original = model_original.predict(X_test)

In [18]:
# Evaluate the original-data model on the held-out test set.
# The metric functions are imported in the notebook's first cell.
# Nothing is displayed here: the results are only stored in variables.

# Calculate the accuracy score
accuracy_original = accuracy_score(y_test, y_pred_original)

# Calculate the balanced accuracy score
balanced_acc_original = balanced_accuracy_score(y_test, y_pred_original)

# Generate a confusion matrix
confusion_original = confusion_matrix(y_test, y_pred_original)

# Build the classification report (stored, not printed)
report_original = classification_report(y_test, y_pred_original)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [20]:
# Score the original model's test-set predictions
accuracy_original = accuracy_score(y_test, y_pred_original)
balanced_acc_original = balanced_accuracy_score(y_test, y_pred_original)

# Report both scores to two decimals, one per line
print(
    f"Accuracy Score (Original Model): {accuracy_original:.2f}",
    f"Balanced Accuracy Score (Original Model): {balanced_acc_original:.2f}",
    sep="\n",
)

Accuracy Score (Original Model): 0.87
Balanced Accuracy Score (Original Model): 0.81


In [21]:
# Confusion matrix for the original model on the test set.
# `confusion_matrix` is already imported in the notebook's first cell,
# so it is not re-imported here.
confusion_original = confusion_matrix(y_test, y_pred_original)

print("Confusion Matrix (Original Model):")
print(confusion_original)

Confusion Matrix (Original Model):
[[42  4]
 [ 4 10]]


In [22]:
# Classification report for the original model on the test set.
# `classification_report` is already imported in the notebook's first cell,
# so it is not re-imported here.
report_original = classification_report(y_test, y_pred_original)

print("Classification Report (Original Model):")
print(report_original)

Classification Report (Original Model):
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        46
           1       0.71      0.71      0.71        14

    accuracy                           0.87        60
   macro avg       0.81      0.81      0.81        60
weighted avg       0.87      0.87      0.87        60



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (no death, alive) and `1` (death) labels?

**Answer:** Based on the report for the original logistic regression model, we can assess how well the model predicts both the `0` (no death, alive) and `1` (death) labels:

**Accuracy Score:** The model achieves an accuracy score of 0.87, indicating that it correctly predicts the outcome in 87% of cases. This suggests that the model performs relatively well.

**Balanced Accuracy Score:** The balanced accuracy score is 0.81, which is the average of the recall for the `0` label (0.91) and the `1` label (0.71). It is lower than the plain accuracy because the model recognizes the majority `0` class noticeably better than the minority `1` class, but the score is still quite reasonable.

**Confusion Matrix:** The confusion matrix reveals that the model makes a few misclassifications. It correctly identifies a substantial proportion of both `0` (alive) cases (42 out of 46) and `1` (death) cases (10 out of 14).

**Classification Report:** The detailed classification report provides insights into the model's performance metrics, including precision, recall, and F1-score for both classes. It indicates that the model maintains high precision, recall, and F1-score values (0.91 each) for the `0` label, and noticeably lower values (0.71 each) for the `1` label, which has far fewer test samples (14 versus 46).

In summary, the original logistic regression model performs fairly well in predicting both `0` (no death, alive) and `1` (death) labels. While there is room for improvement, it provides a reasonable level of accuracy and balanced performance.

---

**Predict a Logistic Regression Model with Resampled Training Data**

**Step 1: Use the RandomOverSampler module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.**

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

In [24]:
# Initialize the random oversampler model
random_oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_oversampler.fit_resample(X_train, y_train)

# Initialize the logistic regression model for the resampled data
model_resampled = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
model_resampled.fit(X_resampled, y_resampled)

# Make predictions using the testing data
y_pred_resampled = model_resampled.predict(X_test)

In [25]:
# Evaluate the oversampled-data model on the held-out test set.
# The metric functions are already imported earlier in the notebook,
# so they are not re-imported here. Nothing is displayed by this cell:
# the results are only stored in variables.

# Calculate the accuracy score
accuracy_resampled = accuracy_score(y_test, y_pred_resampled)

# Calculate the balanced accuracy score
balanced_acc_resampled = balanced_accuracy_score(y_test, y_pred_resampled)

# Generate a confusion matrix
confusion_resampled = confusion_matrix(y_test, y_pred_resampled)

# Build the classification report (stored, not printed)
report_resampled = classification_report(y_test, y_pred_resampled)

In [26]:
# Report both scores of the resampled model to two decimals, one per line
print(
    f"Accuracy Score (Resampled Model): {accuracy_resampled:.2f}",
    f"Balanced Accuracy Score (Resampled Model): {balanced_acc_resampled:.2f}",
    sep="\n",
)

Accuracy Score (Resampled Model): 0.73
Balanced Accuracy Score (Resampled Model): 0.75


In [27]:
# Show the resampled model's confusion matrix under a heading line
print("Confusion Matrix (Resampled Model):", confusion_resampled, sep="\n")

Confusion Matrix (Resampled Model):
[[33 13]
 [ 3 11]]


In [28]:
# Show the resampled model's classification report under a heading line
print("Classification Report (Resampled Model):", report_resampled, sep="\n")

Classification Report (Resampled Model):
              precision    recall  f1-score   support

           0       0.92      0.72      0.80        46
           1       0.46      0.79      0.58        14

    accuracy                           0.73        60
   macro avg       0.69      0.75      0.69        60
weighted avg       0.81      0.73      0.75        60



**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (no death, alive) and `1` (death) labels?

**Answer:** The logistic regression model, fit with oversampled data, detects the 1 (death) label slightly more often than the model fit on the original data, but it does so at a clear cost in overall accuracy and in precision for the 1 label. Here's a summary of its performance:

Accuracy Score: The model has an accuracy score of 0.73, indicating that it correctly predicts the patient's status (alive or deceased) in 73% of the test cases. This is lower than the 0.87 achieved by the model fit on the original data.

Balanced Accuracy Score: The balanced accuracy score is 0.75, which is also lower than the 0.81 obtained with the original data. Oversampling therefore did not improve the model's overall ability to distinguish between the two classes, 0 (alive) and 1 (deceased).

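Confusion Matrix: The confusion matrix shows more misclassifications than for the original model. It correctly identifies 33 out of 46 cases where patients are alive and 11 out of 14 cases where patients have deceased. The number of alive patients wrongly predicted as deceased rises from 4 to 13, while the number of missed deaths falls only from 4 to 3.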

Classification Report: The classification report provides detailed insights into the model's performance, including precision, recall, and F1-score for both classes. For the 0 class, precision is 0.92, recall is 0.72, and the F1-score is 0.80. For the 1 class, recall improves to 0.79 (from 0.71 with the original data), but precision drops to 0.46 and the F1-score to 0.58, meaning that more than half of the predicted deaths are false alarms.

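In summary, the logistic regression model trained with oversampled data catches slightly more of the 1 (deceased) cases (11 versus 10 out of 14) but misclassifies many more 0 (alive) cases, so both its accuracy and its balanced accuracy are lower than those of the original model. It is preferable only if missing a death is considered much more costly than a false alarm; otherwise the original model is the more reliable of the two on this test set.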