<a href="https://colab.research.google.com/github/TomazFilgueira/UFRN-ML-2025-1-Heart_Disease_Classfication/blob/main/heart_disease_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import torch
import torch.optim as optim
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc
from sklearn.metrics import accuracy_score

%matplotlib inline
plt.style.use('fivethirtyeight')


In [None]:
pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

##Generate confusion matrix line figure


In [None]:
def figure10(y, probabilities, threshold, shift, annot, colors=None,title=""):
    """Draw the probability-line view of a set of predictions on a new figure.

    Creates a single 10x5 axes, delegates the drawing to ``probability_line``
    and puts ``title`` on the axes.

    Returns:
        The matplotlib figure that was created.
    """
    figure, axis = plt.subplots(1, 1, figsize=(10, 5))
    probability_line(
        ax=axis,
        y=y,
        probs=probabilities,
        threshold=threshold,
        shift=shift,
        annot=annot,
        colors=colors,
    )
    axis.set_title(title)
    figure.tight_layout()
    return figure

In [None]:
def probability_contour(ax, model, device, X, y, threshold, cm=None, cm_bright=None):
    """Plot a model's predicted probability surface over a fixed 2-D grid.

    The grid spans [-2.25, 2.25) on both axes with a 0.02 step, so ``model``
    must accept inputs with exactly two features.

    Args:
        ax: Matplotlib axes to draw on.
        model: PyTorch model called on the grid; its output is reshaped to the
            grid shape, i.e. one logit per grid point.
        device: Device the grid tensor is sent to before the forward pass.
        X: Points to overlay; columns 0 and 1 are used as coordinates.
        y: Values used to color the overlaid points.
        threshold: Probability level drawn as a single contour line.
        cm: Colormap for the filled contour (defaults to ``plt.cm.RdBu``).
        cm_bright: Colormap for the points (defaults to red/blue).

    Returns:
        The axes that was drawn on.
    """
    if cm is None:
        cm = plt.cm.RdBu
    if cm_bright is None:
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])

    h = .02  # step size in the mesh

    x_min, x_max = -2.25, 2.25
    y_min, y_max = -2.25, 2.25

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    grid = torch.as_tensor(np.c_[xx.ravel(), yy.ravel()]).float().to(device)
    # Plotting never needs gradients, so skip building the autograd graph.
    # torch.sigmoid keeps this function independent of helper functions that
    # may not have been defined yet when the notebook cell is executed.
    with torch.no_grad():
        yhat = torch.sigmoid(model(grid)).cpu().numpy().reshape(xx.shape)

    ax.contour(xx, yy, yhat, levels=[threshold], cmap="Greys", vmin=0, vmax=1)
    contour = ax.contourf(xx, yy, yhat, 25, cmap=cm, alpha=.8, vmin=0, vmax=1)
    # Overlay the supplied points
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright, edgecolors='k')

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel(r'$X_1$')
    ax.set_ylabel(r'$X_2$')
    ax.set_title(r'$\sigma(z) = P(y=1)$')
    ax.grid(False)

    # Attach the colorbar to the figure that owns `ax` rather than to whatever
    # pyplot currently considers the active figure/axes.
    ax_c = ax.figure.colorbar(contour, ax=ax)
    ax_c.set_ticks([0, .25, .5, .75, 1])
    return ax

In [None]:
def probability_line(ax, y, probs, threshold, shift=0.0, annot=False, colors=None):
    """Draw predicted probabilities on a [0, 1] line, split by actual class.

    Points with ``y == 0`` are drawn ``shift`` above the line and points with
    ``y == 1`` are drawn ``shift`` below it. Each marker's face uses the color
    of its actual class; its edge uses the color of the class on whose side of
    ``threshold`` the probability falls, so misclassified points get a
    two-colored marker.

    Args:
        ax: Matplotlib axes to draw on.
        y: Array of actual labels (compared against 0 and 1).
        probs: Array of predicted probabilities, indexed by boolean masks.
        threshold: Value drawn as a dashed vertical line; probabilities at or
            above it fall on the positive side.
        shift: Vertical offset applied to the two classes.
        annot: When true, writes TN / FN / FP / TP labels at fixed positions.
        colors: Two colors, negative class first (defaults to ``['r', 'b']``).

    Returns:
        The axes that was drawn on.
    """
    palette = ['r', 'b'] if colors is None else colors

    # Frame: hide the y axis, then draw the [0, 1] baseline and its end caps
    ax.grid(False)
    ax.set_ylim([-.1, .1])
    ax.axes.get_yaxis().set_visible(False)
    ax.plot([0, 1], [0, 0], linewidth=2, c='k', zorder=1)
    for x_end in (0, 1):
        ax.plot([x_end, x_end], [-.1, .1], c='k', zorder=1)

    actual_neg = y == 0
    actual_pos = y == 1
    below = probs < threshold
    at_or_above = probs >= threshold

    ax.plot([threshold, threshold], [-.1, .1], c='k', zorder=0.5, linestyle='--')

    # (mask, vertical offset, face color, edge color) in drawing order:
    # true negatives, false positives, true positives, false negatives
    groups = (
        (actual_neg & below, shift, palette[0], palette[0]),
        (actual_neg & at_or_above, shift, palette[0], palette[1]),
        (actual_pos & at_or_above, -shift, palette[1], palette[1]),
        (actual_pos & below, -shift, palette[1], palette[0]),
    )
    for mask, offset, face, edge in groups:
        ax.scatter(probs[mask], np.zeros(mask.sum()) + offset, c=face, s=150, zorder=2, edgecolor=edge, linewidth=3)

    ax.set_xlabel(f'Threshold = {threshold}')

    if annot:
        labels = (
            ('TN', (.20, .03)),
            ('FN', (.20, -.08)),
            ('FP', (.70, .03)),
            ('TP', (.70, -.08)),
        )
        for text, position in labels:
            ax.annotate(text, xy=position, c='k', weight='bold', fontsize=20)
    return ax

## Architecture Class


In [None]:
class Architecture(object):
    """Training harness that ties together a model, a loss function and an optimizer.

    It records the per-epoch training and validation losses and the number of
    epochs run so far, and offers helpers for training, checkpointing,
    predicting and plotting the loss curves.
    """

    def __init__(self, model, loss_fn, optimizer):
        """Store the training components and move the model to the default device.

        Args:
            model: The ``torch.nn.Module`` to train.
            loss_fn: Callable invoked as ``loss_fn(predictions, targets)``.
            optimizer: Optimizer whose ``step()``/``zero_grad()`` update the model.
        """
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        # Prefer the GPU when one is available
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)

        # Loaders are not known at creation time; see set_loaders()
        self.train_loader = None
        self.val_loader = None

        # Bookkeeping filled in by train()
        self.losses = []
        self.val_losses = []
        self.total_epochs = 0

        # The step functions are closures over self, so they only take (x, y)
        self.train_step_fn = self._make_train_step_fn()
        self.val_step_fn = self._make_val_step_fn()

    def to(self, device):
        """Move the model to ``device``; fall back to the default device on RuntimeError."""
        try:
            self.device = device
            self.model.to(self.device)
        except RuntimeError:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print(f"Couldn't send it to {device}, sending it to {self.device} instead.")
            self.model.to(self.device)

    def set_loaders(self, train_loader, val_loader=None):
        """Register the training loader and, optionally, the validation loader."""
        self.train_loader = train_loader
        self.val_loader = val_loader

    def _make_train_step_fn(self):
        """Build the function that performs one optimization step on a mini-batch."""
        def perform_train_step_fn(x, y):
            # Sets model to TRAIN mode
            self.model.train()

            # Step 1 - forward pass
            yhat = self.model(x)
            # Step 2 - loss
            loss = self.loss_fn(yhat, y)
            # Step 3 - gradients of the loss w.r.t. the model parameters
            loss.backward()
            # Step 4 - parameter update, then clear gradients for the next batch
            self.optimizer.step()
            self.optimizer.zero_grad()

            return loss.item()

        return perform_train_step_fn

    def _make_val_step_fn(self):
        """Build the function that evaluates the loss on a mini-batch without updating."""
        def perform_val_step_fn(x, y):
            # Sets model to EVAL mode
            self.model.eval()

            yhat = self.model(x)
            loss = self.loss_fn(yhat, y)
            # No backward pass / optimizer step: parameters are not updated here
            return loss.item()

        return perform_val_step_fn

    def _mini_batch(self, validation=False):
        """Run one pass over a loader and return the mean mini-batch loss.

        Args:
            validation: Use the validation loader and step function when true,
                the training ones otherwise.

        Returns:
            The mean of the mini-batch losses, or ``None`` when the selected
            loader has not been set.
        """
        if validation:
            data_loader = self.val_loader
            step_fn = self.val_step_fn
        else:
            data_loader = self.train_loader
            step_fn = self.train_step_fn

        if data_loader is None:
            return None

        mini_batch_losses = []
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)

            mini_batch_loss = step_fn(x_batch, y_batch)
            mini_batch_losses.append(mini_batch_loss)

        loss = np.mean(mini_batch_losses)
        return loss

    def set_seed(self, seed=42):
        """Seed PyTorch and NumPy and request deterministic cuDNN behavior."""
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)

    def train(self, n_epochs, seed=42):
        """Train for ``n_epochs`` epochs, recording one loss value per epoch.

        ``val_losses`` receives ``None`` for every epoch run without a
        validation loader.
        """
        # To ensure reproducibility of the training process
        self.set_seed(seed)

        for epoch in range(n_epochs):
            self.total_epochs += 1

            # Training pass over all mini-batches
            loss = self._mini_batch(validation=False)
            self.losses.append(loss)

            # Validation pass - no gradients needed
            with torch.no_grad():
                val_loss = self._mini_batch(validation=True)
                self.val_losses.append(val_loss)

    def save_checkpoint(self, filename):
        """Save everything needed to resume training to ``filename``."""
        checkpoint = {'epoch': self.total_epochs,
                      'model_state_dict': self.model.state_dict(),
                      'optimizer_state_dict': self.optimizer.state_dict(),
                      'loss': self.losses,
                      'val_loss': self.val_losses}

        torch.save(checkpoint, filename)

    def load_checkpoint(self, filename):
        """Restore model, optimizer and loss history from ``filename``.

        The model is left in train mode so training can be resumed.
        """
        # map_location lets a checkpoint saved on a GPU be loaded on a
        # CPU-only machine (and vice versa).
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load checkpoint files from a trusted source.
        checkpoint = torch.load(filename, map_location=self.device, weights_only=False)

        # Restore state for model and optimizer
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        self.total_epochs = checkpoint['epoch']
        self.losses = checkpoint['loss']
        self.val_losses = checkpoint['val_loss']

        self.model.train() # always use TRAIN for resuming training

    def predict(self, x):
        """Return the model's raw output for ``x`` as a NumPy array.

        Args:
            x: Array-like input; it is converted to a float tensor.

        The model is switched to eval mode for the forward pass and put back
        in train mode afterwards, even if the forward pass raises.
        """
        self.model.eval()
        try:
            x_tensor = torch.as_tensor(x).float()
            # Inference only: do not build the autograd graph
            with torch.no_grad():
                y_hat_tensor = self.model(x_tensor.to(self.device))
        finally:
            self.model.train()
        # Detaches it, brings it to CPU and back to Numpy
        return y_hat_tensor.detach().cpu().numpy()

    def plot_losses(self,title=""):
        """Plot the loss curves on a log-scaled y axis and return the figure.

        The validation curve is drawn only when at least one validation loss
        was recorded.
        """
        fig = plt.figure(figsize=(10, 4))
        plt.plot(self.losses, label='Training Loss', c='b')
        # Epochs trained without a validation loader are recorded as None;
        # skip the curve entirely when nothing was recorded.
        if any(v is not None for v in self.val_losses):
            plt.plot(self.val_losses, label='Validation Loss', c='r')
        plt.title(title)
        plt.yscale('log')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.tight_layout()
        return fig

#Read Reduced CSV File from Github

In [None]:
# Raw-content URL of the reduced heart-disease dataset
url = "https://raw.githubusercontent.com/TomazFilgueira/UFRN-ML-2025-1-Heart_Disease_Classfication/main/0_data/reduced_heart_disease_dataset.csv" # Correct URL to raw file content

try:
  df = pd.read_csv(url)
  print("Successfully read CSV from GitHub")
  # Now you can work with the DataFrame 'df'
  print(df.head()) # Example: Display the first few rows

except Exception as e:
  # NOTE(review): the error is only printed; if the download fails, `df` stays
  # undefined and the following cells will raise NameError.
  print(f"An error occurred: {e}")

#Checkpoints

From this point on we will go through a series of checkpoints in order to classify our dataset using PyTorch.

Those checkpoints are:

1.  Data Preparation:
  * Create dummy variables
  * Create tensors
  * build train and validation dataset/dataloader

2. Configure Model: determine some parameters:
  * Which model should we use to classify binary output?
  * Define Stochastic Gradient Descent as the optimizer
  * Define a loss function for classification

3. Train the model itself using the `Architecture()` class

4. Validate the Model:
  * is the model accurate to our problem?
  * Let's evaluate some metrics such as:
    - Recall/Precision
    - Accuracy
    - True and False Positive Rates

5. Make Predictions

#1) Data Preparation

Most of the work was done during the EDA phase. However, a few things must be adapted for our classifier to work properly.

First, we need to convert the categorical features into dummy variables.

In [None]:
# Preview the first rows of the loaded DataFrame before encoding
df.head()

In [None]:
# Create dummy variables for categorical features
# (drop_first=True keeps k-1 indicator columns for a feature with k levels)
categorical_cols = ['sex','cp', 'restecg', 'exang', 'slope','thal','elderly']
df_dummy = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# Shape after encoding; the column count still includes 'target'
print(df_dummy.shape)

##Balanced target column
One of the biggest mistakes in a classification problem is to let the training and validation datasets have disproportionate values of the target column.

Because of that, we will calculate the proportion of each target class in the main dataset and keep the same ratio in the train/validation split.


In [None]:
# Calculate target proportion: relative frequency of each class in the full dataset
target_proportion = df['target'].value_counts(normalize=True)
# Last expression of the cell, so the notebook displays it
target_proportion


##Creating tensors and building train/validation dataset

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = df_dummy['target'].values
X = df_dummy.drop('target', axis=1).values

# Stratified 80/20 split so both subsets keep the target proportion
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df_dummy['target'],
)

# Fit the scaler on the training rows only, then apply it to both subsets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Convert data to PyTorch tensors; labels become a single column to match the model's single output
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

# Pair features and labels
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Mini-batches of 16 observations; only the training loader is shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# Share of positive labels in each subset: label sum divided by the number of rows
true_proportion_train = y_train.sum() / y_train.shape[0]
true_proportion_val = y_val.sum() / y_val.shape[0]

print(f"Proportion of True values in y_train: {true_proportion_train}")
print(f"Proportion of True values in y_val: {true_proportion_val}")


The code above just checks that the stratification on the target column was done correctly.

The proportion of true values in the training and validation datasets is the same as in the original dataframe.

#2) Configure Model

In this section we will configure our classification model.

Starting with the following hyper-parameters:
* lr = 0.05
* model: linear with 19 predictors (dataset features excluding the target column)
* Optimizer: Stochastic Gradient Descent
* Loss Function: Binary Cross-Entropy with Logits Loss. This loss function is widely used in classification problems because it works directly on the model's logit output, which can be converted into the probability of an event happening


In [None]:
# Sets learning rate - this is "eta" ~ the "n" like Greek letter
lr = 0.05

torch.manual_seed(42)
# Single linear layer producing one logit. The input width is taken from the
# prepared training matrix (19 columns for this dataset) so the layer always
# matches the number of encoded features.
model = nn.Sequential()
model.add_module('linear', nn.Linear(X_train.shape[1], 1))

# Defines a SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines a BCE loss function that takes logits (sigmoid is applied inside the loss)
loss_fn = nn.BCEWithLogitsLoss()

#3 and 4)Training/Validation

In [None]:
#set number of epochs
n_epochs = 200

#using Architecture class passing model, loss and optimizer as parameters
arch = Architecture(model, loss_fn, optimizer)
arch.set_loaders(train_loader, val_loader)
# train() already seeds with seed=42 by default, so this call repeats the same seeding
arch.set_seed(42)
arch.train(n_epochs)


In [None]:
# Training vs. validation loss per epoch for model 1 (log-scaled y axis)
fig = arch.plot_losses()

From the figure above we can see the training and validation loss curves.

It is easy to see that our model did not overfit, because the training and validation errors remained similar throughout the epochs. However, the validation error itself can still be improved.

In [None]:
# Show the learned parameters of the linear layer
print(model.state_dict())

##Metrics


In [None]:
def split_cm(cm):
    """Unpack a 2x2 confusion matrix into ``(tn, fp, fn, tp)``.

    Row 0 holds the actual negatives and row 1 the actual positives; in both
    rows, column 0 is "predicted negative" and column 1 "predicted positive".
    """
    negatives_row, positives_row = cm[0], cm[1]
    return negatives_row[0], negatives_row[1], positives_row[0], positives_row[1]

In [None]:
#From logit to probabilities
def sigmoid(z):
    """Map a logit (scalar or NumPy array) to a probability: 1 / (1 + e^-z)."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


# Raw model outputs (logits) for the scaled validation features
logits_val = arch.predict(X_val)
# Convert logits to probabilities and drop the size-1 dimensions
probabilities_val = sigmoid(logits_val).squeeze()
# Probabilities at or above this value are classified as positive
threshold = 0.5

#Confusion Matrix using SKlearn
cm_model1 = confusion_matrix(y_val, (probabilities_val >= threshold))
cm_model1

In [None]:
#Confusion Matrix from Validation dataset, drawn as a probability line
#(shift=0.05 separates the two classes vertically; True turns on the TN/FN/FP/TP labels)
fig = figure10(y_val, probabilities_val, threshold, 0.05, True,title="Confusion Matrix for Model 1")

##True and False Positive Rates
$$
\Large \text{TPR} = \frac{\text{TP}}{\text{TP + FN}} \ \ \  \text{FPR} = \frac{\text{FP}}{\text{FP + TN}}
$$

In [None]:
def tpr_fpr(cm):
    """Return ``(TPR, FPR)`` computed from a 2x2 confusion matrix.

    TPR = TP / (TP + FN) and FPR = FP / (FP + TN).
    """
    true_neg, false_pos, false_neg, true_pos = split_cm(cm)
    return true_pos / (true_pos + false_neg), false_pos / (false_pos + true_neg)


## Precision and Recall

$$
\Large \text{Recall} = \frac{\text{TP}}{\text{TP + FN}} \ \ \  \text{Precision} = \frac{\text{TP}}{\text{TP + FP}}
$$

In [None]:
def precision_recall(cm):
    """Return ``(precision, recall)`` computed from a 2x2 confusion matrix.

    Precision = TP / (TP + FP) and Recall = TP / (TP + FN).
    """
    _, false_pos, false_neg, true_pos = split_cm(cm)
    return true_pos / (true_pos + false_pos), true_pos / (true_pos + false_neg)

##Accuracy

$$
\Large \text{Accuracy} = \frac{\text{TP+TN}}{\text{TP+TN+FP+FN}}
$$

We can use `accuracy_score()` method directly from Sklearn

#Evaluation

In [None]:
# Validation metrics for model 1 at the current threshold
#True and False Positive Rates
tpr,fpr = tpr_fpr(cm_model1)
#Accuracy
acc = accuracy_score(y_val, (probabilities_val >= threshold))
#Precision-Recall
precision,recall = precision_recall(cm_model1)

# One print call; each entry ends with a newline and entries are separated by a blank line
print(
    f"Precision Metrics: {precision}\n",
    f"Recall Metrics: {recall}\n",
    f"Accuracy score {acc}\n",
    f"True Positive Rate:{tpr}\n",
    f"False Positive Rate:{fpr}\n",
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt

def plot_metrics(precision, recall, acc, tpr, fpr):
    """Show the classification metrics as three side-by-side bar charts.

    The panels are: precision and recall, TPR and FPR, and accuracy. Each bar
    is annotated with its value to four decimals; y ticks and grid lines are
    hidden.

    Args:
        precision, recall, acc, tpr, fpr: Metric values; the y axis is fixed
            to [0, 1.1].
    """
    # (title, bar labels, bar heights, bar colors) for each panel
    panels = (
        ("Precision and Recall", ['Precision', 'Recall'], [precision, recall], ['skyblue', 'lightcoral']),
        ("TPR and FPR", ['TPR', 'FPR'], [tpr, fpr], ['lightgreen', 'gold']),
        ("Accuracy", ['Accuracy'], [acc], ['lightblue']),
    )

    fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

    for ax, (title, labels, values, colors) in zip(axes, panels):
        ax.bar(labels, values, color=colors)
        ax.set_title(title)
        ax.set_ylim(0, 1.1)
        # grid(visible=None) *toggles* the grid, so the result depends on the
        # active style; pass False to turn it off unconditionally.
        ax.grid(False)
        ax.set_yticks([])  # Hide y-axis values

        # Write each value just above its bar
        for position, value in enumerate(values):
            ax.text(position, value + 0.02, f"{value:.4f}", ha='center', va='bottom', fontsize=15)

    fig.tight_layout()  # Adjust subplot parameters for a tight layout
    plt.show()



In [None]:
#Calling the plot metrics with model 1's validation metrics (threshold 0.5)
plot_metrics(precision, recall, acc, tpr, fpr)

#Conclusion
The first model was configured using the parameters below:

  * lr = 0.05
  * train/val split ratio = 0.2
  * number of epochs = 200
  * Optimizer: SGD

With this configuration we got the results from the confusion matrix and bar graph above:

  1. Precision Metrics: 0.75

  1. Recall Metrics: 0.833

  1. Accuracy score 0.812

  1. True Positive Rate:0.833

  1. False Positive Rate:0.204

Moreover, knowing that heart disease identification is a sensitive matter, we have to bear in mind the number of **False Negatives**, which means that an individual has a heart problem and our model did not identify it properly, leaving the person at serious risk!

For model 1, using a threshold of 0.5, we got 6 False Negative misclassifications.

It seems low, right? But the lower the better. Can we decrease this number even more?

For model 1 we used the whole heart disease dataset. However, during our EDA phase we identified that some features interact more with our target column than others. Let's filter our dataset down to these "best features".

#Read selected feature CSV

From now on we will be using the "selected" dataset. Here we will be using the numerical features whose correlation with the target column is greater than 0.2. Besides that, we will still be using all the categorical features presented before.

The features of this dataset are:

**categorical_cols**

 - sex
 - cp
 - restecg
 - exang
 - slope
 - thal
 - elderly

**Numerical**
 - Oldpeak
 - Thalachh

In [None]:
# Raw-content URL of the selected-features dataset
url = "https://raw.githubusercontent.com/TomazFilgueira/UFRN-ML-2025-1-Heart_Disease_Classfication/main/0_data/selected_features_dataset.csv" # Correct URL to raw file content

try:
  df_selected = pd.read_csv(url)
  print("Successfully read CSV from GitHub")
  # Preview the DataFrame that was just loaded (not the earlier `df`)
  print(df_selected.head())

except Exception as e:
  # NOTE(review): the error is only printed; if the download fails,
  # `df_selected` stays undefined and the following cells will raise NameError.
  print(f"An error occurred: {e}")

In [None]:
# Create dummy variables for categorical features
# (drop_first=True keeps k-1 indicator columns for a feature with k levels)
categorical_cols = ['sex','cp', 'restecg', 'exang', 'slope','thal','elderly']
df_selected_dummy = pd.get_dummies(df_selected, columns=categorical_cols, drop_first=True)
# Report the shape of the frame just built for model 2 (not model 1's df_dummy)
print(df_selected_dummy.shape)

In [None]:
# Separate the label column (y) from the predictor columns (X)
y_model2 = df_selected_dummy['target'].values
X_model2 = df_selected_dummy.drop('target', axis=1).values

# Stratified 80/20 split so both subsets keep the target proportion
X_train_model2, X_val_model2, y_train_model2, y_val_model2 = train_test_split(
    X_model2,
    y_model2,
    test_size=0.2,
    random_state=42,
    stratify=df_selected_dummy['target'],
)

# Fit a fresh scaler on the training rows only, then apply it to both subsets
# NOTE(review): this rebinds `scaler`, replacing the scaler fitted for model 1
scaler = StandardScaler().fit(X_train_model2)
X_train_model2 = scaler.transform(X_train_model2)
X_val_model2 = scaler.transform(X_val_model2)

# Convert data to PyTorch tensors; labels become a single column to match the model's single output
X_train_tensor_model2 = torch.tensor(X_train_model2, dtype=torch.float32)
X_val_tensor_model2 = torch.tensor(X_val_model2, dtype=torch.float32)
y_train_tensor_model2 = torch.tensor(y_train_model2, dtype=torch.float32).view(-1, 1)
y_val_tensor_model2 = torch.tensor(y_val_model2, dtype=torch.float32).view(-1, 1)

# Pair features and labels
train_dataset_model2 = TensorDataset(X_train_tensor_model2, y_train_tensor_model2)
val_dataset_model2 = TensorDataset(X_val_tensor_model2, y_val_tensor_model2)

# Mini-batches of 16 observations; only the training loader is shuffled
train_loader_model2 = DataLoader(train_dataset_model2, batch_size=16, shuffle=True)
val_loader_model2 = DataLoader(val_dataset_model2, batch_size=16, shuffle=False)

#Configure second model

In [None]:
# Sets learning rate - this is "eta" ~ the "n" like Greek letter
lr = 0.05

torch.manual_seed(42)
model2 = nn.Sequential()
# The input width is taken from the prepared training matrix (16 columns for
# the selected-features dataset, versus 19 for model 1) so the layer always
# matches the number of encoded features.
model2.add_module('linear', nn.Linear(X_train_model2.shape[1], 1))

# Defines a SGD optimizer to update the parameters
optimizer2 = optim.SGD(model2.parameters(), lr=lr)

# Defines a BCE loss function that takes logits (sigmoid is applied inside the loss)
loss_fn = nn.BCEWithLogitsLoss()

#Train second model

In [None]:
#set number of epochs
n_epochs = 200

#using Architecture class passing model, loss and optimizer as parameters
arch2 = Architecture(model2, loss_fn, optimizer2)
arch2.set_loaders(train_loader_model2, val_loader_model2)
# train() already seeds with seed=42 by default, so this call repeats the same seeding
arch2.set_seed(42)
arch2.train(n_epochs)


In [None]:
# Training vs. validation loss per epoch for model 2 (log-scaled y axis)
fig = arch2.plot_losses()

#Metrics for second model


In [None]:
# Raw model-2 outputs (logits) for its scaled validation features
# NOTE: this rebinds `logits_val`, which previously held model 1's logits
logits_val = arch2.predict(X_val_model2)
# Convert logits to probabilities and drop the size-1 dimensions
probabilities_model2_val = sigmoid(logits_val).squeeze()
# Probabilities at or above this value are classified as positive
threshold = 0.5
#Confusion Matrix using SKlearn
cm_model2 = confusion_matrix(y_val_model2, (probabilities_model2_val >= threshold))
cm_model2

In [None]:
#Confusion Matrix from Validation dataset, drawn as a probability line
#(shift=0.05 separates the two classes vertically; True turns on the TN/FN/FP/TP labels)
fig = figure10(y_val_model2, probabilities_model2_val, threshold, 0.05, True,title="Confusion Matrix for Model 2")

In [None]:
# Validation metrics for model 2 at the current threshold
#True and False Positive Rates
tpr2,fpr2 = tpr_fpr(cm_model2)
#Accuracy
acc2 = accuracy_score(y_val_model2, (probabilities_model2_val >= threshold))
#Precision-Recall
precision2,recall2 = precision_recall(cm_model2)

# One print call; each entry ends with a newline and entries are separated by a blank line
print(
    f"Precision Metrics: {precision2}\n",
    f"Recall Metrics: {recall2}\n",
    f"Accuracy score {acc2}\n",
    f"True Positive Rate:{tpr2}\n",
    f"False Positive Rate:{fpr2}\n",
    sep="\n",
)

In [None]:
#Calling the plot metrics with model 2's validation metrics (threshold 0.5)
plot_metrics(precision2, recall2, acc2, tpr2, fpr2)

In [None]:
#Comparing models
metrics_model1 = {'Precision': precision, 'Recall': recall, 'Accuracy': acc, 'TPR': tpr, 'FPR': fpr}
metrics_model2 = {'Precision': precision2, 'Recall': recall2, 'Accuracy': acc2, 'TPR': tpr2, 'FPR': fpr2}

metrics = ['Precision', 'Recall', 'Accuracy', 'TPR', 'FPR']

# Relative change of each metric, in percent, using model 1 as the baseline
percentage_increase = {}
for metric in metrics:
    baseline = metrics_model1[metric]
    if baseline == 0:
        # A relative change is undefined for a zero baseline (e.g. FPR == 0)
        percentage_increase[metric] = float('nan')
    else:
        percentage_increase[metric] = ((metrics_model2[metric] - baseline) / baseline) * 100

# Create bar plot
plt.figure(figsize=(6, 6))
plt.bar(metrics, list(percentage_increase.values()), color=['skyblue', 'lightcoral', 'lightgreen', 'gold', 'lightblue'])
plt.xlabel("Metrics")
plt.title("Percentage Increase in Metrics (Model 2 vs. Model 1)", pad=30)
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
# grid(visible=None) *toggles* the grid; pass False to turn it off unconditionally
plt.grid(False)
plt.yticks([])  #Hide y label


# Add percentage values on top of bars
for i, v in enumerate(percentage_increase.values()):
    if np.isnan(v):
        # No bar is drawn for an undefined change; label it at the baseline
        plt.text(i, 0, "n/a", ha='center', va='bottom', fontsize=10)
        continue
    plt.text(i, v + 0.1, f"{v:.2f}%", ha='center', va='bottom', fontsize=10)

plt.show()


#Conclusion Model 2

We can see that, using the same configuration as model 1 but with the selected features, we had a small increase in all metrics except the False Positive Rate.

Here is a summary of the changes:

- `Precision` : 1.59% increase. Reaching 0.7619
- `Recall` : 6.67% increase. Reaching 0.8889
- `Accuracy` : 2.90% increase. Reaching 0.8353
- `True Positive Rate` : 6.67% increase. Reaching 0.8889
- `False Positive Rate` did not change

Moreover, the number of False Negatives decreased from **6** people in the first model to **4** in model 2. It is a satisfactory model!

However, looking at the confusion matrix of model 2, we can decrease this number even further.


 What if we reduce the threshold value from 0.5 to 0.4?

#Comparing models with threshold 0.4

##Confusion Matrix - Model 1

In [None]:
# Recompute model 1's validation probabilities and classify with the lower threshold
logits_val = arch.predict(X_val)
probabilities_val = sigmoid(logits_val).squeeze()
threshold = 0.4

#Confusion Matrix using SKlearn
cm_model1_04 = confusion_matrix(y_val, (probabilities_val >= threshold))
# Display the matrix computed at threshold 0.4 (not the 0.5 one in cm_model1)
cm_model1_04

In [None]:
#Confusion Matrix from Validation dataset for model 1, drawn with the current threshold (0.4)
fig = figure10(y_val, probabilities_val, threshold, 0.05, True,title="Confusion Matrix for Model 1")

In [None]:
# Validation metrics for model 1 at threshold 0.4
#True and False Positive Rates
tpr04,fpr04 = tpr_fpr(cm_model1_04)
#Accuracy
acc04 = accuracy_score(y_val, (probabilities_val >= threshold))
#Precision-Recall
precision04,recall04 = precision_recall(cm_model1_04)

# One print call; each entry ends with a newline and entries are separated by a blank line
print(
    f"Precision Metrics: {precision04}\n",
    f"Recall Metrics: {recall04}\n",
    f"Accuracy score {acc04}\n",
    f"True Positive Rate:{tpr04}\n",
    f"False Positive Rate:{fpr04}\n",
    sep="\n",
)

#Calling the plot metrics
plot_metrics(precision04, recall04, acc04, tpr04, fpr04)

##Confusion Matrix - Model 2

In [None]:
# Recompute model 2's validation probabilities and classify with the lower threshold
logits_val = arch2.predict(X_val_model2)
probabilities_model2_val = sigmoid(logits_val).squeeze()
threshold = 0.4
#Confusion Matrix using SKlearn
cm_model2_04 = confusion_matrix(y_val_model2, (probabilities_model2_val >= threshold))
# Display the matrix computed at threshold 0.4 (not the 0.5 one in cm_model2)
cm_model2_04

In [None]:
#Confusion Matrix from Validation dataset for model 2, drawn with the current threshold (0.4)
fig = figure10(y_val_model2, probabilities_model2_val, threshold, 0.05, True,title="Confusion Matrix for Model 2")

In [None]:
# Validation metrics for model 2 at threshold 0.4
#Precision-Recall
precision2_04,recall2_04= precision_recall(cm_model2_04)
#Accuracy
acc2_04 = accuracy_score(y_val_model2, (probabilities_model2_val >= threshold))
#True and False Positive Rates
tpr2_04,fpr2_04 = tpr_fpr(cm_model2_04)

print(f"Precision Metrics: {precision2_04}\n")
print(f"Recall Metrics: {recall2_04}\n")
print(f"Accuracy score {acc2_04}\n")
print(f"True Positive Rate:{tpr2_04}\n")
print(f"False Positive Rate:{fpr2_04}\n")

#Calling the plot metrics with model 2's values (the model-1 variables were
#being plotted here, so the chart did not match the numbers printed above)
plot_metrics(precision2_04, recall2_04, acc2_04, tpr2_04, fpr2_04)

#Project Conclusion
After comparing the models with a 0.4 threshold, we could see that both behave similarly. In other words, both models give the same metrics.

Both models give us the following output on the validation set:

- `Precision` :  0.7727
- `Recall` :  0.9444
- `Accuracy` :  0.8588
- `True Positive Ratio` : 0.9444
- `False Positive Ratio`: 0.2041
- `False negative number`: 2 individuals



However, the second model tends to be better because it uses fewer features than the first one. Hence we will choose **model 2 to put in production**.

#Miscellaneous


In [None]:
# Fit LazyClassifier on model 1's scaled train/validation split
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# The validation split is passed in the test-set positions of fit()
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# `models` is used below as a DataFrame of metrics indexed by model name
print(models)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches  # Import patches for highlighting

def plot_lazy_model_metrics(models):
    """Plots bar graph of metrics from LazyClassifier results.

    One subplot is drawn per metric (Accuracy, Balanced Accuracy, ROC AUC),
    with the models sorted from best to worst for that metric. When a row
    named 'LogisticRegression' is present, its bar is outlined in red.

    Args:
        models: DataFrame of model metrics from LazyClassifier.fit(), indexed
            by model name.
    """

    metrics = ['Accuracy', 'Balanced Accuracy', 'ROC AUC']
    fig, axes = plt.subplots(len(metrics), 1, figsize=(10, 6 * len(metrics)))

    for ax, metric in zip(axes, metrics):
        # Bars are drawn in this sorted order, so every position lookup below
        # must use the sorted frame rather than the original one.
        ranked = models.sort_values(by=metric, ascending=False)
        ranked.plot(kind='bar', y=metric, ax=ax)
        ax.set_title(f"Models Performance - {metric}")
        ax.set_xlabel("")
        ax.set_ylabel(metric)
        ax.tick_params(axis='x', rotation=90, labelsize=8)
        # grid(visible=None) *toggles* the grid; pass False to turn it off unconditionally
        ax.grid(False)
        ax.legend().remove()

        # Highlight Logistic Regression bar (skipped when that model is absent)
        if 'LogisticRegression' in ranked.index:
            # Position of the bar within the sorted plot
            x_coord = ranked.index.get_loc('LogisticRegression')

            # Create a rectangle patch for highlighting
            rect = patches.Rectangle((x_coord - 0.4, 0.009), 0.8, ranked.loc['LogisticRegression', metric],
                                    linewidth=2, edgecolor='red', facecolor='none')

            # Add the rectangle patch to the subplot
            ax.add_patch(rect)

    plt.tight_layout()
    plt.show()

plot_lazy_model_metrics(models)