<span style="font-size:36px;">Implement LogisticRegression</span>

In [337]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

# One seed for the whole notebook: it seeds NumPy's global RNG here (used for
# the random initial weights/bias) and is passed as `random_state` to
# train_test_split further down.
SEED = 1234
np.random.seed(seed=SEED)


class LogisticRegression(object):
    """Binary logistic regression fitted with full-batch gradient descent.

    Features can optionally be standardised (zero mean, unit variance) with
    statistics computed on the training data. The class also bundles the
    evaluation helpers used in this notebook: accuracy, sensitivity,
    specificity, precision, F1 and ROC / AUROC.
    """

    # Logits are clipped to +/- this bound before exponentiation so that
    # np.exp can never overflow (exp(500) is still representable).
    _LOGIT_CLIP = 500
    # Probabilities are capped at 1 - _PROB_EPS so that log(1 - p) stays finite.
    _PROB_EPS = 1e-10

    def __init__(self, isNormalize) -> None:
        """Store whether features are standardised before training/prediction."""
        self.isNormalize = isNormalize

    def train(
        self,
        X_train,
        Y_train,
        weights,
        bias,
        learning_rate,
        iterations,
    ):
        """Fit the model by gradient descent, starting from `weights` / `bias`.

        When `isNormalize` is set, the per-feature mean and standard deviation
        of `X_train` are stored on the instance (`X_mean`, `X_std`) and the
        training data is standardised with them before fitting.

        Returns:
            (w_best, b_best, train_history), where `train_history` is the tuple
            (w_history, b_history, cost_history) produced by
            `gradient_descent`, and `w_best` / `b_best` are the parameters
            after the last update (the initial ones if `iterations` is 0).
        """
        if self.isNormalize:
            self.X_mean = np.mean(X_train, axis=0)
            std = np.std(X_train, axis=0)
            # A constant feature has std == 0; dividing by it would turn the
            # whole column (and then the weights) into NaN. Using 1 instead
            # leaves such a column at exactly 0 after centring.
            self.X_std = np.where(std == 0, 1.0, std)
            X_train = self.normalize(X_train)

        w_history, b_history, cost_history = self.gradient_descent(
            X_train, Y_train, weights, bias, learning_rate, iterations
        )

        # With zero iterations the histories are empty: fall back to the
        # starting parameters instead of raising IndexError.
        w_best = w_history[-1] if w_history else weights
        b_best = b_history[-1] if b_history else bias
        train_history = (w_history, b_history, cost_history)

        return w_best, b_best, train_history

    def sigmoid(self, z):
        """Numerically safe logistic function 1 / (1 + exp(-z)).

        The result is always finite, strictly inside (0, 1) and non-decreasing
        in `z`, regardless of `isNormalize`.
        """
        # Prevent overflow in np.exp for very negative logits. After clipping,
        # the smallest possible output is 1 / (1 + exp(500)) > 0, so the low
        # side needs no further adjustment.
        z = np.clip(z, -self._LOGIT_CLIP, self._LOGIT_CLIP)
        sigmoid_values = 1 / (1 + np.exp(-z))
        # On the high side the result rounds to exactly 1.0 for large logits,
        # which would make log(1 - p) infinite. Capping (rather than nudging
        # only exact 1.0 values) keeps the function monotonic, so saturated
        # predictions never rank below nearly-saturated ones.
        return np.minimum(sigmoid_values, 1 - self._PROB_EPS)

    def model(self, X, w, b):
        """Return predicted probabilities sigmoid(X @ w + b)."""
        logist = np.matmul(X, w) + b
        return self.sigmoid(logist)

    def normalize(self, X):
        """Standardise `X` with the statistics stored by `train`.

        Requires `X_mean` and `X_std` to have been set, i.e. `train` must have
        run with `isNormalize` enabled.
        """
        return (X - self.X_mean) / self.X_std

    def cross_entropy(self, y_predict, y):
        """Mean binary cross-entropy between probabilities and 0/1 labels."""
        return -np.mean(y * np.log(y_predict) + (1 - y) * np.log(1 - y_predict))

    def compute_cost(self, X, y, w, b):
        """Cross-entropy cost of the model with parameters `w`, `b` on (X, y)."""
        y_predict = self.model(X, w, b)
        return self.cross_entropy(y_predict, y)

    def compute_gradient(self, X, y, w, b):
        """Gradient of the mean cross-entropy with respect to `w` and `b`."""
        N = len(X)
        y_predict = self.model(X, w, b)
        delta = y_predict - y
        dw = 1 / N * np.matmul(X.T, delta)
        db = 1 / N * np.sum(delta)
        return dw, db

    def step_gradient(self, X, y, w, b, learning_rate):
        """Apply one gradient-descent update and return the new (w, b)."""
        dw, db = self.compute_gradient(X, y, w, b)
        w = w - learning_rate * dw
        b = b - learning_rate * db
        return w, b

    def gradient_descent(self, X, y, w_start, b_start, learning_rate, num_epochs):
        """Run `num_epochs` full-batch updates from (w_start, b_start).

        Returns:
            (w_history, b_history, cost_history). `cost_history[i]` is the cost
            measured *before* update i; `w_history[i]` / `b_history[i]` are the
            parameters *after* update i.
        """
        w, b = w_start, b_start

        cost_history = []
        w_history = []
        b_history = []

        for _ in range(num_epochs):
            cost_history.append(self.compute_cost(X, y, w, b))
            w, b = self.step_gradient(X, y, w, b, learning_rate)
            w_history.append(w)
            b_history.append(b)
        return w_history, b_history, cost_history

    def classify(self, y_predict):
        """Turn probabilities into 0/1 classes (1 when strictly above 0.5)."""
        return np.where(y_predict > 0.5, 1, 0)

    # Evaluation
    def _confusion_counts(self, y_predict, y):
        """Return (TP, FP, TN, FN) for predictions thresholded by `classify`.

        Both inputs are flattened first so that a (N,) prediction vector and a
        (N, 1) label column are compared element-wise instead of being
        broadcast into an (N, N) matrix.
        """
        predicted = np.ravel(self.classify(y_predict))
        actual = np.ravel(y)
        TP = np.sum((predicted == 1) & (actual == 1))
        FP = np.sum((predicted == 1) & (actual == 0))
        TN = np.sum((predicted == 0) & (actual == 0))
        FN = np.sum((predicted == 0) & (actual == 1))
        return TP, FP, TN, FN

    def accuracy(self, y_predict, y):
        """Fraction of samples whose thresholded prediction equals the label."""
        y_predict_class = np.ravel(self.classify(y_predict))
        return np.mean(y_predict_class == np.ravel(y))

    def sensitivity(self, y_predict, y):
        """True-positive rate TP / (TP + FN); 0 when there are no positives."""
        TP, _, _, FN = self._confusion_counts(y_predict, y)
        if TP + FN == 0:
            return 0
        return TP / (TP + FN)

    def specificity(self, y_predict, y):
        """True-negative rate TN / (TN + FP); 0 when there are no negatives."""
        _, FP, TN, _ = self._confusion_counts(y_predict, y)
        if TN + FP == 0:
            return 0
        return TN / (TN + FP)

    def precision(self, y_predict, y):
        """TP / (TP + FP); 0 when nothing is predicted positive."""
        TP, FP, _, _ = self._confusion_counts(y_predict, y)
        if TP + FP == 0:
            return 0
        return TP / (TP + FP)

    def ROC(self, y_predict, y):
        """Compute the ROC curve as a pair of lists (FPR, TPR).

        The curve starts at (0, 0) (threshold above every score, nothing
        predicted positive) and then has one point per distinct score, taken
        in descending order, where samples with score >= threshold are
        predicted positive.
        """
        scores = np.ravel(y_predict)
        labels = np.ravel(y)

        # Without the explicit origin the area to the left of the first point
        # is lost, e.g. all-tied scores would integrate to 0 instead of 0.5.
        FPR = [0.0]
        TPR = [0.0]
        # np.unique already returns sorted values; reverse for descending order.
        thresholds = np.unique(scores)[::-1]

        for threshold in thresholds:
            binary_predictions = (scores >= threshold).astype(int)
            TPR.append(self.sensitivity(binary_predictions, labels))
            FPR.append(1 - self.specificity(binary_predictions, labels))

        ROC_points = FPR, TPR
        return ROC_points

    def AUROC(self, FPR, TPR):
        """Area under the ROC curve by the trapezoidal rule."""
        # np.trapezoid was introduced in NumPy 2.0; older releases only
        # provide the same routine under the name np.trapz.
        trapezoid = getattr(np, "trapezoid", None)
        if trapezoid is None:
            trapezoid = np.trapz
        return trapezoid(TPR, FPR)

    def F1(self, y_predict, y):
        """Harmonic mean of precision and sensitivity; 0 when both are 0."""
        precision = self.precision(y_predict, y)
        sensitivity = self.sensitivity(y_predict, y)
        if precision + sensitivity == 0:
            return 0
        return 2 * (precision * sensitivity) / (precision + sensitivity)

<span style="font-size:36px;">Function to show results for specific parameters</span>

In [338]:
def _report_split(algorithm, X, y, w, b, split):
    """Evaluate one data split, print its metrics and return its ROC figure.

    Args:
        algorithm: a trained LogisticRegression instance.
        X, y: features and labels of the split (X is standardised here when
            the algorithm was created with normalisation enabled).
        w, b: model parameters to evaluate.
        split: display name of the split, e.g. "Train" or "Test"; used in the
            printed labels, the sample-table column name and the figure title.

    Returns:
        The plotly ROC figure for this split (not yet shown).
    """
    if algorithm.isNormalize:
        X = algorithm.normalize(X)
    y_predict = algorithm.model(X, w, b)
    y_class = algorithm.classify(y_predict)
    accuracy = algorithm.accuracy(y_predict, y)
    F1 = algorithm.F1(y_predict, y)

    # First five samples: true label, predicted class, predicted probability.
    df_sample = pd.DataFrame(
        {
            f"y_{split.lower()}": y[:5].flatten(),
            "y_class": y_class[:5].flatten(),
            "y_predict": y_predict[:5].flatten(),
        }
    )

    # ROC curve
    FPR, TPR = algorithm.ROC(y_predict, y)
    fig_ROC = px.line(
        pd.DataFrame({"FPR": FPR, "TPR": TPR}),
        x="FPR",
        y="TPR",
        markers=True,
        title=f"ROC Curve - {split}",
        labels={"FPR": "False Positive Rate (FPR)", "TPR": "True Positive Rate (TPR)"},
    )

    # Print the evaluations and sample predictions
    print(split.upper())
    print(f"{split} Accuracy: {accuracy}")
    print(f"{split} F1: {F1}")
    print(f"{split} AUROC: {algorithm.AUROC(FPR, TPR)}")
    print("\nSample predictions")
    print(df_sample)
    return fig_ROC


def show_results(
    X_train,
    y_train,
    X_test,
    y_test,
    weights,
    bias,
    learning_rate,
    iterations,
    isNormalize,
):
    """Train a LogisticRegression and report train and test performance.

    Trains from the given initial `weights` / `bias`, then for the train split
    and the test split prints accuracy, F1, AUROC and five sample predictions
    and shows the ROC curve. The training cost curve is shown together with
    the train results. Returns nothing.
    """
    algorithm = LogisticRegression(isNormalize)

    w_best, b_best, train_history = algorithm.train(
        X_train, y_train, weights, bias, learning_rate, iterations
    )

    # Only the cost trace is plotted; the parameter histories are not needed.
    _, _, cost_history = train_history
    fig_cost = px.line(
        x=range(len(cost_history)),
        y=cost_history,
        title="Cost Function",
        labels={"x": "Epoch", "y": "Cost"},
    )

    # TRAIN RESULTS
    fig_train_ROC = _report_split(
        algorithm, X_train, y_train, w_best, b_best, "Train"
    )
    fig_cost.show()
    fig_train_ROC.show()

    # TEST RESULTS
    fig_test_ROC = _report_split(algorithm, X_test, y_test, w_best, b_best, "Test")
    fig_test_ROC.show()

<span style="font-size:36px;">Load data and initialize parameters</span>

In [339]:
# Load the dataset; "Diagnosis" is the label, every other column is a feature.
df = pd.read_csv("data.csv")

y = df["Diagnosis"].values
X = df.drop(columns="Diagnosis").values
N, D = X.shape
# Labels as an (N, 1) column so they line up with the (N, 1) model output.
y = y.reshape(N, 1)

# Hold out 25% of the rows for testing, reproducibly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.25
)

# Random starting point for gradient descent, drawn from the seeded global RNG.
weights, bias = np.random.randn(D, 1), np.random.randn()

<span style="font-size:36px;">Use LogisticRegression without normalization (all features)</span>

In [340]:
# Raw (unnormalized) features: small learning rate, many iterations.
show_results(
    X_train,
    y_train,
    X_test,
    y_test,
    weights,
    bias,
    learning_rate=0.001,
    iterations=1000,
    isNormalize=False,
)

TRAIN
Train Accuracy: 0.9131455399061033
Train F1: 0.8737201365187713
Train AUROC: 0.9380105604621978

Sample predictions
   y_train  y_class     y_predict
0        0        0  8.158633e-44
1        1        1  9.999995e-01
2        0        0  4.816274e-31
3        1        0  9.502320e-44
4        0        0  1.634945e-26


TEST
Test Accuracy: 0.916083916083916
Test F1: 0.8846153846153846
Test AUROC: 0.9435950413223141

Sample predictions
   y_test  y_class     y_predict
0       0        0  1.466564e-32
1       0        0  3.734757e-43
2       0        0  3.031225e-44
3       0        0  7.480526e-61
4       0        0  1.136000e-17


<span style="font-size:36px;">Use LogisticRegression with normalization (all features)</span>

In [341]:
# Standardized features: larger learning rate, fewer iterations.
show_results(
    X_train,
    y_train,
    X_test,
    y_test,
    weights,
    bias,
    learning_rate=0.1,
    iterations=100,
    isNormalize=True,
)

TRAIN
Train Accuracy: 0.9553990610328639
Train F1: 0.939297124600639
Train AUROC: 0.9914521819430303

Sample predictions
   y_train  y_class  y_predict
0        0        0   0.122148
1        1        1   0.837923
2        0        0   0.007097
3        1        1   0.853440
4        0        0   0.123485


TEST
Test Accuracy: 0.951048951048951
Test F1: 0.9320388349514563
Test AUROC: 0.974586776859504

Sample predictions
   y_test  y_class  y_predict
0       0        0   0.029687
1       0        0   0.000375
2       0        0   0.283708
3       0        0   0.002135
4       0        0   0.011731


<span style="font-size:36px;">Data - select only a subset of the features</span>

In [342]:
# Reload the dataset and keep only the columns at positions 15..21 as features.
# This cell rebinds X, y, the train/test split, weights and bias, so the cells
# below operate on the reduced feature set.
df = pd.read_csv("data.csv")

# NOTE(review): the slice is positional; it assumes the "Diagnosis" label is
# not one of columns 15-21 (otherwise the label leaks into X) - confirm
# against the CSV layout.
X = df.iloc[:, 15:22].values
y = df["Diagnosis"].values
N, D = X.shape
# Labels as an (N, 1) column so they line up with the (N, 1) model output.
y = y.reshape(N, 1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)

# Fresh random starting point sized for the reduced number of features. The
# values depend on how many draws the global RNG has already produced, i.e.
# on the cells above having been run in order.
weights = np.random.randn(D, 1)
bias = np.random.randn()

<span style="font-size:36px;">Use LogisticRegression without normalization (selected features)</span>

In [343]:
# Selected features, raw (unnormalized).
show_results(
    X_train,
    y_train,
    X_test,
    y_test,
    weights,
    bias,
    learning_rate=0.01,
    iterations=1000,
    isNormalize=False,
)

TRAIN
Train Accuracy: 0.6830985915492958
Train F1: 0.28571428571428575
Train AUROC: 0.9332749271896386

Sample predictions
   y_train  y_class  y_predict
0        0        0   0.470385
1        1        0   0.471753
2        0        0   0.460490
3        1        0   0.483350
4        0        0   0.468367


TEST
Test Accuracy: 0.6853146853146853
Test F1: 0.34782608695652173
Test AUROC: 0.9024793388429753

Sample predictions
   y_test  y_class  y_predict
0       0        0   0.460995
1       0        0   0.453722
2       0        1   0.505563
3       0        0   0.455574
4       0        0   0.464861


<span style="font-size:36px;">Use LogisticRegression with normalization (selected features)</span>

In [344]:
# Selected features, standardized.
show_results(
    X_train,
    y_train,
    X_test,
    y_test,
    weights,
    bias,
    learning_rate=0.1,
    iterations=1000,
    isNormalize=True,
)

TRAIN
Train Accuracy: 0.9225352112676056
Train F1: 0.8888888888888888
Train AUROC: 0.9797078114270831

Sample predictions
   y_train  y_class  y_predict
0        0        0   0.365983
1        1        0   0.241511
2        0        0   0.036488
3        1        1   0.737066
4        0        0   0.112817


TEST
Test Accuracy: 0.9020979020979021
Test F1: 0.8653846153846154
Test AUROC: 0.9698347107438017

Sample predictions
   y_test  y_class  y_predict
0       0        0   0.019712
1       0        0   0.004760
2       0        0   0.444957
3       0        0   0.002579
4       0        0   0.098414
