<span style="font-size:36px;">Implement LogisticRegression</span>

In [373]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

SEED = 1234
np.random.seed(SEED)


class LogisticRegression(object):
    """Binary logistic regression fitted with full-batch gradient descent.

    When ``isNormalize`` is true, ``train`` stores the per-feature mean and
    standard deviation of the training data (``X_mean`` / ``X_std``) and fits
    on standardized features. ``model``, ``classify`` and ``compute_cost`` do
    not standardize their input, so callers must pass data through
    ``normalize`` first when the model was trained that way.
    """

    def __init__(self, isNormalize) -> None:
        """Remember whether ``train`` should standardize the training features."""
        self.isNormalize = isNormalize

    def train(
        self,
        X_train,
        Y_train,
        weights,
        bias,
        learning_rate,
        iterations,
    ):
        """Fit the model and return the final parameters plus the full history.

        Args:
            X_train: Feature matrix, one row per sample.
            Y_train: Labels (0/1), one per sample.
            weights: Initial weight vector.
            bias: Initial bias.
            learning_rate: Step size used for every gradient-descent update.
            iterations: Number of gradient-descent updates to perform.

        Returns:
            ``(w_best, b_best, train_history)`` where ``train_history`` is the
            tuple ``(w_history, b_history, cost_history)``. With
            ``iterations == 0`` no update is made and the initial
            ``weights`` / ``bias`` are returned unchanged.
        """
        if self.isNormalize:
            # Statistics come from the training data only; normalize() reuses
            # them for any later input.
            self.X_mean = np.mean(X_train, axis=0)
            self.X_std = np.std(X_train, axis=0)
            X_train = self.normalize(X_train)

        w_history, b_history, cost_history = self.gradient_descent(
            X_train, Y_train, weights, bias, learning_rate, iterations
        )

        if w_history:
            w_best, b_best = w_history[-1], b_history[-1]
        else:
            # Zero iterations: nothing was learned, so the starting point is
            # the best (and only) parameter set.
            w_best, b_best = weights, bias
        train_history = (w_history, b_history, cost_history)

        return w_best, b_best, train_history

    def sigmoid(self, z):
        """Return the logistic function of ``z``, kept strictly inside (0, 1).

        The logits are clipped before exponentiation so ``np.exp`` cannot
        overflow, and results that round to exactly 0 or 1 are nudged by a
        small epsilon so the logarithms in ``cross_entropy`` stay finite.
        The guard is applied in every mode: standardized features can still
        produce saturated logits.
        """
        # Beyond +/-500 the result is already saturated in float64, so the
        # clip does not change any representable output.
        z = np.clip(z, -500, 500)
        sigmoid_values = 1 / (1 + np.exp(-z))

        epsilon = 1e-10
        sigmoid_values = np.where(
            sigmoid_values == 1, sigmoid_values - epsilon, sigmoid_values
        )
        sigmoid_values = np.where(
            sigmoid_values == 0, sigmoid_values + epsilon, sigmoid_values
        )
        return sigmoid_values

    def model(self, X, w, b):
        """Return predicted probabilities ``sigmoid(X @ w + b)``."""
        logist = np.matmul(X, w) + b
        return self.sigmoid(logist)

    def normalize(self, X):
        """Standardize ``X`` with the mean/std stored by ``train``.

        Features whose stored standard deviation is zero (constant columns)
        are only centred, which avoids a division by zero.
        """
        safe_std = np.where(self.X_std == 0, 1, self.X_std)
        return (X - self.X_mean) / safe_std

    def _align_targets(self, y_predict, y):
        """Return ``y`` reshaped to the shape of ``y_predict`` when sizes match.

        Prevents a ``(N, 1)`` prediction and a ``(N,)`` label vector from
        silently broadcasting to an ``(N, N)`` matrix. Inputs whose sizes
        differ are returned as an array without reshaping.
        """
        y = np.asarray(y)
        target_shape = np.shape(y_predict)
        if y.shape != target_shape and y.size == np.size(y_predict):
            y = y.reshape(target_shape)
        return y

    def cross_entropy(self, y_predict, y):
        """Return the mean binary cross-entropy between predictions and labels."""
        y = self._align_targets(y_predict, y)
        return -np.mean(y * np.log(y_predict) + (1 - y) * np.log(1 - y_predict))

    def compute_gradient(self, X, y, w, b):
        """Return ``(dw, db)``, the cost gradients averaged over ``len(X)`` samples."""
        N = len(X)
        y_predict = self.model(X, w, b)
        delta = y_predict - self._align_targets(y_predict, y)
        dw = 1 / N * np.matmul(X.T, delta)
        db = 1 / N * np.sum(delta)
        return dw, db

    def step_gradient(self, X, y, w, b, learning_rate):
        """Apply one gradient-descent update and return the new ``(w, b)``."""
        dw, db = self.compute_gradient(X, y, w, b)
        w = w - learning_rate * dw
        b = b - learning_rate * db
        return w, b

    def gradient_descent(self, X, y, w_start, b_start, learning_rate, num_epochs):
        """Run ``num_epochs`` updates and return ``(w_history, b_history, cost_history)``.

        The cost is recorded *before* each update, so ``cost_history[i]`` is
        the cost of the parameters that step ``i`` started from, while
        ``w_history[i]`` / ``b_history[i]`` hold the parameters after step
        ``i``. The cost of the final parameters is therefore not recorded.
        """
        w, b = w_start, b_start

        cost_history = []
        w_history = []
        b_history = []

        for _ in range(num_epochs):
            cost_history.append(self.compute_cost(X, y, w, b))
            w, b = self.step_gradient(X, y, w, b, learning_rate)
            w_history.append(w)
            b_history.append(b)
        return w_history, b_history, cost_history

    def compute_cost(self, X, y, w, b):
        """Return the cross-entropy cost of parameters ``(w, b)`` on ``(X, y)``."""
        y_predict = self.model(X, w, b)
        return self.cross_entropy(y_predict, y)

    def accuracy(self, y_predict, y):
        """Return the fraction of samples whose thresholded prediction equals ``y``.

        ``y_predict`` holds probabilities; values above 0.5 count as class 1.
        """
        y_predict = np.asarray(y_predict)
        y = self._align_targets(y_predict, y)
        y_predict_class = np.where(y_predict > 0.5, 1, 0)
        return np.mean(y_predict_class == y)

    def classify(self, X, w, b):
        """Return hard 0/1 labels for ``X`` using a 0.5 probability threshold."""
        y_predict = self.model(X, w, b)
        return np.where(y_predict > 0.5, 1, 0)

<span style="font-size:36px;">Load data and initialize parameters</span>

In [374]:
# Read the dataset; the "Diagnosis" column supplies the labels and every other
# column is used as a feature.
df = pd.read_csv("data.csv")

# Labels are kept as a column vector so they line up with the (N, 1) model output.
y = df["Diagnosis"].values.reshape(-1, 1)
X = df.drop(columns=["Diagnosis"]).values
N, D = X.shape

# Hold out 25% of the rows for testing; the split is reproducible via SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.25
)

# Random starting point shared by the experiments below. The weights are drawn
# before the bias so the sequence of RNG draws stays the same.
weights, bias = np.random.randn(D, 1), np.random.randn()

<span style="font-size:36px;">Use LogisticRegression without normalization</span>

In [375]:
# Fit on the raw (unscaled) features, starting from the shared initial parameters.
algorithm_not_normalized = LogisticRegression(isNormalize=False)

w_best, b_best, train_history = algorithm_not_normalized.train(
    X_train, y_train, weights, bias, learning_rate=0.001, iterations=100
)

# --- Training split: probabilities, hard labels, accuracy ---
y_predict = algorithm_not_normalized.model(X_train, w_best, b_best)
y_class = algorithm_not_normalized.classify(X_train, w_best, b_best)
accuracy = algorithm_not_normalized.accuracy(y_predict, y_train)

print(f"Train Accuracy: {accuracy}")

y_train_sample = y_train[:5]
y_predict_sample = y_predict[:5]
y_class_sample = y_class[:5]

# Kept under its own name: rebinding `df` here would overwrite the dataset
# frame loaded earlier in the notebook.
train_preview = pd.DataFrame(
    {
        "y_train": y_train_sample.flatten(),
        "y_class": y_class_sample.flatten(),
        "y_predict": y_predict_sample.flatten(),
    }
)
print(train_preview)
print()


# --- Test split: same report on the held-out rows ---
y_predict = algorithm_not_normalized.model(X_test, w_best, b_best)
y_class = algorithm_not_normalized.classify(X_test, w_best, b_best)
accuracy = algorithm_not_normalized.accuracy(y_predict, y_test)

print(f"Test Accuracy: {accuracy}")

y_test_sample = y_test[:5]
y_predict_sample = y_predict[:5]
y_class_sample = y_class[:5]

test_preview = pd.DataFrame(
    {
        "y_test": y_test_sample.flatten(),
        "y_class": y_class_sample.flatten(),
        "y_predict": y_predict_sample.flatten(),
    }
)
print(test_preview)

# Cost recorded at each epoch during training.
w_history, b_history, cost_history = train_history
fig = px.line(
    x=range(len(cost_history)),
    y=cost_history,
    title="Cost Function",
    labels={"x": "Epoch", "y": "Cost"},
)
fig.show()

Train Accuracy: 0.7276995305164319
   y_train  y_class      y_predict
0        0        0  6.418674e-144
1        1        0   1.060849e-30
2        0        0   5.546937e-60
3        1        0  1.721463e-155
4        0        0   2.693460e-77

Test Accuracy: 0.7272727272727273
   y_test  y_class     y_predict
0       0        0  4.886255e-50
1       0        0  7.208140e-62
2       0        0  4.904707e-82
3       0        0  3.892975e-60
4       0        0  6.339042e-69


<span style="font-size:36px;">Use LogisticRegression with normalization</span>

In [376]:
# Fit on standardized features, starting from the same initial parameters as
# the unscaled experiment.
algorithm_normalized = LogisticRegression(isNormalize=True)

w_best, b_best, train_history = algorithm_normalized.train(
    X_train, y_train, weights, bias, learning_rate=0.1, iterations=100
)

# --- Training split ---
# model()/classify() do not scale their input, so apply the statistics stored
# by train() explicitly.
X_train_normalized = algorithm_normalized.normalize(X_train)
y_predict = algorithm_normalized.model(X_train_normalized, w_best, b_best)
y_class = algorithm_normalized.classify(X_train_normalized, w_best, b_best)
accuracy = algorithm_normalized.accuracy(y_predict, y_train)

print(f"Train Accuracy: {accuracy}")

y_train_sample = y_train[:5]
y_predict_sample = y_predict[:5]
y_class_sample = y_class[:5]

# Kept under its own name: rebinding `df` here would overwrite the dataset
# frame loaded earlier in the notebook.
train_preview = pd.DataFrame(
    {
        "y_train": y_train_sample.flatten(),
        "y_class": y_class_sample.flatten(),
        "y_predict": y_predict_sample.flatten(),
    }
)
print(train_preview)
print()


# --- Test split: scaled with the training-set statistics ---
X_test_normalized = algorithm_normalized.normalize(X_test)
y_predict = algorithm_normalized.model(X_test_normalized, w_best, b_best)
y_class = algorithm_normalized.classify(X_test_normalized, w_best, b_best)
accuracy = algorithm_normalized.accuracy(y_predict, y_test)

print(f"Test Accuracy: {accuracy}")

y_test_sample = y_test[:5]
y_predict_sample = y_predict[:5]
y_class_sample = y_class[:5]

test_preview = pd.DataFrame(
    {
        "y_test": y_test_sample.flatten(),
        "y_class": y_class_sample.flatten(),
        "y_predict": y_predict_sample.flatten(),
    }
)
print(test_preview)

# Cost recorded at each epoch during training.
w_history, b_history, cost_history = train_history
fig = px.line(
    x=range(len(cost_history)),
    y=cost_history,
    title="Cost Function",
    labels={"x": "Epoch", "y": "Cost"},
)
fig.show()

Train Accuracy: 0.9553990610328639
   y_train  y_class  y_predict
0        0        0   0.122148
1        1        1   0.837923
2        0        0   0.007097
3        1        1   0.853440
4        0        0   0.123485

Test Accuracy: 0.951048951048951
   y_test  y_class  y_predict
0       0        0   0.029687
1       0        0   0.000375
2       0        0   0.283708
3       0        0   0.002135
4       0        0   0.011731
