In [None]:
import numpy as np
from numpy.core.multiarray import ndarray

np.random.seed(33)

# Logistic Regression

## Mathematical description 

$\normalsize \mathbf{z} = \mathbf{X} \cdot \mathbf{w}^T$

$\normalsize \mathbf{y} = \sigma(\mathbf{z}) = \frac{1}{1 + e^{-\mathbf{z}}} \Rightarrow e^{-\mathbf{z}} = \frac{1 - \sigma(\mathbf{z})}{\sigma(\mathbf{z})}$

$ $

Loss function - negative log likelihood

$\normalsize \xi(\mathbf{t}, \mathbf{y}) = \sum \left(-\mathbf{t} \log(\mathbf{y}) - (1-\mathbf{t})\log(1-\mathbf{y})\right) = -\sum \left(\mathbf{t} \log(\mathbf{y}) + (1-\mathbf{t})\log(1-\mathbf{y})\right)$


$ $

$\normalsize \mathbf{w}(k+1) = \mathbf{w}(k) - \Delta \mathbf{w}(k+1)$.

$\normalsize \Delta \mathbf{w} = \mu \frac{\partial \xi}{\partial \mathbf{w}}$

$ $

$\normalsize \frac{\partial \xi}{\partial \mathbf{w}} = \frac{\partial \xi}{\partial \mathbf{y}} \frac{\partial \mathbf{y}}{\partial \mathbf{z}} \frac{\partial \mathbf{z}}{\partial \mathbf{w}}$

$\normalsize \frac{\partial \mathbf{z}}{\partial \mathbf{w}} = (\mathbf{X} \cdot \mathbf{w}^T)^{'} = \mathbf{X}^T$

$\normalsize \frac{\partial \mathbf{y}}{\partial \mathbf{z}} = (\frac{1}{1 + e^{-\mathbf{z}}})^{'} = - \frac{(1+e^{-\mathbf{z}})^{'}}{(1 + e^{-\mathbf{z}})^2} = \frac{e^{-\mathbf{z}}}{(1 + e^{-\mathbf{z}})^2} = \frac {\frac {1 - \sigma(\mathbf{z})} {\sigma(\mathbf{z})}} {\frac {1} {\sigma^2(\mathbf{z})}} = \sigma^2(\mathbf{z}) \frac {1 - \sigma(\mathbf{z})} {\sigma(\mathbf{z})} = \sigma(\mathbf{z})(1 - \sigma(\mathbf{z})) = \mathbf{y} (1 - \mathbf{y}) $

$\normalsize \frac{\partial \xi}{\partial \mathbf{y}} = (-\mathbf{t} log(\mathbf{y}) - (1-\mathbf{t})log(1-\mathbf{y}))^{'} = (-\mathbf{t} log(\mathbf{y}))^{'} - ((1-\mathbf{t})log(1-\mathbf{y}))^{'} = -\frac{\mathbf{t}}{\mathbf{y}}+\frac{1-\mathbf{t}}{1-\mathbf{y}} = \frac{-\mathbf{t}(1-\mathbf{y})+\mathbf{y}(1-\mathbf{t})}{\mathbf{y}(1-\mathbf{y})}=$

$ = \frac{-\mathbf{t}+\mathbf{ty}+\mathbf{y}-\mathbf{yt}}{\mathbf{y}(1-\mathbf{y})} = \frac{\mathbf{y} - \mathbf{t}}{\mathbf{y} (1 - \mathbf{y})}$

$\normalsize \frac{\partial \xi}{\partial \mathbf{w}} 
= \frac{\partial \xi}{\partial \mathbf{y}} \frac{\partial \mathbf{y}}{\partial \mathbf{z}} \frac{\partial \mathbf{z}}{\partial \mathbf{w}} 
= \frac{\mathbf{y} - \mathbf{t}}{\mathbf{y} (1-\mathbf{y})} \cdot \mathbf{y} (1 - \mathbf{y}) \cdot \mathbf{X}^T    
= (\mathbf{y}-\mathbf{t}) \cdot \mathbf{X}^T $

$ $

$\normalsize \Delta \mathbf{w} = \mu \cdot \frac{\partial \xi}{\partial \mathbf{w}} = \mu \cdot (\mathbf{y}-\mathbf{t}) \cdot \mathbf{X}^T $

## Implementation in Python

In [None]:
def negative_log_likelihood(y_true: ndarray, y_pred: ndarray) -> float:
    """Return the mean negative log likelihood (binary cross-entropy).

    Note that the loss is averaged over the samples (``np.mean``), i.e. it is
    the summed negative log likelihood divided by the number of samples.

    Args:
        y_true: Target labels, expected to be 0 or 1.
        y_pred: Predicted probabilities of the positive class.

    Returns:
        The average loss over all samples. The result is always finite, even
        when a prediction is exactly 0 or 1.
    """
    # A saturated sigmoid yields probabilities of exactly 0.0 or 1.0, for
    # which log() is -inf and ``0 * -inf`` is nan. Clipping to the open
    # interval (0, 1) keeps the loss finite without noticeably changing it.
    eps = np.finfo(float).eps
    y_prob = np.clip(np.asarray(y_pred, dtype=float), eps, 1.0 - eps)
    loss = -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
    return loss


def sigmoid(x: ndarray) -> ndarray:
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The computation is arranged so that ``exp`` is only ever evaluated on
    non-positive values; it therefore cannot overflow, no matter how large
    the magnitude of ``x`` is.

    Args:
        x: Scalar or array of real values.

    Returns:
        Values in the closed interval [0, 1] with the same shape as ``x``
        (a NumPy scalar when ``x`` is a scalar).
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of higher rank untouched.
    return result[()]


class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model predicts ``sigmoid(X @ weights + intercept)``. No
    regularization is applied.
    """

    def __init__(self) -> None:
        # Learned coefficients, one per feature; None until fit() is called.
        self.weights = None
        # Learned bias term; None until fit() is called.
        self.intercept = None
        # Loss value recorded after every epoch of the most recent fit().
        self.errors = list()

    def fit(self, X_train: ndarray, y: ndarray, n_epochs: int, learning_rate: float) -> None:
        """Fit the model to the training data.

        Weights are initialised from a standard normal distribution and
        updated with the summed gradient ``(sigmoid(X @ w) - y) @ X``.

        Args:
            X_train: Training samples of shape (n_samples, n_features).
            y: Target labels (0 or 1), one per sample.
            n_epochs: Number of gradient-descent steps to perform.
            learning_rate: Step size multiplied with the gradient.

        Raises:
            ValueError: If ``X_train`` and ``y`` differ in number of samples.
        """
        features = np.asarray(X_train, dtype=float)
        # Flatten the targets so that a column vector does not broadcast the
        # residual into an (n_samples, n_samples) matrix.
        targets = np.asarray(y, dtype=float).reshape(-1)
        if len(features) != len(targets):
            raise ValueError(
                f"X_train has {len(features)} samples but y has {len(targets)}."
            )

        # Append a column of ones so that the intercept is learned as the
        # last component of the weight vector.
        X = np.hstack((features, np.ones((len(features), 1))))
        w = np.random.randn(X.shape[1])

        # Training restarts from fresh random weights, so the loss history of
        # any previous fit is no longer meaningful.
        self.errors = []

        for _ in range(n_epochs):
            # The prediction must pass through the sigmoid; using the raw
            # linear output here would give the linear-regression gradient.
            y_pred = sigmoid(X @ w)
            gradient_step = learning_rate * (y_pred - targets) @ X
            w -= gradient_step
            # Record the loss of the updated weights.
            self.errors.append(negative_log_likelihood(targets, sigmoid(X @ w)))

        self.weights = w[:-1]
        self.intercept = w[-1]

    def predict(self, X_test: ndarray, return_probability: bool = True) -> ndarray:
        """Predict the positive class for the given samples.

        Args:
            X_test: Samples of shape (n_samples, n_features).
            return_probability: If True, return probabilities; otherwise
                return the probabilities rounded to 0 or 1.

        Returns:
            One prediction per sample.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.weights is None:
            raise RuntimeError("This logistic regression instance is not fitted yet.")
        y_pred = sigmoid(X_test @ self.weights + self.intercept)
        if not return_probability:
            y_pred = np.around(y_pred)
        return y_pred

This implementation does not use regularization. For logistic regression, however, regularization is very important because of the asymptotic nature of the sigmoid: an unregularized optimizer will keep enlarging the weights, pushing Xw as far as possible to the left or right for each sample, in order to reduce the loss as much as possible.