In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random

# Logistic Regression

## Equations
$$z = Xw + b$$
$$y_{pred} = \sigma(z) = 1 / (1 + e^{-z}) = P(Y=1|X)$$ 
$$y_{output} = (y_{pred} > 0.5) \quad \text{(threshold)}$$

Remember: $\frac{\partial \sigma(z)}{\partial z} = \sigma(z) (1-\sigma(z))$


## Loss Function (Log loss/BCE/CE)
$$L(w,b) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y_i \log(y_{pred_i}) + (1-y_i) \log(1-y_{pred_i}) \right]$$

**Remember as Log-istic Regression** -- Inside the log we have the regression outputs, i.e. the predictions

## Gradients
**Start with $L$ for a single sample**:
$L = -[y \log(\hat{y}) + (1-y) \log(1-\hat{y})]$

**Gradient using the chain rule**
$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial \hat{y}} \times \frac{\partial \hat{y}}{\partial z} \times \frac{\partial z}{\partial w}$


#### Step 1: $\frac{\partial L}{\partial \hat{y}}$
$\frac{\partial L}{\partial \hat{y}} = -[\frac{y}{\hat{y}} - \frac{1-y}{1-\hat{y}}]$

#### Step 2: $\frac{\partial \hat{y}}{\partial z}$
$\frac{\partial \hat{y}}{\partial z} = \hat{y}(1 - \hat{y})$ 


#### Step 3: $\frac{\partial L}{\partial z}$ 
$\frac{\partial L}{\partial z} = \frac{\partial L}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z}$

$= (-[\frac{y}{\hat{y}} - \frac{1-y}{1-\hat{y}}]) \cdot (\hat{y}(1 - \hat{y}))$

$= -y(1 - \hat{y}) + (1-y)\hat{y}$

$= \hat{y} - y$

#### Step 4: $\frac{\partial z}{\partial w}$ and $\frac{\partial z}{\partial b}$
$\frac{\partial z}{\partial w} = x$

$\frac{\partial z}{\partial b} = 1$

#### Step 5: $\frac{\partial L}{\partial w}$ and $\frac{\partial L}{\partial b}$ (using chain rule)
$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial z} \cdot \frac{\partial z}{\partial w} = (\hat{y} - y) \cdot x$

$\frac{\partial L}{\partial b} = \frac{\partial L}{\partial z} \cdot \frac{\partial z}{\partial b} = \hat{y} - y$



## Final Gradients (for multiple samples)

For multiple samples, we take the average of these gradients:

$\frac{\partial L}{\partial w} = \frac{1}{m} \sum_{i=1}^m [(y_{pred_i} - y_i) \cdot x_i]$

$\frac{\partial L}{\partial b} = \frac{1}{m} \sum_{i=1}^m (y_{pred_i} - y_i)$

In vector notation (with $m$ the number of samples):

$\frac{\partial L}{\partial w} = \frac{1}{m} X^T (\hat{y} - y)$

$\frac{\partial L}{\partial b} = \frac{1}{m} \sum (\hat{y} - y)$


---

### Simple explanation
- $X$ is a matrix of shape $(m, n)$: $m$ samples (rows) and $n$ features (columns).
- $y_{pred}$ and $y$ are vectors of shape $(m,)$.
- The operation $X^T \cdot (y_{pred} - y)$ implicitly performs the summation for $\frac{\partial L}{\partial w}$.

----

$$\frac{\partial L}{\partial w} = \frac{1}{m} X^T (y_{pred} - y)$$
$$\frac{\partial L}{\partial b} = \frac{1}{m} \sum (y_{pred} - y)$$

Update rule:
$$w = w - \alpha \frac{\partial L}{\partial w}$$
$$b = b - \alpha \frac{\partial L}{\partial b}$$
- $α$ is the learning rate

---

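That is the beauty of log-loss combined with the sigmoid:
- The sigmoid derivative $\frac{\partial \sigma(z)}{\partial z} = \sigma(z) (1-\sigma(z))$ cancels the denominators of $\frac{\partial L}{\partial \hat{y}}$, leaving $\frac{\partial L}{\partial z} = \hat{y} - y$.
- The resulting partial derivatives have the same form as in linear regression (prediction minus target, times the input).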

---

In [16]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model computes ``z = X @ weights + bias`` and maps it through the
    sigmoid to get ``P(y = 1 | x)``. Parameters are learned by repeatedly
    stepping against the gradient of the mean log loss.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to each gradient update.
    n_iterations : int, default 1000
        Number of full-batch gradient-descent steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        # Learned parameters; populated by fit().
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        For very negative ``z`` the intermediate ``exp(-z)`` overflows to
        ``inf``; the quotient is then exactly ``0.0``, which is the correct
        limit, so the overflow warning is suppressed rather than surfaced.
        """
        with np.errstate(over="ignore"):
            return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like with n_samples elements
            Binary targets (0 or 1). A column vector of shape
            ``(n_samples, 1)`` is accepted and flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, contains no samples, or ``y`` does not have
            exactly one label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so that a (n_samples, 1) column cannot broadcast
        # `y_pred - y` into an (n_samples, n_samples) matrix.
        y = np.asarray(y, dtype=float).reshape(-1)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got shape {X.shape}"
            )
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset (0 samples)")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient descent
        for _ in range(self.n_iterations):
            z = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(z)

            # dL/dz = y_pred - y; shared by both gradients, so compute it once.
            error = y_pred - y

            # Gradients of the mean log loss
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return ``P(y = 1 | x)`` for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError(
                "LogisticRegression is not fitted yet; call fit(X, y) first"
            )
        z = np.dot(X, self.weights) + self.bias
        return self.sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels for every row of ``X``.

        A row is labelled 1 when its predicted probability is strictly
        greater than ``threshold`` (default 0.5).
        """
        return (self.predict_proba(X) > threshold).astype(int)


In [18]:
# Build a reproducible toy dataset: 100 samples with 3 standard-normal features
np.random.seed(110)
X = np.random.standard_normal((100, 3))
# The label is 1 exactly when the first feature is positive, so the classes
# are a deterministic function of X (not random) and linearly separable.
y = np.greater(X[:, 0], 0).astype(int)
print(
    f"X shape = {X.shape}, y shape = {y.shape}",
    f"X elements = {X[:5]}, y elements = {y[:5]}",
    sep="\n",
)

X shape = (100, 3), y shape = (100,)
X elements = [[ 0.3285971  -0.79619855  1.40312383]
 [-1.54779275  1.1667303   1.14772265]
 [ 0.13010933  0.43142236 -0.86831976]
 [ 0.02715937 -1.51873922 -0.81556143]
 [-0.22363745 -2.63109611  0.04686695]], y elements = [1 0 1 1 0]


In [19]:
# Fit a fresh classifier on the toy data, then display the learned parameters
model = LogisticRegression(n_iterations=1000, learning_rate=0.1)
model.fit(X, y)
(model.bias, model.weights)


(np.float64(-0.23932621342690608),
 array([ 5.39647907, -0.29443978, -0.17058492]))

In [20]:
# Recompute the first five predictions by hand:
# linear score -> sigmoid probability -> 0/1 label at the 0.5 threshold
z = X[:5].dot(model.weights) + model.bias
y_pred = model.sigmoid(z)
(y_pred, np.where(y_pred > 0.5, 1, 0))

(array([8.21863165e-01, 1.08215967e-04, 6.18668473e-01, 6.20939656e-01,
        3.36378157e-01]),
 array([1, 0, 1, 1, 0]))