In [2]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    Spot-check an analytic gradient against a centred finite difference.

    For `num_checks` randomly chosen coordinates of `x`, the numerical
    derivative (f(x + h) - f(x - h)) / (2 * h) is compared with the entry of
    `analytic_grad` at the same index. Every comparison is printed, and an
    AssertionError is raised as soon as one relative error is not below
    `error`.

    Parameters:
    - f: callable that takes `x` and returns a scalar
    - x: numpy array; one entry at a time is perturbed in place and restored
    - analytic_grad: array indexed with the same indices as `x`
    - num_checks: number of random coordinates to test
    - h: finite-difference step
    - error: exclusive upper bound on the accepted relative error

    Returns None.
    """

    for _ in range(num_checks):
        ix = tuple(randrange(m) for m in x.shape)

        oldval = x[ix]
        try:
            x[ix] = oldval + h  # increment by h
            fxph = f(x)  # evaluate f(x + h)
            x[ix] = oldval - h  # decrement by h
            fxmh = f(x)  # evaluate f(x - h)
        finally:
            # Restore the entry even if f raises, so the caller's array is
            # never left perturbed.
            x[ix] = oldval

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        denom = abs(grad_numerical) + abs(grad_analytic)
        # Both gradients exactly zero means perfect agreement; dividing would
        # give 0/0 = NaN and make the assertion below fail spuriously.
        if denom > 0:
            rel_err = abs(grad_numerical - grad_analytic) / denom
        else:
            rel_err = 0.0
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, rel_err)
        )
        assert rel_err < error

def rel_error(x, y):
    """
    Largest element-wise relative difference between x and y.

    Each difference |x - y| is divided by |x| + |y|, with the divisor floored
    at 1e-8 so that entries where both values are zero do not divide by zero.
    """
    difference = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(difference / scale)

# Linear regression

In [3]:
# Load the diabetes regression data and draw small random starting parameters
# for the gradient checks: one weight per feature column plus a single bias.
data = load_diabetes()
X_train1, y_train1 = data.data, data.target
# No random seed is set in the visible cells, so these values (and every
# number printed by the checks below) change from run to run.
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
b1 = np.random.randn(1) * 0.0001  # shape (1,)

In [4]:
def mse_loss_naive(w, b, X, y, alpha=0):
    """
    MSE loss function WITH FOR LOOPs

    Evaluates mean((y - (X @ w + b))**2) + alpha * sum(w**2) and its
    gradients using explicit Python loops over samples and features.
    The bias is not included in the alpha penalty.

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    loss = 0.0
    # Accumulate in floating point regardless of w's dtype: with zeros_like,
    # integer weights would silently truncate every partial sum.
    dw = np.zeros(w.shape, dtype=float)
    db = 0.0

    y_predict = X @ w + b
    n = len(y_predict)

    # Accumulate the squared error and the un-normalised gradients.
    for i in range(n):
        residual = y[i] - y_predict[i]  # computed once per sample
        loss += residual ** 2
        for j in range(len(dw)):
            dw[j] += X[i][j] * residual
        db += residual

    # d/dw mean(r**2) = (-2/n) * sum(x_i * r_i); same factor for the bias.
    loss = (1/n) * loss
    dw = (-2/n) * dw
    db = (-2/n) * db

    # Add the penalty alpha * w**2 to the loss and 2 * alpha * w to its gradient.
    for j in range(w.shape[0]):
        loss += alpha * (w[j] ** 2)
        dw[j] += 2 * alpha * w[j]

    return loss, dw, np.array(db).reshape(1,)

## Naive Linear regression loss

In [5]:
# Evaluate the loop-based MSE (alpha=0, no penalty) at the random starting point.
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)

# Cross-check the loss value against scikit-learn.
# NOTE(review): the prediction is passed as the first argument; a plain MSE is
# symmetric in its two inputs, so the value does not depend on the order.
sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Compare dw1 with finite differences. The lambda parameter shadows the global
# w1; b1 and the data are read from the notebook's global scope.
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Same check for the bias gradient; here the lambda parameter shadows b1.
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  8.758840564025294e-16
Gradient check w
numerical: -1.376394 analytic: -1.376395, relative error: 2.740425e-07
numerical: -4.296088 analytic: -4.296088, relative error: 4.501411e-08
numerical: -1.376394 analytic: -1.376395, relative error: 2.740425e-07
numerical: -4.296088 analytic: -4.296088, relative error: 4.501411e-08
numerical: -1.275044 analytic: -1.275044, relative error: 3.114334e-08
numerical: -2.801914 analytic: -2.801914, relative error: 6.421825e-08
numerical: -4.145423 analytic: -4.145425, relative error: 1.512537e-07
numerical: -0.315454 analytic: -0.315454, relative error: 1.242573e-06
numerical: -1.553189 analytic: -1.553188, relative error: 1.413595e-07
numerical: -3.234125 analytic: -3.234126, relative error: 9.803157e-08
numerical: -1.275044 analytic: -1.275044, relative error: 3.114334e-08
numerical: -3.234125 analytic: -3.234126, relative error: 9.803157e-08
numerical: -1.376394 analytic: -1.376395, relative error: 2.740425e-07
numerical: -2.801914 ana

## Naive Ridge regression loss

In [6]:
# Repeat the gradient checks with the penalty switched on (alpha=1).
# There is no loss cross-check in this cell, only the gradient comparisons.
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Finite-difference check of dw1; the lambda parameter shadows the global w1.
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Finite-difference check of db1; the lambda parameter shadows the global b1.
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -2.802167 analytic: -2.802167, relative error: 5.630819e-08
numerical: -1.376609 analytic: -1.376610, relative error: 2.526690e-07
numerical: 2.892162 analytic: 2.892163, relative error: 1.646242e-07
numerical: -1.376609 analytic: -1.376610, relative error: 2.526690e-07
numerical: -3.153410 analytic: -3.153410, relative error: 4.021134e-08
numerical: -3.153410 analytic: -3.153410, relative error: 4.021134e-08
numerical: -0.315379 analytic: -0.315380, relative error: 1.100072e-06
numerical: -1.376609 analytic: -1.376610, relative error: 2.526690e-07
numerical: 2.892162 analytic: 2.892163, relative error: 1.646242e-07
numerical: -2.802167 analytic: -2.802167, relative error: 5.630819e-08
numerical: -0.315379 analytic: -0.315380, relative error: 1.100072e-06
numerical: -4.145230 analytic: -4.145232, relative error: 1.693076e-07
numerical: -4.145230 analytic: -4.145232, relative error: 1.693076e-07
numerical: -1.553276 analytic: -1.553275, relative error: 1.4207

In [7]:
def mse_loss_vectorized(w, b, X, y, alpha=0):
    """
    MSE loss function WITHOUT FOR LOOPs

    Evaluates mean((y - (X @ w + b))**2) + alpha * (w . w) and its gradients
    with NumPy array operations only. The bias is not included in the
    alpha penalty.

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    y_predict = X @ w + b
    n = len(y_predict)
    residual = y - y_predict

    # np.sum reduces inside NumPy; the builtin sum would iterate over the
    # array element by element in Python.
    loss = np.sum(residual ** 2) / n + alpha * (w.T @ w)
    dw = (-2/n) * (X.T @ residual) + 2 * alpha * w
    db = (-2/n) * np.sum(residual)

    return loss, dw, np.array(db).reshape(1,)

## Vectorised Linear regression loss

In [8]:
# Evaluate the vectorised MSE (alpha=0, no penalty) at the random starting point.
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

# Cross-check the loss value against scikit-learn.
# NOTE(review): the prediction is passed as the first argument; a plain MSE is
# symmetric in its two inputs, so the value does not depend on the order.
sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Finite-difference check of dw1; the lambda parameter shadows the global w1.
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Finite-difference check of db1; the lambda parameter shadows the global b1.
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  8.758840564025294e-16
Gradient check w
numerical: -3.153318 analytic: -3.153318, relative error: 4.496955e-08
numerical: -3.153318 analytic: -3.153318, relative error: 4.496955e-08
numerical: -3.234125 analytic: -3.234126, relative error: 9.803157e-08
numerical: -1.376394 analytic: -1.376395, relative error: 2.740425e-07
numerical: -2.801914 analytic: -2.801914, relative error: 6.421825e-08
numerical: -4.145423 analytic: -4.145425, relative error: 1.512537e-07
numerical: -4.296088 analytic: -4.296088, relative error: 4.501411e-08
numerical: -4.145423 analytic: -4.145425, relative error: 1.512537e-07
numerical: 2.892060 analytic: 2.892061, relative error: 1.747025e-07
numerical: 2.892060 analytic: 2.892061, relative error: 1.747025e-07
numerical: -4.145423 analytic: -4.145425, relative error: 1.512537e-07
numerical: -0.315454 analytic: -0.315454, relative error: 1.242573e-06
numerical: -0.315454 analytic: -0.315454, relative error: 1.242573e-06
numerical: -1.376394 analyti

## Vectorized ridge regression loss

In [9]:
# Repeat the vectorised gradient checks with the penalty switched on (alpha=1).
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Finite-difference check of dw1; the lambda parameter shadows the global w1.
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Finite-difference check of db1; the lambda parameter shadows the global b1.
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -3.234097 analytic: -3.234098, relative error: 1.040240e-07
numerical: -4.145230 analytic: -4.145232, relative error: 1.473669e-07
numerical: 2.892162 analytic: 2.892163, relative error: 1.646242e-07
numerical: 2.892162 analytic: 2.892163, relative error: 1.646242e-07
numerical: -1.275122 analytic: -1.275122, relative error: 6.642427e-08
numerical: -4.145230 analytic: -4.145232, relative error: 1.473669e-07
numerical: -4.145230 analytic: -4.145232, relative error: 1.473669e-07
numerical: -4.296337 analytic: -4.296337, relative error: 4.517354e-08
numerical: -4.145230 analytic: -4.145232, relative error: 1.473669e-07
numerical: -3.234097 analytic: -3.234098, relative error: 1.040240e-07
numerical: -3.153410 analytic: -3.153410, relative error: 4.021134e-08
numerical: -4.296337 analytic: -4.296337, relative error: 4.517354e-08
numerical: 2.892162 analytic: 2.892163, relative error: 1.646242e-07
numerical: -2.802167 analytic: -2.802167, relative error: 5.630819

# Logistic regression

In [10]:
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Accepts a scalar or an array. Only exp(-|z|) is ever computed, which lies
    in (0, 1]; evaluating exp(-z) directly overflows for large negative z.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result into a NumPy scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]

# Load the breast-cancer classification data and draw small random starting
# parameters. This rebinds `data`, which held the diabetes dataset before.
data = load_breast_cancer()
X_train2, y_train2 = data.data, data.target
# No random seed is set in the visible cells, so these values change per run.
w2 = np.random.randn(X_train2.shape[1]) * 0.0001
b2 = np.random.randn(1) * 0.0001  # shape (1,)

# Naive

In [11]:
def log_loss_naive(w, b, X, y, alpha=0):
    """
    log loss function WITH FOR LOOPs

    Evaluates the mean binary cross-entropy of sigmoid(X @ w + b) against the
    0/1 labels y, plus alpha * sum(w**2), and its gradients, using explicit
    Python loops. The bias is not included in the alpha penalty.

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    loss = 0.0
    # Accumulate in floating point regardless of w's dtype: with zeros_like,
    # integer weights would silently truncate every partial sum.
    dw = np.zeros(w.shape, dtype=float)
    db = 0.0

    scores = X @ w + b
    n = len(scores)

    # Accumulate the cross-entropy and the un-normalised gradients.
    for i in range(n):
        z = scores[i]
        # -log(sigmoid(z)) == logaddexp(0, -z) and
        # -log(1 - sigmoid(z)) == logaddexp(0, z). Taking log of the sigmoid
        # directly yields -inf (and 0 * -inf = NaN) once it rounds to 0 or 1.
        loss += y[i] * np.logaddexp(0, -z) + (1 - y[i]) * np.logaddexp(0, z)
        error_i = sigmoid(z) - y[i]  # computed once per sample
        for j in range(len(dw)):
            dw[j] += error_i * X[i][j]
        db += error_i

    loss = loss * (1/n)
    dw = dw * (1/n)
    db = db * (1/n)

    # Add the penalty alpha * w**2 to the loss and 2 * alpha * w to its gradient.
    for j in range(w.shape[0]):
        loss += alpha * (w[j] ** 2)
        dw[j] += 2 * alpha * w[j]

    return loss, dw, np.array(db).reshape(1,)

In [12]:
# Build a two-column probability array for scikit-learn's log_loss.
# Despite its name, y_pred_0 is the sigmoid output and ends up in the SECOND
# column; the first column is its complement.
# NOTE(review): presumably the columns follow sorted label order (0, then 1) - confirm.
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

# The loop-based loss (alpha=0) must agree with the scikit-learn value.
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Finite-difference check of dw2; the lambda parameter shadows the global w2.
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Finite-difference check of db2; the lambda parameter shadows the global b2.
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: 0.004690 analytic: 0.004690, relative error: 3.586962e-09
numerical: -0.009680 analytic: -0.009680, relative error: 5.656910e-10
numerical: -0.770793 analytic: -0.770793, relative error: 1.312659e-09
numerical: 0.006746 analytic: 0.006746, relative error: 1.578565e-09
numerical: 0.004690 analytic: 0.004690, relative error: 3.586962e-09
numerical: 0.077765 analytic: 0.077765, relative error: 3.208659e-10
numerical: -2.628588 analytic: -2.628588, relative error: 9.522350e-10
numerical: -0.000483 analytic: -0.000483, relative error: 3.629401e-08
numerical: 0.006746 analytic: 0.006746, relative error: 1.578565e-09
numerical: -0.001285 analytic: -0.001285, relative error: 1.276333e-08
numerical: -0.010110 analytic: -0.010110, relative error: 1.477207e-09
numerical: 0.010213 analytic: 0.010213, relative error: 3.039768e-09
numerical: -0.003175 analytic: -0.003175, relative error: 6.863294e-09
numerical: 0.004690 analytic: 0.004690, relative error

# Naive with regularization

In [13]:
# Repeat the loop-based gradient checks with the penalty switched on (alpha=1).
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Finite-difference check of dw2; the lambda parameter shadows the global w2.
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Finite-difference check of db2; the lambda parameter shadows the global b2.
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -5.834418 analytic: -5.834418, relative error: 2.773257e-08
numerical: -0.000446 analytic: -0.000446, relative error: 1.034919e-08
numerical: -0.015484 analytic: -0.015484, relative error: 1.595149e-12
numerical: 56.911029 analytic: 56.910073, relative error: 8.393680e-06
numerical: -0.001280 analytic: -0.001280, relative error: 1.753634e-08
numerical: -5.834418 analytic: -5.834418, relative error: 2.773257e-08
numerical: -0.188697 analytic: -0.188697, relative error: 9.658266e-11
numerical: -0.001329 analytic: -0.001329, relative error: 2.483900e-09
numerical: -0.001280 analytic: -0.001280, relative error: 1.753634e-08
numerical: -0.015484 analytic: -0.015484, relative error: 1.595149e-12
numerical: -5.834418 analytic: -5.834418, relative error: 2.773257e-08
numerical: 13.928262 analytic: 13.927983, relative error: 1.000867e-05
numerical: -0.001547 analytic: -0.001547, relative error: 6.780781e-09
numerical: 0.010342 analytic: 0.010342, relative error: 2.91

# Vectorized

In [14]:
def log_loss_vectorized(w, b,X, y, alpha=0):
    """
    log loss function WITHOUT FOR LOOPs

    Evaluates the mean binary cross-entropy of sigmoid(X @ w + b) against the
    0/1 labels y, plus alpha * (w . w), and its gradients, with NumPy array
    operations only. The bias is not included in the alpha penalty.

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    scores = X @ w + b
    n = len(scores)

    # -log(sigmoid(z)) == logaddexp(0, -z) and
    # -log(1 - sigmoid(z)) == logaddexp(0, z). Taking log of the sigmoid
    # directly yields -inf (and 0 * -inf = NaN) once it rounds to 0 or 1.
    per_sample = y * np.logaddexp(0, -scores) + (1 - y) * np.logaddexp(0, scores)
    # np.sum reduces inside NumPy; the builtin sum would iterate in Python.
    loss = np.sum(per_sample) / n + alpha * (w.T @ w)

    error_vec = sigmoid(scores) - y
    dw = (1/n) * (X.T @ error_vec) + 2 * alpha * w
    db = (1/n) * np.sum(error_vec)

    return loss, dw, np.array(db).reshape(1,)

In [15]:
# Build a two-column probability array for scikit-learn's log_loss.
# Despite its name, y_pred_0 is the sigmoid output and ends up in the SECOND
# column; the first column is its complement.
# NOTE(review): presumably the columns follow sorted label order (0, then 1) - confirm.
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

# The vectorised loss (alpha=0) must agree with the scikit-learn value.
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Finite-difference check of dw2; the lambda parameter shadows the global w2.
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Finite-difference check of db2; the lambda parameter shadows the global b2.
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: 0.077765 analytic: 0.077765, relative error: 3.208653e-10
numerical: 0.006498 analytic: 0.006498, relative error: 1.569190e-10
numerical: 0.077765 analytic: 0.077765, relative error: 3.208653e-10
numerical: -0.023586 analytic: -0.023586, relative error: 9.292727e-11
numerical: 13.928469 analytic: 13.928190, relative error: 1.000852e-05
numerical: -0.015793 analytic: -0.015793, relative error: 1.119693e-10
numerical: 0.077765 analytic: 0.077765, relative error: 3.208653e-10
numerical: -0.000483 analytic: -0.000483, relative error: 3.629401e-08
numerical: -0.770793 analytic: -0.770793, relative error: 1.312659e-09
numerical: -0.000636 analytic: -0.000636, relative error: 9.788413e-09
numerical: -0.032399 analytic: -0.032399, relative error: 6.023349e-10
numerical: -4.350171 analytic: -4.350172, relative error: 6.843492e-08
numerical: -0.770793 analytic: -0.770793, relative error: 1.312659e-09
numerical: 0.077765 analytic: 0.077765, relative e

# Vectorized with regularization

In [16]:
# Repeat the vectorised gradient checks with the penalty switched on (alpha=1).
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Finite-difference check of dw2; the lambda parameter shadows the global w2.
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical is bound to None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Finite-difference check of db2; the lambda parameter shadows the global b2.
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -4.350042 analytic: -4.350042, relative error: 6.843666e-08
numerical: 0.004556 analytic: 0.004556, relative error: 4.004011e-09
numerical: 5.233961 analytic: 5.233960, relative error: 5.543847e-08
numerical: 5.233961 analytic: 5.233960, relative error: 5.543847e-08
numerical: -0.009553 analytic: -0.009553, relative error: 7.975633e-10
numerical: -2.628244 analytic: -2.628244, relative error: 9.530282e-10
numerical: -0.000446 analytic: -0.000446, relative error: 1.657152e-08
numerical: -0.001120 analytic: -0.001120, relative error: 1.495679e-08
numerical: 5.233961 analytic: 5.233960, relative error: 5.543847e-08
numerical: -0.001120 analytic: -0.001120, relative error: 1.495679e-08
numerical: 0.022973 analytic: 0.022973, relative error: 7.584786e-10
numerical: -0.000591 analytic: -0.000591, relative error: 3.173852e-08
numerical: 13.928262 analytic: 13.927983, relative error: 1.000867e-05
numerical: -0.000591 analytic: -0.000591, relative error: 3.173852e-08

# Gradient descent for Linear models

In [17]:
class LinearModel():
    """
    Base class for linear models fitted by mini-batch gradient descent.

    Subclasses provide `loss`, returning (loss, dw, db) for a batch, and
    `predict`. The parameters `w` and `b` stay None until the first `train`
    call.
    """

    def __init__(self):
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False,
              print_every=10000):
        """
        Run `num_iters` steps of mini-batch gradient descent on (X, y).

        Parameters:
        - X: array of shape (N, d)
        - y: array of shape (N,)
        - learning_rate: step size applied to both gradients
        - alpha: penalty strength forwarded to `self.loss`
        - num_iters: number of update steps
        - batch_size: rows sampled per step, capped at N
        - verbose: if True, print the batch loss every `print_every` steps
        - print_every: interval between progress lines

        Returns the list of batch losses, one entry per step. Calling train
        again continues from the current w and b.
        """
        N, d = X.shape

        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Sampling without replacement cannot draw more rows than exist;
        # np.random.choice raises ValueError in that case, which the default
        # batch_size=200 would trigger on any dataset with fewer rows.
        rows_per_step = min(batch_size, N)

        loss_history = []
        for it in range(num_iters):
            # Draw the batch: X_batch is (rows_per_step, d), y_batch is (rows_per_step,)
            choice = np.random.choice(N, rows_per_step, replace=False)
            X_batch = X[choice, :]
            y_batch = y[choice]

            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # Gradient step. db is whatever self.loss returns, so b takes its
            # shape after the first update.
            self.w = self.w - learning_rate * dw
            self.b = self.b - learning_rate * db

            if verbose and it % print_every == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """Return predictions for X; must be implemented by subclasses."""
        raise NotImplementedError("subclasses must implement predict()")

    def loss(self, X_batch, y_batch, reg):
        """
        Return (loss, dw, db) for one batch; must be implemented by subclasses.

        `train` passes its `alpha` positionally as the third argument.
        """
        raise NotImplementedError("subclasses must implement loss()")

class LinearRegressor(LinearModel):
    """Linear regression: squared-error loss with an optional alpha penalty on w."""

    def loss(self, X_batch, y_batch, alpha):
        """Return (loss, dw, db) from mse_loss_vectorized at the current w and b."""
        loss_and_grads = mse_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
        return loss_and_grads

    def predict(self, X):
        """Return the linear prediction X @ w + b for every row of X."""
        prediction = X @ self.w + self.b
        return prediction

class LogisticRegressor(LinearModel):
    """Logistic regression: log loss with an optional alpha penalty on w."""

    def loss(self, X_batch, y_batch, alpha):
        """Return (loss, dw, db) from log_loss_vectorized at the current w and b."""
        loss_and_grads = log_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
        return loss_and_grads

    def predict(self, X):
        """ Return prediction labels vector of 0 or 1 """
        scores = X @ self.w + self.b

        # Take both masks before overwriting anything. A positive score maps
        # to label 1 and a non-positive score to label 0; the result keeps the
        # floating-point dtype of the scores.
        is_non_positive = scores <= 0
        is_positive = scores > 0
        scores[is_non_positive] = 0
        scores[is_positive] = 1

        return scores

## Linear regression with gradient descent

In [18]:
from sklearn.linear_model import LinearRegression

# Reference: scikit-learn's LinearRegression fitted on the same data.
sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
# NOTE(review): predictions are passed as the first argument; a plain MSE is
# symmetric in its two inputs, so the value does not depend on the order.
sk_mse = mean_squared_error(sk_pred, y_train1)

# Fit the gradient-descent model; with verbose=True, train prints the batch
# loss every 10000 iterations.
model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
# Tolerance is absolute: the training-set MSE may exceed the reference by up to 100.
assert mse - sk_mse < 100

iteration 0 / 75000: loss 26058.408592
iteration 10000 / 75000: loss 2509.561191
iteration 20000 / 75000: loss 2675.312862
iteration 30000 / 75000: loss 3900.579751
iteration 40000 / 75000: loss 3444.909812
iteration 50000 / 75000: loss 2727.710326
iteration 60000 / 75000: loss 3451.208419
iteration 70000 / 75000: loss 3012.399648
MSE scikit-learn: 2859.6903987680657
MSE gradient descent model : 2884.5174456634027


In [19]:
# NOTE(review): this cell repeats the previous one and only adds the "Error :"
# print; it retrains from scratch with fresh random initial weights and batches.
from sklearn.linear_model import LinearRegression

# Reference: scikit-learn's LinearRegression fitted on the same data.
sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
sk_mse = mean_squared_error(sk_pred, y_train1)

# Fit the gradient-descent model; with verbose=True, train prints the batch
# loss every 10000 iterations.
model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
print("Error :", mse - sk_mse)
# Tolerance is absolute: the training-set MSE may exceed the reference by up to 100.
assert mse - sk_mse < 100

iteration 0 / 75000: loss 30502.170233
iteration 10000 / 75000: loss 2815.482418
iteration 20000 / 75000: loss 2995.865440
iteration 30000 / 75000: loss 3118.623607
iteration 40000 / 75000: loss 3084.435244
iteration 50000 / 75000: loss 3697.506541
iteration 60000 / 75000: loss 2486.205147
iteration 70000 / 75000: loss 2291.832588
MSE scikit-learn: 2859.6903987680657
MSE gradient descent model : 2884.986474915995
Error : 25.296076147929398


## Logistic regression with gradient descent

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features before fitting. X_train2 is rebound to the scaled
# copy, so any earlier cell re-run after this one sees scaled features.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)

# Reference: scikit-learn's LogisticRegression fitted on the same data.
sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
# log_loss takes (y_true, y_pred): the ground-truth labels go first.
# NOTE(review): hard 0/1 predictions are passed where probabilities are
# expected, so this value reflects the number of misclassified rows rather
# than a probabilistic log loss - confirm that this is the intended metric.
sk_log_loss = log_loss(y_train2, sk_pred)

# Fit the gradient-descent model; with verbose=True, train prints the batch
# loss every 10000 iterations.
model = LogisticRegressor()
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_log_loss = log_loss(y_train2, pred)

print("Log-loss scikit-learn:", sk_log_loss)
print("Log-loss gradiet descent model :", model_log_loss)
print("Error :", rel_error(sk_log_loss, model_log_loss))
assert rel_error(sk_log_loss, model_log_loss) < 1e-7

iteration 0 / 75000: loss 0.693395
iteration 10000 / 75000: loss 0.078190
iteration 20000 / 75000: loss 0.097538
iteration 30000 / 75000: loss 0.038937
iteration 40000 / 75000: loss 0.034762
iteration 50000 / 75000: loss 0.058616
iteration 60000 / 75000: loss 0.050576
iteration 70000 / 75000: loss 0.048212
Log-loss scikit-learn: 0.4249086712816093
Log-loss gradiet descent model : 0.4249086712816093
Error : 0.0
