In [103]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import math

In [104]:
X, y = make_classification(n_samples=50000, n_features=15, n_informative=10, n_redundant=5,
                           n_classes=2, weights=[0.7], class_sep=0.7, random_state=15)

In [105]:
X.shape, y.shape

((50000, 15), (50000,))

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)

In [108]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((37500, 15), (37500,), (12500, 15), (12500,))

In [109]:
from sklearn import linear_model

In [110]:
# alpha : float
# Constant that multiplies the regularization term. 

# eta0 : double
# The initial learning rate for the ‘constant’, ‘invscaling’ or ‘adaptive’ schedules.

clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='log', random_state=15, penalty='l2', tol=1e-3, verbose=2, learning_rate='constant')
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [111]:
clf.fit(X=x_train, y=y_train)

-- Epoch 1
Norm: 0.77, NNZs: 15, Bias: -0.316653, T: 37500, Avg. loss: 0.455552
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 0.91, NNZs: 15, Bias: -0.472747, T: 75000, Avg. loss: 0.394686
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 0.98, NNZs: 15, Bias: -0.580082, T: 112500, Avg. loss: 0.385711
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 1.02, NNZs: 15, Bias: -0.658292, T: 150000, Avg. loss: 0.382083
Total training time: 0.05 seconds.
-- Epoch 5
Norm: 1.04, NNZs: 15, Bias: -0.719528, T: 187500, Avg. loss: 0.380486
Total training time: 0.06 seconds.
-- Epoch 6
Norm: 1.05, NNZs: 15, Bias: -0.763409, T: 225000, Avg. loss: 0.379578
Total training time: 0.07 seconds.
-- Epoch 7
Norm: 1.06, NNZs: 15, Bias: -0.795106, T: 262500, Avg. loss: 0.379150
Total training time: 0.08 seconds.
-- Epoch 8
Norm: 1.06, NNZs: 15, Bias: -0.819925, T: 300000, Avg. loss: 0.378856
Total training time: 0.09 seconds.
-- Epoch 9
Norm: 1.07, NNZs: 15, Bias: -0.837805, T: 337500, Avg. loss: 0.

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [112]:
clf.coef_, clf.coef_.shape, clf.intercept_

(array([[-0.42336692,  0.18547565, -0.14859036,  0.34144407, -0.2081867 ,
          0.56016579, -0.45242483, -0.09408813,  0.2092732 ,  0.18084126,
          0.19705191,  0.00421916, -0.0796037 ,  0.33852802,  0.02266721]]),
 (1, 15),
 array([-0.8531383]))

## Implement Logistc Regression with L2 regularization Using SGD: without using sklearn

### Instructions

- Load the datasets(train and test) into the respective arrays

- Initialize the weight_vector and intercept term randomly

- Calculate the initlal log loss for the train and test data with the current weight and intercept and store it in a list

- for each epoch:
    - for each batch of data points in train: (keep batch size=1)
        - calculate the gradient of loss function w.r.t each weight in weight vector
        - Calculate the gradient of the intercept <a href='https://drive.google.com/file/d/1nQ08-XY4zvOLzRX-lGf8EYB5arb7-m1H/view?usp=sharing'>check this</a>
        - Update weights and intercept (check the equation number 32 in the above mentioned <a href='https://drive.google.com/file/d/1nQ08-XY4zvOLzRX-lGf8EYB5arb7-m1H/view?usp=sharing'>pdf</a>): <br>
        $w^{(t+1)} ← (1 − \frac{αλ}{N} )w^{(t)} + αx_n(y_n − σ((w^{(t)})^{T} x_n+b^{t}))$ <br>
        $b^{(t+1)} ← (b^t +  α(y_n - σ((w^{(t)})^{T} x_n+b^{t}))$ 
        - calculate the log loss for train and test with the updated weights (you can check the python assignment 10th question)
        - And if you wish, you can compare the previous loss and the current loss, if it is not updating, then
        you can stop the training
        - append this loss in the list ( this will be used to see how loss is changing for each epoch after the training is over )


- Plot the train and test loss i.e on x-axis the epoch number, and on y-axis the loss

- <strong>GOAL</strong>: compare your implementation and SGDClassifier's the weights and intercept, make sure they are as close as possible i.e difference should be in terms of 10^-3

# Custom Implementation of SGD 

In [118]:
alpha = 0.0001   # This will be the learining rate of the gradient descent 
intercept = 0
weights = np.zeros_like(x_train[0])
lamda = 0.0001   # this is the constant we use with the regulariser
iterations = 20
N = len(x_train)


def sigmoid(weights , intercept, x):
    return 1/(1 + np.exp(-(np.dot(weights,x) + intercept)))

# here we will iterate and constantly update the weights and the intercept
for i in tqdm(range(iterations)):
    y_train_pred = []
    y_test_pred = []
    train_loss = 0
    initial_loss = 0
    for batch_index in range(N):
        batch_index = np.random.randint(0,N)
        weights = ((1 - (alpha*lamda)/N)*weights) + (alpha*x_train[batch_index])*(y_train[batch_index] - sigmoid(weights,intercept,x_train[batch_index]))
        intercept = intercept +alpha*(y_train[batch_index] - sigmoid(weights,intercept,x_train[batch_index]))

    for i in range(len(x_train)):
        y_train_pred.append(sigmoid(weights,intercept,x_train[i]))
        initial_loss = train_loss
        train_loss += -((y_train[i]*(np.log(y_train_pred[i])))+ ((1-y_train[i])*(np.log(1-y_train_pred[i]))))
    #loss_diff.append(train_loss - initial_loss)
        
        

    print("The training loss is : {}".format(train_loss/N))

  5%|▌         | 1/20 [00:01<00:22,  1.18s/it]

The training loss is : 0.4032607611786856


 10%|█         | 2/20 [00:02<00:21,  1.20s/it]

The training loss is : 0.388338902422902


 15%|█▌        | 3/20 [00:03<00:21,  1.25s/it]

The training loss is : 0.38338257239300433


 20%|██        | 4/20 [00:05<00:20,  1.26s/it]

The training loss is : 0.38129082458463864


 25%|██▌       | 5/20 [00:06<00:19,  1.27s/it]

The training loss is : 0.3795411425772238


 30%|███       | 6/20 [00:07<00:17,  1.28s/it]

The training loss is : 0.3789712838449671


 35%|███▌      | 7/20 [00:08<00:16,  1.29s/it]

The training loss is : 0.37860523413825387


 40%|████      | 8/20 [00:10<00:15,  1.28s/it]

The training loss is : 0.37861791198482947


 45%|████▌     | 9/20 [00:11<00:14,  1.29s/it]

The training loss is : 0.3785304245497787


 50%|█████     | 10/20 [00:12<00:12,  1.28s/it]

The training loss is : 0.37882581597515685


 55%|█████▌    | 11/20 [00:14<00:11,  1.26s/it]

The training loss is : 0.37857325149071125


 60%|██████    | 12/20 [00:15<00:09,  1.23s/it]

The training loss is : 0.37898957815074985


 65%|██████▌   | 13/20 [00:16<00:08,  1.23s/it]

The training loss is : 0.378239039456217


 70%|███████   | 14/20 [00:17<00:07,  1.23s/it]

The training loss is : 0.3782983945717289


 75%|███████▌  | 15/20 [00:19<00:06,  1.27s/it]

The training loss is : 0.3785476116519117


 80%|████████  | 16/20 [00:20<00:05,  1.25s/it]

The training loss is : 0.37822005470630654


 85%|████████▌ | 17/20 [00:21<00:03,  1.26s/it]

The training loss is : 0.3784098620410656


 90%|█████████ | 18/20 [00:22<00:02,  1.28s/it]

The training loss is : 0.37828941581958986


 95%|█████████▌| 19/20 [00:24<00:01,  1.33s/it]

The training loss is : 0.3786144416977122


100%|██████████| 20/20 [00:25<00:00,  1.32s/it]

The training loss is : 0.3785039255254307





In [101]:
weights   # these weights are same as that of the sklearn's 

array([-0.42223145,  0.1863549 , -0.13800628,  0.34136424, -0.21182453,
        0.56164813, -0.44575044, -0.0970438 ,  0.2221966 ,  0.1755687 ,
        0.18661027,  0.00195088, -0.08079548,  0.34637624,  0.02601327])

In [102]:
intercept

-0.8887934398719837

In [41]:
intercept

-0.8839768607367605

In [61]:
def pred(w,b, X):
    N = len(X)
    predict = []
    for i in range(N):
        if sigmoid(w, b ,X[i]) >= 0.5:
            predict.append(1)
        else:
            predict.append(0)
    return np.array(predict)
#print(1-np.sum(y_train - pred(w,b,x_train))/len(x_train))
#print(1-np.sum(y_test  - pred(w,b,x_test))/len(x_test))

In [64]:
y_pred = pred(weights,intercept,x_test)

In [65]:
accuracy_score(y_test , y_pred)

0.8336