# Mini Batch Gradient Descent

In [4]:
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

In [152]:
class MBGDRegressor:
    """Linear regression fitted with mini-batch gradient descent.

    Parameters
    ----------
    lr : float, default=0.01
        Step size applied to both the intercept and the coefficient updates.
    epochs : int, default=1000
        Number of passes. Each pass performs ``n_samples // batch_size``
        parameter updates.
    batch_size : int, default=32
        Number of rows drawn (without replacement) for each update. When it
        exceeds the number of training rows it is clamped to that number, so
        every epoch still performs one update.

    Attributes
    ----------
    intercept_ : float or None
        Fitted bias term; ``None`` until ``fit`` has been called.
    coef_ : numpy.ndarray or None
        Fitted weights, one per feature; ``None`` until ``fit`` has been called.
    """

    def __init__(self, lr=0.01, epochs=1000, batch_size=32):
        self.intercept_ = None
        self.coef_ = None
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X_train, y_train):
        """Estimate ``intercept_`` and ``coef_`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Training features.
        y_train : array-like of shape (n_samples,)
            Training targets. A column vector is flattened.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1, ``X_train`` is not a
            non-empty 2-D array, or the number of targets does not match the
            number of rows.
        """
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float).ravel()

        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X_train must be a non-empty 2-D array")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError("X_train and y_train must have the same number of rows")

        # A batch larger than the data would give zero updates per epoch
        # (n_samples // batch_size == 0) and make sampling without
        # replacement impossible, so cap it at the full data set.
        batch = min(self.batch_size, n_samples)
        updates_per_epoch = n_samples // batch

        self.intercept_ = 0
        self.coef_ = np.ones(n_features)

        for _ in range(self.epochs):
            for _ in range(updates_per_epoch):
                # Fresh random rows for every update (no replacement within a batch).
                idx = np.random.choice(n_samples, batch, replace=False)
                X_batch = X[idx]
                residual = y[idx] - (self.intercept_ + X_batch @ self.coef_)

                # NOTE(review): the intercept gradient is a batch *mean* while
                # the coefficient gradient is a batch *sum*, so the effective
                # coefficient step grows with batch_size. Left as in the
                # original so existing lr settings keep behaving the same.
                intercept_gradient = -2 * np.mean(residual)
                coef_gradient = -2 * (residual @ X_batch)

                self.intercept_ = self.intercept_ - (self.lr * intercept_gradient)
                self.coef_ = self.coef_ - (self.lr * coef_gradient)

        # Report the fitted parameters once training has finished.
        print(self.intercept_, self.coef_)

    def predict(self, X_test):
        """Return ``intercept_ + X_test @ coef_`` for the given feature rows."""
        return self.intercept_ + (X_test @ self.coef_)
    

In [153]:
# Load the diabetes regression data as (features, target) arrays and hold
# out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [157]:
# Number of whole batches of 32 rows that fit in the training set.
len(X_train) // 32

11

In [155]:
# Train the hand-written mini-batch regressor; fit() prints the learned
# intercept and coefficients when it finishes.
mbgd = MBGDRegressor(batch_size=32, epochs=100, lr=0.1)
mbgd.fit(X_train, y_train)

154.671496351471 [  29.74437507 -227.57692211  546.46744266  335.44481373 -166.45883757
  -69.85467767 -162.16750236  148.26573923  459.13398203   64.23500322]


In [156]:
# Score the hand-written model on the held-out rows (R^2).
y_pred = mbgd.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.46562702695533864

## scikit-learn has no dedicated mini-batch GD regressor, so we approximate one by calling `SGDRegressor.partial_fit` on random batches

In [79]:
# Emulate mini-batch training with scikit-learn by feeding random batches to
# partial_fit ourselves; max_iter is not set because fit() is never called.
sgd = SGDRegressor(eta0=0.1, learning_rate='adaptive')
batch_size = 10
n_updates = 600
n_samples = X_train.shape[0]

# 600 partial_fit calls in total, each on a fresh random batch of 10 rows;
# there is no explicit epoch structure here.
for _ in range(n_updates):
    # Row indices for this batch, drawn without replacement.
    batch_idx = np.random.choice(n_samples, batch_size, replace=False)
    # Per the scikit-learn docs, partial_fit performs one epoch of SGD over
    # the samples it is given.
    sgd.partial_fit(X_train[batch_idx], y_train[batch_idx])

In [80]:
# Parameters learned by the scikit-learn model.
(sgd.intercept_, sgd.coef_)

(array([140.75462224]),
 array([  57.1443836 , -139.29114359,  439.49965371,  281.93198536,
         -35.99113592,  -87.31349628, -200.92641101,  139.48387181,
         325.30172143,  124.56389459]))

In [27]:
# Score the scikit-learn model on the same held-out rows (R^2).
y_pred = sgd.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.4621446214887822

In [161]:
# Parameter-by-parameter gap: scikit-learn model minus the hand-written model.
(
    sgd.intercept_ - mbgd.intercept_,
    sgd.coef_ - mbgd.coef_,
)

(array([-13.91687411]),
 array([  27.40000853,   88.28577852, -106.96778894,  -53.51282837,
         130.46770165,  -17.45881861,  -38.75890865,   -8.78186743,
        -133.8322606 ,   60.32889136]))