<a href="https://colab.research.google.com/github/Tershire/Python_DL/blob/main/S01/L014_mini_batch_gradient_descent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
# 2023 OCT 30

In [47]:
import numpy as np
import pandas as pd
import sklearn

### get data

In [48]:
# california housing dataset
# from sklearn.datasets import load_boston
# from sklearn.datasets import fetch_california_housing
# cali = fetch_california_housing()

# print(cali["feature_names"])

# print(cali.DESCR)

# boston housing dataset
boston_df = pd.read_csv("./drive/MyDrive/Colab Notebooks/Python_DL/boston_housing.csv")

boston_df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

display(boston_df)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [49]:
boston_targets = boston_df.iloc[:, -1].values

# mini-batch gradient descent (MBGD) & linear regression

* define functions

In [50]:
# w0: <1 x 1>
# w: <M x 1>
# X: <M x N> -> pick batch -> <M X K>
# y: <1 x N> -> pick batch -> <1 X K>

def compute_gradient_of_cost(w0, w, X, y):
    N = y.shape[1]

    ones = np.ones((1, N))

    # y_hat = w0*ones + w.T @ X
    y_hat = w0 + w.T @ X
    error = y - y_hat

    partial_w0 = -(2/N)*(ones @ error.T)
    partial_w = -(2/N)*(X @ error.T)
    gradient = np.concatenate((partial_w0, partial_w), axis=0)

    return gradient

In [51]:
def gradient_descent_MBGD(X, y, alpha=0.01, max_epochs=1000, batch_size=30, verbose=True):
    M = X.shape[0]
    N = y.shape[1]

    w0 = 0
    w = np.zeros((M, 1))

    for i in range(max_epochs):
        # pick K (batch_size) number of data randomly out of X
        random_indices = np.random.choice(N, batch_size)
        X_batch = X[:, random_indices]
        y_batch = y[0, random_indices].reshape(1, -1)

        # descend
        gradient = compute_gradient_of_cost(w0, w, X_batch, y_batch)
        w0 -= alpha*gradient[0, [0]]
        w -= alpha*gradient[1:, [0]]

        # calculate loss (<!> in terms of the whole data <!>)
        y_hat = w0 + w.T @ X
        error = y - y_hat
        loss_MSE = np.mean(np.square(error))

        if verbose:
            if i == max_epochs or i%(max_epochs//10) == 0:
                print("Epoch:", i, "-", "loss (MSE):", loss_MSE, "w0:", w0, "w1:", w)

    return w0, w

* pre-process features and run gradient descent

In [52]:
# scale features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(boston_df[["RM", "LSTAT"]]).T
y = boston_targets.reshape(1, -1)
print("X.shape:", X.shape, "y.shape:", y.shape, "\n")

# run gradient descent
w0, w = gradient_descent_MBGD(X, y, alpha=0.01, max_epochs=1000, batch_size=30, verbose=True)

# result
print("\n=RESULT-")
print("w0:", w0)
print("w", w)

X.shape: (2, 506) y.shape: (1, 506) 

Epoch: 0 - loss (MSE): 564.0954805016049 w0: [0.4628] w1: [[0.25433588]
 [0.10732082]]
Epoch: 100 - loss (MSE): 75.87225601730339 w0: [15.62874828] w1: [[9.70300019]
 [2.01824283]]
Epoch: 200 - loss (MSE): 66.71508335084988 w0: [16.42238348] w1: [[11.45649462]
 [-0.18577898]]
Epoch: 300 - loss (MSE): 60.425247714394885 w0: [16.40406468] w1: [[12.7098868 ]
 [-2.36509413]]
Epoch: 400 - loss (MSE): 55.066439255206966 w0: [16.64522461] w1: [[14.02281809]
 [-4.31596295]]
Epoch: 500 - loss (MSE): 50.95359769898427 w0: [16.3771275] w1: [[14.85679184]
 [-6.16061104]]
Epoch: 600 - loss (MSE): 47.427578756106556 w0: [16.59989896] w1: [[15.82680448]
 [-7.72272328]]
Epoch: 700 - loss (MSE): 44.398807313052174 w0: [16.67408525] w1: [[16.81663379]
 [-9.2104923 ]]
Epoch: 800 - loss (MSE): 42.04515873623972 w0: [16.54685249] w1: [[ 17.47860174]
 [-10.56528879]]
Epoch: 900 - loss (MSE): 40.06269830776312 w0: [16.59250635] w1: [[ 18.20070887]
 [-11.76479274]]

=RESU

* predict

In [53]:
predictions = w0 + w.T @ X
print(predictions.shape)
# print(predictions)

boston_df["PREDICTED_MDEV"] = predictions.ravel()
display(boston_df)

(1, 506)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,PREDICTED_MDEV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,26.310760
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,24.263726
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,28.863278
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,28.574986
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,28.260305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,24.697976
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,23.193398
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,27.529132
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,26.568426


# mini-batch gradient descent (MBGD): practical version (Keras way) & linear regression

* define functions

In [54]:
# w0: <1 x 1>
# w: <M x 1>
# X: <M x N> -> pick batch -> <M X K>
# y: <1 x N> -> pick batch -> <1 X K>

def compute_gradient_of_cost(w0, w, X, y):
    N = y.shape[1]

    ones = np.ones((1, N))

    # y_hat = w0*ones + w.T @ X
    y_hat = w0 + w.T @ X
    error = y - y_hat

    partial_w0 = -(2/N)*(ones @ error.T)
    partial_w = -(2/N)*(X @ error.T)
    gradient = np.concatenate((partial_w0, partial_w), axis=0)

    return gradient

In [55]:
def gradient_descent_MBGDp(X, y, alpha=0.01, max_epochs=1000, batch_size=30, verbose=True):
    M = X.shape[0]
    N = y.shape[1]

    w0 = 0
    w = np.zeros((M, 1))

    for i in range(max_epochs):
        # pick K (batch_size) number of data out of X in series
        for j in range(0, N, batch_size):
            X_batch = X[:, j:j + batch_size]
            y_batch = y[0, j:j + batch_size].reshape(1, -1)

            # descend
            gradient = compute_gradient_of_cost(w0, w, X_batch, y_batch)
            w0 -= alpha*gradient[0, [0]]
            w -= alpha*gradient[1:, [0]]

            # calculate loss (<!> in terms of the whole data <!>)
            y_hat = w0 + w.T @ X
            error = y - y_hat
            loss_MSE = np.mean(np.square(error))

            if verbose:
                if i == max_epochs or i%(max_epochs//10) == 0:
                    print("Epoch:", i, "-", "loss (MSE):", loss_MSE, "w0:", w0, "w1:", w)

    return w0, w

* pre-process features and run gradient descent

In [56]:
# scale features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(boston_df[["RM", "LSTAT"]]).T
y = boston_targets.reshape(1, -1)
print("X.shape:", X.shape, "y.shape:", y.shape, "\n")

# run gradient descent
w0, w = gradient_descent_MBGDp(X, y, alpha=0.01, max_epochs=1000, batch_size=30, verbose=True)

# result
print("\n=RESULT-")
print("w0:", w0)
print("w", w)

X.shape: (2, 506) y.shape: (1, 506) 

Epoch: 0 - loss (MSE): 567.2250927629614 w0: [0.41353333] w1: [[0.21118173]
 [0.11625443]]
Epoch: 0 - loss (MSE): 542.7895112848258 w0: [0.83382878] w1: [[0.42504999]
 [0.21151116]]
Epoch: 0 - loss (MSE): 518.1392189598428 w0: [1.26864819] w1: [[0.65529552]
 [0.29207843]]
Epoch: 0 - loss (MSE): 494.12115632033255 w0: [1.69338585] w1: [[0.89614794]
 [0.39426111]]
Epoch: 0 - loss (MSE): 478.0108246502634 w0: [1.98799585] w1: [[1.02658639]
 [0.5371618 ]]
Epoch: 0 - loss (MSE): 452.07615796064226 w0: [2.47174746] w1: [[1.3122922]
 [0.6192097]]
Epoch: 0 - loss (MSE): 420.237402226281 w0: [3.0759416] w1: [[1.70976219]
 [0.69994868]]
Epoch: 0 - loss (MSE): 394.35643754954594 w0: [3.5818841] w1: [[2.04599772]
 [0.79636541]]
Epoch: 0 - loss (MSE): 369.5305871018186 w0: [4.08502278] w1: [[2.39049798]
 [0.87851357]]
Epoch: 0 - loss (MSE): 347.3936690398057 w0: [4.56461714] w1: [[2.69558737]
 [0.93472625]]
Epoch: 0 - loss (MSE): 332.48655726323415 w0: [4.90702

* predict

In [57]:
predictions = w0 + w.T @ X
print(predictions.shape)
# print(predictions)

boston_df["PREDICTED_MDEV"] = predictions.ravel()
display(boston_df)

(1, 506)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,PREDICTED_MDEV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,28.830077
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,25.370027
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,32.508278
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,32.272631
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,31.477882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,25.891933
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,23.896501
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,30.418568
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,28.961739


# mini-batch gradient descent (MBGD): Keras & linear regression

In [58]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [59]:
model = Sequential([Dense(1, input_shape=(2, ), activation=None, kernel_initializer="zeros", bias_initializer="ones")])

In [60]:
model.compile(optimizer=Adam(learning_rate=0.01), loss="mse", metrics=["mse"])
model.fit(X.T, y.T, batch_size=30, epochs=1000)  # default batch_size is 32

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.src.callbacks.History at 0x797c01353700>

In [61]:
# predict
predictions_keras = model.predict(X.T)

boston_df["PREDICTED_MDEV_KERAS"] = predictions_keras
display(boston_df)



Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,PREDICTED_MDEV,PREDICTED_MDEV_KERAS
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,28.830077,28.971420
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,25.370027,25.496929
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,32.508278,32.634369
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,32.272631,32.409843
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,31.477882,31.599134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,25.891933,26.010813
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,23.896501,24.033394
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,30.418568,30.543890
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,28.961739,29.089016
