<a href="https://colab.research.google.com/github/Tershire/Python_DL/blob/main/S01/L013_stochastic_and_mini_batch_gradient_descents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# 2023 OCT 28

In [14]:
import numpy as np
import pandas as pd
import sklearn

### get data

In [15]:
# california housing dataset
# from sklearn.datasets import load_boston
# from sklearn.datasets import fetch_california_housing
# cali = fetch_california_housing()

# print(cali["feature_names"])

# print(cali.DESCR)

# boston housing dataset
boston_df = pd.read_csv("./drive/MyDrive/Colab Notebooks/Python_DL/boston_housing.csv")

boston_df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

display(boston_df)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [16]:
boston_targets = boston_df.iloc[:, -1].values

# stochastic gradient descent (SGD) & linear regression

* define functions

In [17]:
# w0: <1 x 1>
# w: <M x 1>
# X: <M x N> -> pick one -> <M X 1>
# y: <1 x N> -> pick pne -> <1 X 1>

def compute_gradient_of_cost(w0, w, X, y):
    N = y.shape[1]

    ones = np.ones((1, N))

    # y_hat = w0*ones + w.T @ X
    y_hat = w0 + w.T @ X
    error = y - y_hat

    partial_w0 = -(2/N)*(ones @ error.T)
    partial_w = -(2/N)*(X @ error.T)
    gradient = np.concatenate((partial_w0, partial_w), axis=0)

    return gradient

In [28]:
def gradient_descent_SGD(X, y, alpha=0.01, max_epochs=1000, verbose=True):
    M = X.shape[0]
    N = y.shape[1]

    w0 = 0
    w = np.zeros((M, 1))

    for i in range(max_epochs):
        # pick one datum randomly out of X
        random_index = np.random.choice(N, 1)
        X_pick = X[:, random_index]
        y_pick = y[0, random_index].reshape(-1, 1)

        # descend
        gradient = compute_gradient_of_cost(w0, w, X_pick, y_pick)
        w0 -= alpha*gradient[0, [0]]
        w -= alpha*gradient[1:, [0]]

        # calculate loss (<!> in terms of the whole data <!>)
        y_hat = w0 + w.T @ X
        error = y - y_hat
        loss_MSE = np.mean(np.square(error))

        if verbose:
            if i == max_epochs or i%(max_epochs//10) == 0:
                print("Epoch:", i, "-", "loss (MSE):", loss_MSE, "w0:", w0, "w1:", w)

    return w0, w

* pre-process features and run gradient descent

In [29]:
# scale features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(boston_df[["RM", "LSTAT"]]).T
y = boston_targets.reshape(1, -1)
print("X.shape:", X.shape, "y.shape:", y.shape, "\n")

# run gradient descent
w0, w = gradient_descent_SGD(X, y, alpha=0.01, max_epochs=1000, verbose=True)

# result
print("\n=RESULT-")
print("w0:", w0)
print("w", w)

X.shape: (2, 506) y.shape: (1, 506) 

Epoch: 0 - loss (MSE): 532.526449655899 w0: [0.896] w1: [[0.80775628]
 [0.05958499]]
Epoch: 100 - loss (MSE): 79.80475535291548 w0: [15.30622307] w1: [[9.14419612]
 [2.83105474]]
Epoch: 200 - loss (MSE): 69.72018086635781 w0: [16.56207665] w1: [[11.07914493]
 [ 0.901949  ]]
Epoch: 300 - loss (MSE): 64.39161537509302 w0: [17.2751265] w1: [[12.75599792]
 [-1.01396478]]
Epoch: 400 - loss (MSE): 57.63206933054234 w0: [16.13463395] w1: [[13.47506034]
 [-3.36052247]]
Epoch: 500 - loss (MSE): 51.959632498516974 w0: [15.90021937] w1: [[14.7472216 ]
 [-5.89543977]]
Epoch: 600 - loss (MSE): 48.83156315368927 w0: [17.09917636] w1: [[16.92421549]
 [-7.17818829]]
Epoch: 700 - loss (MSE): 44.42580907650636 w0: [15.75833464] w1: [[17.12574856]
 [-9.28922474]]
Epoch: 800 - loss (MSE): 41.63880872672634 w0: [16.13333082] w1: [[ 18.23422647]
 [-10.44961327]]
Epoch: 900 - loss (MSE): 40.14649539841597 w0: [16.46731986] w1: [[ 18.9910577 ]
 [-11.38589721]]

=RESULT-
w

* predict

In [30]:
predictions = w0 + w.T @ X
print(predictions.shape)
# print(predictions)

boston_df["PREDICTED_MDEV"] = predictions.ravel()
display(boston_df)

(1, 506)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,PREDICTED_MDEV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,26.207787
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,24.167820
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,28.819197
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,28.505123
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,28.219498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,24.623140
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,23.065474
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,27.472060
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,26.496897
