In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.4 is required in this notebook
# Earlier 2.x versions will mostly work the same, but with a few bugs
import tensorflow as tf
from tensorflow import keras

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Computing gradient with Autodiff

In [2]:
def f(w1, w2):
    return 3 * w1 **2 + 2 * w1 * w2


In [3]:
w1, w2 = 5, 3
eps = 1e-6
(f(w1 + eps,w2) - f(w1,w2))/eps


36.000003007075065

In [4]:
(f(w1, w2 + eps) - f(w1,w2)) / eps

10.000000003174137

In [5]:
w1, w2 =tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape :
    z = f(w1,w2)
gradients = tape.gradient(z,[w1,w2])

In [6]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

1. Default if want call tape is must delete 

In [7]:
# default after calls the tape the record has been del, if want call multiple 
# tape must be custom tape

# if want use the default
with tf.GradientTape() as tape:
    z = f(w1,w2)
dz_w1 = tape.gradient(z,w1)
dz_w2 = tape.gradient(z,w2)

RuntimeError: A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)

In [8]:
# if want used custom gradient
with tf.GradientTape(persistent=True)as tape:
    z = f(w1,w2)
dz_w1 = tape.gradient(z,w1)
dz_w2 = tape.gradient(z,w2)
del tape #dont be leak memory

In [9]:
dz_w1,dz_w2

(<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>)

2. Gradient tape only record the variable type

if want used the constant type isnt worked

-> just ill be result is none, the recorded on gradient tape isnt recored right 

In [10]:
c1,c2 = tf.constant(5.),tf.constant(3.)
with tf.GradientTape() as tape:
    z=f(c1,c2)
gradients=tape.gradient(z,[c1,c2])

In [11]:
gradients

[None, None]

solution is wused tf.watch(), thats ill be force the constant type to be readed on gradient tape

In [12]:
with tf.GradientTape() as tape:
    tape.watch(c1)
    tape.watch(c2)
    z = f(c1,c2)
gradients=tape.gradient(z,[c1,c2])

In [13]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

3. Multiple loss on the vector

isnt be multiple loss u must get the induvidual gradients on vector gradient case

So if you ever need to get the individual gradients (e.g., the gradients of each loss with regard to the model parameters), you must
call the tape’s jacobian() method

In [14]:
with tf.GradientTape() as tape:
    z1 = f(w1, w2 + 2.)
    z2 = f(w1, w2 + 5.)
    z3 = f(w1, w2 + 7.)

tape.gradient([z1, z2, z3], [w1, w2])

[<tf.Tensor: shape=(), dtype=float32, numpy=136.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=30.0>]

In [15]:
with tf.GradientTape(persistent=True) as tape:
    z1 = f(w1, w2 + 2.)
    z2 = f(w1, w2 + 5.)
    z3 = f(w1, w2 + 7.)

tf.reduce_sum(tf.stack([tape.gradient(z, [w1, w2]) for z in (z1, z2, z3)]), axis=0)
del tape

In [16]:
with tf.GradientTape(persistent=True) as hessian_tape:#one deriative
    with tf.GradientTape() as jacobian_tape:#second deriative
        z=f(w1,w2)
    jacobians = jacobian_tape.gradient(z,[w1,w2])
hessians = [hessian_tape.gradient(jacobians,[w1,w2])
            for jacobian in jacobians]
del hessian_tape

In [17]:
# thats the single value gradients must be converted from multiple loss 
jacobians

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [18]:
#mutliple loss -> vector loss
hessians

[[<tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=2.0>],
 [<tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=2.0>]]

4. stop gradients

In [19]:
def f(w1,w2):
    return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 * w2)
with tf.GradientTape() as tape:
    z = f(w1,w2)
gradients =tape.gradient(z,[w1,w2])

In [20]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=30.0>, None]

-> thats gradient on 2 *w1*w2 must be stoped on backprogation proses of compute the gradient 

5. Record the activation function

-> will be resulted nan 

In [21]:
def my_softplus(z): # return value is just tf.nn.softplus(z)
    return tf.math.log(tf.exp(z) + 1.0)

In [22]:
x = tf.Variable(100.)
with tf.GradientTape() as tape:
    z = my_softplus(x)

tape.gradient(z, [x])

[<tf.Tensor: shape=(), dtype=float32, numpy=nan>]

In [26]:
# used decorator to solve thats problem
@tf.custom_gradient
def my_better_softplus(z):
    def my_softplus_gradients(grad):
        return grad / (1 + 1 / tf.exp(z))
    return tf.math.log(tf.exp(z)+1),my_softplus_gradients

In [24]:
def my_better_softplus(z):
    return tf.where(z > 30., z, tf.math.log(tf.exp(z) + 1.))

In [27]:
x = tf.Variable([1000.])
with tf.GradientTape() as tape:
    z = my_better_softplus(x)

z, tape.gradient(z, [x])

(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([inf], dtype=float32)>,
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>])

# Custom training loops

In [3]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)


model with l2 (ridge regulation)

In [4]:
l2_reg = keras.regularizers.l2(5e-2)
model = keras.models.Sequential([
    keras.layers.Dense(30,activation="elu",kernel_initializer="he_normal",
                       kernel_regularizer=l2_reg),
    keras.layers.Dense(1,kernel_regularizer=l2_reg)
])

random sample dari setiap batch istance data

In [5]:
def random_batch(X,y,batch_size=32):
    idx = np.random.randint(len(X),size=batch_size)
    return X[idx],y[idx]


set the status bar per istance training

In [6]:
def print_status_bar(iteration,total,loss,metrics=None):
    metrics = " - ".join(["{} : {:.4f}".format(m.name,m.result())
                          for m in [loss] + (metrics or [])])
    end = "" if iteration <10 else "\n"
    print("\r{}/{} - ".format(iteration,total) + metrics,end=end)

In [7]:
data = " - ".join("fuck u")
print(data)

f - u - c - k -   - u


In [33]:
for m in [10,20,30] + ([1,2,3,4] or []):
    print(m)

10
20
30
1
2
3
4


In [36]:
tf.reduce_mean([10,20,30,40,50])


<tf.Tensor: shape=(), dtype=int32, numpy=30>

load dataset

In [13]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full,X_test,y_train_full,y_test = train_test_split(housing.data,
                                                           housing.target.reshape(-1,1),random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
X_train_scalled = scaler.fit_transform(X_train)
X_valid_scalled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)


set the hyperparameter training

In [9]:
n_epochs = 5 # perulangan
batch_size = 32
# len(X_trian) 11610
n_steps = len(X_train) // batch_size # 362 -> jumlah step epochs n_epochs
optimizer = keras.optimizers.Nadam(learning_rate=1e-2)
loss_fn = keras.losses.mean_squared_error
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.MeanAbsoluteError()]


In [10]:
for epoch in range(1,n_epochs + 1): #5 perulangan
    print("Epochs {} / {}".format(epoch,n_epochs))
    for step in range(1,n_steps +1): # setiap perulangan 362 steps
        
        #random batch input data
        X_batch,y_batch = random_batch(X_train_scalled,y_train)
        
        # set up gradient
        with tf.GradientTape() as tape :
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch,y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss,model.trainable_variables)
        optimizer.apply_gradients(zip(gradients,model.trainable_variables))
        
        #assign variable
        for variable in model.variables:
            if variable.constraint is not None :
                variable.assign(variable.constraint(variable))
        mean_loss(loss)
        
        # assign set up metrics
        for metric in metrics:
            metric(y_batch, y_pred)
        
        print_status_bar(step * batch_size, len(y_train),mean_loss,metrics)
    print_status_bar(len(y_train),len(y_train),mean_loss,metrics)
    for metric in [mean_loss] + metrics:
        metric.reset_states()

Epochs 1 / 5
32/11610 - mean : 11.7587 - mean_absolute_error : 2.3382
64/11610 - mean : 11.5956 - mean_absolute_error : 2.1291
96/11610 - mean : 11.2571 - mean_absolute_error : 2.1410
128/11610 - mean : 11.6003 - mean_absolute_error : 2.1813
160/11610 - mean : 11.3514 - mean_absolute_error : 2.1610
192/11610 - mean : 10.5690 - mean_absolute_error : 2.0529
224/11610 - mean : 9.9891 - mean_absolute_error : 1.9601
256/11610 - mean : 9.9036 - mean_absolute_error : 1.9571
288/11610 - mean : 9.6919 - mean_absolute_error : 1.9425
320/11610 - mean : 9.4509 - mean_absolute_error : 1.8992
352/11610 - mean : 9.2153 - mean_absolute_error : 1.8720
384/11610 - mean : 8.9834 - mean_absolute_error : 1.8364
416/11610 - mean : 8.7050 - mean_absolute_error : 1.7991
448/11610 - mean : 8.4880 - mean_absolute_error : 1.7707
480/11610 - mean : 8.3065 - mean_absolute_error : 1.7439
512/11610 - mean : 8.1106 - mean_absolute_error : 1.7093
544/11610 - mean : 7.9486 - mean_absolute_error : 1.6849
576/11610 - mea

In [14]:
try:
    from tqdm.notebook import trange
    from collections import OrderedDict
    with trange(1, n_epochs + 1, desc="All epochs") as epochs:
        for epoch in epochs:
            with trange(1, n_steps + 1, desc="Epoch {}/{}".format(epoch, n_epochs)) as steps:
                for step in steps:
                    X_batch, y_batch = random_batch(X_train_scalled, y_train)
                    with tf.GradientTape() as tape:
                        y_pred = model(X_batch)
                        main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                        loss = tf.add_n([main_loss] + model.losses)
                    gradients = tape.gradient(loss, model.trainable_variables)
                    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                    for variable in model.variables:
                        if variable.constraint is not None:
                            variable.assign(variable.constraint(variable))                    
                    status = OrderedDict()
                    mean_loss(loss)
                    status["loss"] = mean_loss.result().numpy()
                    for metric in metrics:
                        metric(y_batch, y_pred)
                        status[metric.name] = metric.result().numpy()
                    steps.set_postfix(status)
            for metric in [mean_loss] + metrics:
                metric.reset_states()
except ImportError as ex:
    print("To run this cell, please install tqdm, ipywidgets and restart Jupyter")

All epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/362 [00:00<?, ?it/s]