In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
# Report the name and version of every core library this notebook relies on.
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

# Last expression of the cell, so the notebook displays its return value.
tf.test.is_gpu_available()

2.0.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.17.4
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


False

In [2]:
# Numerical (finite-difference) differentiation, starting with a univariate function.
def f(x):
    """Return the quadratic 4*x**2 + 2*x - 3 evaluated at ``x``."""
    quadratic_term = 4. * x**2
    linear_term = 2*x
    return quadratic_term + linear_term - 3

def approximate_derivative(func, x, eps=1e-3):
    """Estimate the derivative of ``func`` at ``x`` with a central difference.

    Args:
        func: Callable taking a single argument.
        x: Point at which the derivative is estimated.
        eps: Half-width of the symmetric step around ``x``.

    Returns:
        ``(func(x + eps) - func(x - eps)) / (2 * eps)``.
    """
    forward_value = func(x + eps)
    backward_value = func(x - eps)
    return (forward_value - backward_value) / (2. * eps)

# Central-difference estimate of f'(1); analytically f'(x) = 8*x + 2, so the exact value is 10.
approximate_derivative(f,1)

9.999999999999343

In [3]:
# Approximate the partial derivatives of a multivariate function.
def g(x1, x2):
    """Return ``(x1 + 5) * x2**2``."""
    shifted = x1 + 5
    squared = x2 ** 2
    return shifted * squared

def approximate_gradient(gfunc, x1, x2, eps=1e-3):
    """Estimate the gradient of a two-argument function by central differences.

    Args:
        gfunc: Callable taking two positional arguments ``(x1, x2)``.
        x1: First coordinate of the evaluation point.
        x2: Second coordinate of the evaluation point.
        eps: Half-width of the difference step, forwarded to
            ``approximate_derivative`` for both partial derivatives.

    Returns:
        Tuple ``(d gfunc/d x1, d gfunc/d x2)`` evaluated at ``(x1, x2)``.
    """
    # Freeze one argument at a time so the univariate helper can be reused.
    # Differentiate the function that was passed in (not a global), and honor eps.
    dg_x1 = approximate_derivative(lambda x: gfunc(x, x2), x1, eps)  # partial w.r.t. x1
    dg_x2 = approximate_derivative(lambda x: gfunc(x1, x), x2, eps)  # partial w.r.t. x2
    return dg_x1, dg_x2
# Gradient of g at (2, 3); analytically (x2**2, 2*(x1 + 5)*x2) = (9, 42).
print(approximate_gradient(g,2.,3.))

(8.999999999993236, 41.999999999994486)


# 使用tf.GradientTape来实现自动求导

In [5]:
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)

# Partial derivative of z with respect to x1.
dg_x1 = tape.gradient(z, x1)
print(dg_x1)

# Without persistent=True a tape can be used only once, so this second
# gradient() call raises RuntimeError; catch it and print the message.
try:
    dg_x2 = tape.gradient(z, x2)
except RuntimeError as err:
    print(err)

tf.Tensor(9.0, shape=(), dtype=float32)
GradientTape.gradient can only be called once on non-persistent tapes.


In [8]:
# A tape that must serve several gradient() calls needs persistent=True,
# and it then has to be released explicitly once it is no longer needed.
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
with tf.GradientTape(persistent=True) as tape:
    z = g(x1, x2)

# Partial derivative with respect to x1.
dg_x1 = tape.gradient(z, x1)
print(dg_x1)

# The same tape can now also produce the partial derivative with respect to x2.
dg_x2 = tape.gradient(z, x2)
print(dg_x2)

# Release the persistent tape manually.
del tape

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor(42.0, shape=(), dtype=float32)


In [10]:
# Both partial derivatives can be requested in a single gradient() call
# by passing the sources as a list.
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)
dg_x1x2 = tape.gradient(z, [x1, x2])
print(dg_x1x2)

[<tf.Tensor: id=155, shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: id=161, shape=(), dtype=float32, numpy=42.0>]


In [11]:
# Can we still differentiate when x1 and x2 are constants instead of variables?
x1, x2 = tf.constant(2.0), tf.constant(3.0)
with tf.GradientTape() as tape:
    # Constants must be watched explicitly; otherwise no gradient
    # can be taken with respect to them.
    tape.watch([x1, x2])
    z = g(x1, x2)
dg_x1x2 = tape.gradient(z, [x1, x2])
print(dg_x1x2)

[<tf.Tensor: id=170, shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: id=176, shape=(), dtype=float32, numpy=42.0>]


In [12]:
# Several functions share one variable: passing a list of targets to gradient()
# yields the sum of their derivatives with respect to that variable.
x = tf.Variable(5.0)
with tf.GradientTape() as tape:
    z1 = 3 * x
    z2 = x ** 2
dz1z2_x = tape.gradient([z1,z2],x)
print(dz1z2_x) # the result is the sum dz1/dx + dz2/dx = 3 + 10 = 13

tf.Tensor(13.0, shape=(), dtype=float32)


In [15]:
# Second-order derivatives via nested tapes.
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
# The outer tape is queried once per first-order gradient below, so it must
# be persistent. The inner tape is queried only once and can stay
# non-persistent, which releases its resources right after gradient().
with tf.GradientTape(persistent=True) as out_tape:
    with tf.GradientTape() as inner_tape:
        z = g(x1, x2)
    # First-order gradients [dz/dx1, dz/dx2]; computed inside the outer
    # tape's context so that the outer tape records this computation.
    inner_grads = inner_tape.gradient(z, [x1, x2])
out_grads = [out_tape.gradient(inner_grad, [x1, x2]) for inner_grad in inner_grads]
print(out_grads)
# out_grads is a 2 x 2 nested list of second derivatives:
#   out_grads[0][0] = d2z/dx1dx1  (None: dz/dx1 = x2**2 does not depend on x1)
#   out_grads[0][1] = d2z/dx1dx2  (differentiate w.r.t. x1 first, then x2)
#   out_grads[1][0] = d2z/dx2dx1  (differentiate w.r.t. x2 first, then x1)
#   out_grads[1][1] = d2z/dx2dx2
# Only the persistent outer tape needs to be released explicitly.
del out_tape

[[None, <tf.Tensor: id=294, shape=(), dtype=float32, numpy=6.0>], [<tf.Tensor: id=305, shape=(), dtype=float32, numpy=6.0>, <tf.Tensor: id=303, shape=(), dtype=float32, numpy=14.0>]]


# 使用自动求导来模拟梯度下降，寻找极值点

In [16]:
learning_rate = 0.01
x = tf.Variable(0.0)
for _ in range(100): # run 100 gradient-descent steps
    with tf.GradientTape() as tape:
        z = f(x) # f plays the role of the loss; its minimum is at x = -0.25 (where f'(x) = 8*x + 2 = 0)
    dz_x = tape.gradient(z,x) # derivative of the loss w.r.t. x
    # gradient-descent update: x <- x - learning_rate * dz_x
    x.assign_sub(learning_rate * dz_x)
# value of x after 100 steps
print(x) # expected to approach the theoretical minimizer -0.25

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.24994019>


# 将梯度下降和optimizer结合起来一起使用

In [17]:
learning_rate = 0.01
x = tf.Variable(0.0)
# Define an optimizer (SGD with the same learning rate).
optimizer = tf.optimizers.SGD(learning_rate=learning_rate)
for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    # Let the optimizer apply the (gradient, variable) pair.
    optimizer.apply_gradients([(dz_dx, x)])
print(x)  # same result as the manual update loop

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.24994019>
