# Ex. 12

## a)

In [2]:
import tensorflow as tf

In [3]:
# def build(self, input_shape):
#     self.alpha = self.add_weight(name="alpha", shape=[input_shape[-1:]], dtype=tf.float32, initializer="ones")
#     self.beta = self.add_weight(name="beta", shape=[input_shape[-1:]], dtype=tf.float32, initializer="zeros")

## b)

In [5]:
class MyDense(tf.keras.layers.Layer):
    def __init__(self, eps=0.001, **kwargs):
        super().__init__(**kwargs)
        self.eps = eps

    def build(self, input_shape):
        self.alpha = self.add_weight(name="alpha", shape=input_shape[-1:], dtype=tf.float32, initializer="ones")
        self.beta = self.add_weight(name="beta", shape=input_shape[-1:], dtype=tf.float32, initializer="zeros")

    def call(self, X):
        mu, var = tf.nn.moments(X, axes=-1, keepdims=True)
        return self.alpha * (X - mu)/(tf.sqrt(var + self.eps)) + self.beta

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "eps": self.eps}

## c)

In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

In [14]:
import numpy as np
from tensorflow.keras.layers import LayerNormalization
X = X_train.astype(np.float32)

custom_layer_norm = MyDense()
keras_layer_norm = tf.keras.layers.LayerNormalization()

tf.reduce_mean(tf.keras.losses.mean_absolute_error(keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=3.9357733e-08>

In [15]:
tf.keras.utils.set_random_seed(42)
random_alpha = np.random.rand(X.shape[-1])
random_beta = np.random.rand(X.shape[-1])

custom_layer_norm.set_weights([random_alpha, random_beta])
keras_layer_norm.set_weights([random_alpha, random_beta])

tf.reduce_mean(tf.keras.losses.mean_absolute_error(keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=1.7553074e-08>

# Ex. 13

## a)

In [16]:
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full.astype(np.float32) / 255.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test.astype(np.float32) / 255.

In [17]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])

In [18]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [25]:
from collections import OrderedDict

def random_batch(X, y, batch_size=32):
    idx = np.random.randint(len(X), size=batch_size)
    return X[idx], y[idx]

def print_status_bar(step, total, loss, metrics=None):
    metrics = " - ".join([f"{m.name}: {m.result():.4f}" for m in [loss] + (metrics or [])])
    end = "" if step < total else "\n"
    print(f"\r{step}/{total} - " + metrics, end=end)

for epoch in range(1, n_epochs + 1):
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        for variable in model.variables:
            if variable.constraint is not None:
                variable.assign(variable.constraint(variable))
        status = OrderedDict()
        mean_loss(loss)
        status["loss"] = mean_loss.result().numpy()
        for metric in metrics:
            metric(y_batch, y_pred)
            status[metric.name] = metric.result().numpy()
        
        print_status_bar(step, n_steps, mean_loss, metrics)
        # steps.set_postfix(status)
        
        y_pred = model(X_valid)
        status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
        status["val_accuracy"] = np.mean(tf.keras.metrics.sparse_categorical_accuracy(
            tf.constant(y_valid, dtype=np.float32), y_pred))
        # steps.set_postfix(status)
    for metric in [mean_loss] + metrics:
        metric.reset_states()

1718/1718 - mean: 0.4785 - sparse_categorical_accuracy: 0.8277
1718/1718 - mean: 0.3926 - sparse_categorical_accuracy: 0.8567
1718/1718 - mean: 0.3728 - sparse_categorical_accuracy: 0.8648
1718/1718 - mean: 0.3703 - sparse_categorical_accuracy: 0.8685
1718/1718 - mean: 0.3583 - sparse_categorical_accuracy: 0.8715


## b)

In [26]:
lower_layers = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu"),
])
upper_layers = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="softmax"),
])
model = tf.keras.Sequential([
    lower_layers, upper_layers
])

In [27]:
lower_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-4)
upper_optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-3)

In [28]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [31]:
for epoch in range(1, n_epochs + 1):
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape(persistent=True) as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        for layers, optimizer in ((lower_layers, lower_optimizer),
                                    (upper_layers, upper_optimizer)):
            gradients = tape.gradient(loss, layers.trainable_variables)
            optimizer.apply_gradients(zip(gradients, layers.trainable_variables))
        del tape
        for variable in model.variables:
            if variable.constraint is not None:
                variable.assign(variable.constraint(variable))                    
        status = OrderedDict()
        mean_loss(loss)
        status["loss"] = mean_loss.result().numpy()
        for metric in metrics:
            metric(y_batch, y_pred)
            status[metric.name] = metric.result().numpy()

        print_status_bar(step, n_steps, mean_loss, metrics)

    for metric in [mean_loss] + metrics:
        metric.reset_states()

1718/1718 - mean: 1.0526 - sparse_categorical_accuracy: 0.6892
1718/1718 - mean: 0.6439 - sparse_categorical_accuracy: 0.7870
1718/1718 - mean: 0.5734 - sparse_categorical_accuracy: 0.8055
1718/1718 - mean: 0.5419 - sparse_categorical_accuracy: 0.8118
1718/1718 - mean: 0.5264 - sparse_categorical_accuracy: 0.8183
