In [None]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical

import warnings
warnings.filterwarnings("ignore")

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# =========== Loading Datasets ===============
# Each .npz archive is read through its 'x' (inputs) and 'y' (labels) entries.
train_dataset = np.load('train_dataset_10k_60.npz')
val_dataset = np.load('validation_dataset_10k_40.npz')
test_dataset = np.load('test_dataset.npz')

# Flatten every sample to a 784-long float32 vector and divide by 255
# (presumably 28x28 8-bit images scaled to [0, 1] -- TODO confirm).
# The number of samples is inferred with -1 instead of being hard-coded,
# so the cell keeps working if a split is regenerated with a different size.
x_train = train_dataset['x'].reshape(-1, 784).astype("float32") / 255
y_train = train_dataset['y'].astype("float32")

x_val = val_dataset['x'].reshape(-1, 784).astype("float32") / 255
y_val = val_dataset['y'].astype("float32")

x_test = test_dataset['x'].reshape(-1, 784).astype("float32") / 255
y_test = test_dataset['y'].astype("float32")

# Sanity check: display the shapes of all six arrays.
x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape

((6000, 784), (6000,), (4000, 784), (4000,), (10010, 784), (10010,))

In [None]:
# Fully connected classifier: 784 inputs -> 100 ReLU units -> 10 outputs.
# The last layer has no activation, so the model emits raw logits; the
# loss below is configured with from_logits=True to match.
model = keras.Sequential(
    [
        layers.Flatten(input_shape=(784,)),
        layers.Dense(100, activation='relu'),
        layers.Dense(10),
    ]
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
@tf.function
def loss(w1,w2,lamda1, lamda2,loss_fn,y_train,logits): # Lambda
    """Return the data loss plus a per-layer weighted L2 penalty.

    Args:
        w1, w2: weight tensors that are penalised.
        lamda1, lamda2: log-scale regularisation strengths; the penalty
            coefficients actually applied are exp(lamda1) and exp(lamda2).
        loss_fn: callable invoked as loss_fn(y_train, logits).
        y_train: labels of the current batch; its static leading dimension
            is used to scale the penalty.
        logits: model outputs for the same batch.

    Returns:
        loss_fn(y_train, logits)
        + (exp(lamda1) * l2_loss(w1) + exp(lamda2) * l2_loss(w2)) / (2 * batch_size)
    """
    data_term = loss_fn(y_train, logits)
    penalty_w1 = tf.math.exp(lamda1) * tf.nn.l2_loss(w1)
    penalty_w2 = tf.math.exp(lamda2) * tf.nn.l2_loss(w2)
    batch_size = y_train.shape[0]
    return data_term + (penalty_w1 + penalty_w2) / (2 * batch_size)

In [None]:
# Snapshot the freshly initialised parameters of both Dense layers so that
# every training run can be restarted from exactly the same point.
wt_layer1_init = model.layers[1].get_weights()
wt_layer2_init = model.layers[2].get_weights()

# Training pipeline: shuffle with a 1024-element buffer, then mini-batches of 64.
train_df = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=1024)
    .batch(64)
)

def fmin_loss(lamda1, lamda2, l_rate, momentum, epochs= 50, nesterov = True):      # lamda, not exp(lamda), Works with both tf.Variable and tf.constant type lambda input, (or just scalar)
    """Train the global ``model`` with SGD on the regularised loss.

    The function uses the notebook-level ``model``, ``train_df``, ``loss_fn``
    and ``loss``. After training it copies the learned layer weights and then
    restores ``wt_layer1_init`` / ``wt_layer2_init`` into the model, so the
    model is back at its initial weights when the function returns.

    Args:
        lamda1, lamda2: log-scale L2 strengths for layer 1 and layer 2
            (``loss`` applies ``exp(lamda)``). Scalars, tensors or variables.
        l_rate: SGD learning rate.
        momentum: SGD momentum.
        epochs: number of passes over ``train_df``; must be at least 1.
        nesterov: whether to use Nesterov momentum.

    Returns:
        ``[total_loss1, wt_layer1, wt_layer2]`` where ``total_loss1`` is the
        regularised loss of the last mini-batch of the last epoch and
        ``wt_layer1`` / ``wt_layer2`` are the ``get_weights()`` lists of the
        two Dense layers after training.

    Raises:
        ValueError: if ``epochs`` is smaller than 1 (previously this surfaced
            as an UnboundLocalError on the return statement).
    """
    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    tf.keras.backend.clear_session()
    optimizer = keras.optimizers.SGD(learning_rate=l_rate,momentum = momentum , nesterov =nesterov )

    # ``loss`` is a tf.function: Python / NumPy scalars are traced per value,
    # so a hyper-parameter sweep would retrace it for every new lambda.
    # Passing float32 tensors lets one trace be reused across lambda values.
    lamda1 = tf.convert_to_tensor(lamda1, dtype=tf.float32)
    lamda2 = tf.convert_to_tensor(lamda2, dtype=tf.float32)

    # Loop-invariant lookups: the kernel variables and the trainable list are
    # the same objects on every step, only their values change.
    w1 = model.layers[1].weights[0]
    w2 = model.layers[2].weights[0]
    vars_list = model.trainable_weights

    for epoch in range(epochs):
        for x_batch, y_batch in train_df:
            with tf.GradientTape() as tape:
                logits = model(x_batch, training=True)
                total_loss1 = loss(w1,w2,lamda1, lamda2,loss_fn,y_batch,logits)

            grads = tape.gradient(total_loss1, vars_list)      # for ref  - https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough
            optimizer.apply_gradients(zip(grads,vars_list))

    # Copy the trained parameters out, then put the model back to its
    # initial state so the next call starts from the same weights.
    wt_layer1 = model.layers[1].get_weights()
    wt_layer2 = model.layers[2].get_weights()
    model.layers[1].set_weights(wt_layer1_init)
    model.layers[2].set_weights(wt_layer2_init)

    # NOTE(review): total_loss1 is a single-batch value, so it is a noisy
    # criterion for comparing hyper-parameters -- confirm this is intended.
    return [total_loss1, wt_layer1, wt_layer2]

In [None]:
# Exhaustive search over SGD learning rate and momentum with the
# regularisation strengths held fixed at lamda1 = -5.0, lamda2 = -4.0.
l_rates = np.linspace(0.1, 0.9, 9)
momentas = np.arange(0.01, 0.1, 0.01, dtype='float32')

# Best (lowest) loss seen so far and the setting that produced it.
mn_loss = 1e12
glbl_l_rate = None
glbl_momentum = None

for l_rate in l_rates:
    for momentum in momentas:
        # Every run starts from the same initial weights.
        model.layers[1].set_weights(wt_layer1_init)
        model.layers[2].set_weights(wt_layer2_init)

        loss_, _, _ = fmin_loss(-5.0, -4.0, l_rate, momentum)
        print(f'loss = {loss_} , l_rate = {l_rate}, momentum = {momentum}')

        if loss_ < mn_loss:
            mn_loss, glbl_l_rate, glbl_momentum = loss_, l_rate, momentum

print('###############################################################')
print('--------------------Result-------------------------------------')
print(f'final_loss = {mn_loss} , l_rate = {glbl_l_rate} , glbl_momentum = {glbl_momentum}')

loss = 0.03374863788485527 , l_rate = 0.1, momentum = 0.009999999776482582
loss = 0.03230967000126839 , l_rate = 0.1, momentum = 0.019999999552965164
loss = 0.0493498221039772 , l_rate = 0.1, momentum = 0.029999999329447746
loss = 0.0330284982919693 , l_rate = 0.1, momentum = 0.03999999910593033
loss = 0.03666314110159874 , l_rate = 0.1, momentum = 0.04999999701976776
loss = 0.03719747066497803 , l_rate = 0.1, momentum = 0.059999994933605194
loss = 0.03735111281275749 , l_rate = 0.1, momentum = 0.07000000029802322
loss = 0.03319472447037697 , l_rate = 0.1, momentum = 0.07999999821186066
loss = 0.03180369362235069 , l_rate = 0.1, momentum = 0.08999999612569809
loss = 0.02997908741235733 , l_rate = 0.2, momentum = 0.009999999776482582
loss = 0.0312800258398056 , l_rate = 0.2, momentum = 0.019999999552965164
loss = 0.025714974850416183 , l_rate = 0.2, momentum = 0.029999999329447746
loss = 0.02788432687520981 , l_rate = 0.2, momentum = 0.03999999910593033
loss = 0.033578187227249146 , l_r

In [None]:
# 30 evenly spaced log-scale regularisation strengths in [-10, 0] and the
# full 30 x 30 grid of [lamda1, lamda2] pairs built from them.
lamdas = np.linspace(-10, 0, 30, dtype='float32')
lamda_grid = [
    [first, second]
    for first in lamdas
    for second in lamdas
]

In [None]:
# Grid search over [lamda1, lamda2]: train once per pair with the selected
# learning rate / momentum and keep the pair with the lowest validation loss.
loss_from_val = 1000000000
loss_from_train = None
loss_from_test = None
corres_lamda = None

for lamda in lamda_grid:
    # Start every run from the same initial weights.
    model.layers[1].set_weights(wt_layer1_init)
    model.layers[2].set_weights(wt_layer2_init)

    min_loss, final_wt1, final_wt2 = fmin_loss(lamda[0], lamda[1], glbl_l_rate, glbl_momentum)

    # fmin_loss puts the initial weights back before returning, so load the
    # trained weights into the model again before evaluating it.
    model.layers[1].set_weights(final_wt1)
    model.layers[2].set_weights(final_wt2)

    # Unregularised loss on each split.
    train_logits = model(x_train, training=False)
    training_loss = loss_fn(y_train, train_logits)

    val_logits = model(x_val, training=False)
    val_loss = loss_fn(y_val, val_logits)

    test_logits = model(x_test, training=False)
    test_loss = loss_fn(y_test, test_logits)

    # Model selection uses the validation loss only; a line is printed each
    # time a new best is found.
    if val_loss < loss_from_val:
        loss_from_val, loss_from_train, loss_from_test = val_loss, training_loss, test_loss
        corres_lamda = lamda
        print(f'min  loss at lamda = {lamda} is train_loss = {training_loss}, val_loss = {val_loss} , test_loss = {test_loss}')

print('###############################################################')
print('--------------------Result-------------------------------------')
print(f'min  loss at lamda = {corres_lamda} is train_loss = {loss_from_train}, val_loss = {loss_from_val} , test_loss = {loss_from_test}')


min  loss at lamda = [-10.0, -10.0] is train_loss = 0.004425769671797752, val_loss = 0.1872900128364563 , test_loss = 0.25222790241241455
min  loss at lamda = [-10.0, -9.310345] is train_loss = 0.004428269807249308, val_loss = 0.18608330190181732 , test_loss = 0.24929757416248322
min  loss at lamda = [-10.0, -8.275862] is train_loss = 0.0044098771177232265, val_loss = 0.1858312040567398 , test_loss = 0.2473851591348648
min  loss at lamda = [-10.0, -7.2413793] is train_loss = 0.004395514726638794, val_loss = 0.1837134063243866 , test_loss = 0.24948015809059143
min  loss at lamda = [-10.0, -4.827586] is train_loss = 0.004941008519381285, val_loss = 0.18226398527622223 , test_loss = 0.24274249374866486
min  loss at lamda = [-10.0, -3.7931035] is train_loss = 0.005830907262861729, val_loss = 0.17781183123588562 , test_loss = 0.23657570779323578
min  loss at lamda = [-10.0, -3.4482758] is train_loss = 0.0064658476039767265, val_loss = 0.17337410151958466 , test_loss = 0.23160186409950256
mi

In [None]:
# Random search candidates: 30 log-scale strengths drawn uniformly from
# [-10, 0), sorted, and combined into all 30 x 30 [lamda1, lamda2] pairs.
# A seeded generator is used so that Restart & Run All reproduces the same
# candidates (the draw was previously unseeded and changed on every run).
lamda_rng = np.random.default_rng(42)
random_lamda = lamda_rng.random(30)     #value will be only between [0,1)
ran_lambda = np.sort(10*random_lamda.astype('float32') - 10)
ran_grid = [[i,j] for i in ran_lambda for j in ran_lambda]

In [None]:
# Random search over [lamda1, lamda2]: same procedure as the grid search,
# but iterating over the randomly drawn candidates in ran_grid.
loss_from_val = 1000000000
loss_from_train = None
loss_from_test = None
corres_lamda = None

for lamda in ran_grid:
    # Start every run from the same initial weights.
    model.layers[1].set_weights(wt_layer1_init)
    model.layers[2].set_weights(wt_layer2_init)

    min_loss, final_wt1, final_wt2 = fmin_loss(lamda[0], lamda[1], glbl_l_rate, glbl_momentum)

    # fmin_loss puts the initial weights back before returning, so load the
    # trained weights into the model again before evaluating it.
    model.layers[1].set_weights(final_wt1)
    model.layers[2].set_weights(final_wt2)

    # Unregularised loss on each split.
    train_logits = model(x_train, training=False)
    training_loss = loss_fn(y_train, train_logits)

    val_logits = model(x_val, training=False)
    val_loss = loss_fn(y_val, val_logits)

    test_logits = model(x_test, training=False)
    test_loss = loss_fn(y_test, test_logits)

    # Model selection uses the validation loss only; a line is printed each
    # time a new best is found.
    if val_loss < loss_from_val:
        loss_from_val, loss_from_train, loss_from_test = val_loss, training_loss, test_loss
        corres_lamda = lamda
        print(f'min  loss at lamda = {lamda} is train_loss = {training_loss}, val_loss = {val_loss} , test_loss = {test_loss}')

print('###############################################################')
print('--------------------Result-------------------------------------')
print(f'min  loss at lamda = {corres_lamda} is train_loss = {loss_from_train}, val_loss = {loss_from_val} , test_loss = {loss_from_test}')


min  loss at lamda = [-9.645136, -9.645136] is train_loss = 0.004438948817551136, val_loss = 0.18946228921413422 , test_loss = 0.250110924243927
min  loss at lamda = [-9.645136, -8.403118] is train_loss = 0.0043986160308122635, val_loss = 0.18637166917324066 , test_loss = 0.24713745713233948
min  loss at lamda = [-9.645136, -6.304161] is train_loss = 0.004547660704702139, val_loss = 0.1848156452178955 , test_loss = 0.2505200505256653
min  loss at lamda = [-9.645136, -5.3244348] is train_loss = 0.004686613101512194, val_loss = 0.18370741605758667 , test_loss = 0.24619096517562866
min  loss at lamda = [-9.645136, -4.009692] is train_loss = 0.005494807381182909, val_loss = 0.18079005181789398 , test_loss = 0.24162936210632324
min  loss at lamda = [-9.645136, -3.8369074] is train_loss = 0.005848594009876251, val_loss = 0.1783258467912674 , test_loss = 0.2404467910528183
min  loss at lamda = [-9.645136, -3.5579185] is train_loss = 0.006328620482236147, val_loss = 0.17770907282829285 , test_