In [2]:
!pip install gurobipy

import gurobipy as gp
from gurobipy import GRB

import numpy as np
import pandas as pd
import pickle as pkl
from datetime import datetime
import math

import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gurobipy
  Downloading gurobipy-10.0.1-cp310-cp310-manylinux2014_x86_64.whl (12.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gurobipy
Successfully installed gurobipy-10.0.1


In [3]:
# GOLBAL VARIABLES NEEDED TO RUN THE EXPERIMENT

# 1] ENTER DATA DIRECTORIES
base_dir  = "/content/drive/MyDrive/CIFAR_Dataset/CIFAR_10/Batch3_1+10K"
train_dir = base_dir+'/cifar10_train.npz'
val_dir   = base_dir+'/cifar10_val.npz'
test_dir  = base_dir+'/cifar10_test.npz'
# 2] ENTER DIRECTORY OF OPTUNA's TRAINED MODEL
OPTUNA_MODEL_DIRECTORY = "/content/drive/MyDrive/CIFAR_Dataset/CIFAR_10/resnet_optuna.pickle"

#3] GUROBI ENVIRONMENT WITH ACADEMIC LICENSE DETAILS
# In case you do not have a web license set up and find it difficult, Please see readmi file for further details.
ENV = gp.Env( empty=True )
ENV.setParam( 'WLSACCESSID', 'xxxxxxxxxxx' )   
ENV.setParam( 'WLSSECRET', 'xxxxxxxxx' )    
ENV.setParam( 'LICENSEID', xxxx )
ENV.setParam( 'OutputFlag', 0 )      # To Turn-off Logs
ENV.start()

#4] RESULTS OUTPUT DIRECTORY
full_output_directory   = "/content/drive/MyDrive/CIFAR_Dataset/CIFAR_10/resnet_result.xlsx"  # Second part is the name of the file. USE .xlsx in the end to save the result as excel file.

#**Hyper Local Search**

##**Loading Data**

In [4]:
# Enter training, validation and testing dataset directories for CIFAR-10 datasets.
def preprocess_image_input(input_images): # Only used when resent50 is selected as model_type below
  input_images = input_images.astype('float32')
  output_ims = tf.keras.applications.resnet50.preprocess_input(input_images)
  return output_ims

def data_loader( train_dir, val_dir, test_dir, model_type ): # model_type : 'cnn' OR 'resnet50' ( Please specify only one of these in sttring format )

    train_dataset = np.load(train_dir)
    val_dataset   = np.load(val_dir)
    test_dataset  = np.load(test_dir)

    y_train = train_dataset['y_train'].astype("float32")
    y_val   = val_dataset['y_val'].astype("float32") 
    y_test  = test_dataset['y_test'].astype("float32") 

    x_train = train_dataset['x_train'].astype("float32")
    x_val   = val_dataset['x_val'].astype("float32")
    x_test  = test_dataset['x_test'].astype("float32") 

    if model_type == 'cnn':
        x_train, x_val, x_test = x_train/255, x_val/255, x_test/255
    elif model_type == 'resnet50':
        x_train = preprocess_image_input(x_train)
        x_val = preprocess_image_input(x_val)
        x_test = preprocess_image_input(x_test)
    elif model_type not in ['cnn', 'resnet50']:
        raise ValueError('Error: Please enter correct \'model_type\' variable value. Correct values are \'cnn\' or \'resnet50\' (strings).')
    return x_train, x_val, x_test, y_train, y_val, y_test

## ============================================  LOADING DATA  =====================================================
x_train, x_val, x_test, y_train, y_val, y_test = data_loader( train_dir, val_dir, test_dir, model_type='resnet50' )

input_shape  = x_train.shape[-3:]
output_shape = 10

####**ResNet50 Model** 

In [5]:
class CNN:
    def __init__(self, input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes
    def generate_model(self, layer_info = None ):
        UpSampling = 224/self.input_shape[0]

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Input(shape=self.input_shape))
        model.add( tf.keras.layers.UpSampling2D(size=(UpSampling,UpSampling)))
        model.add(tf.keras.applications.resnet50.ResNet50(include_top = False, weights = 'imagenet', input_shape = (224, 224, 3),  pooling = 'avg'))
        if layer_info is not None:
            for layer in layer_info[:-1]:
                layer_params = layer['params']
                if layer['type'] == 'dense':
                    model.add(tf.keras.layers.Dense(**layer_params))
                    model.add(tf.keras.layers.BatchNormalization())
            for layer in layer_info[-1:]:
                layer_params = layer['params']
                if layer['type'] == 'dense':
                    model.add(tf.keras.layers.Dense(**layer_params))
        model.layers[1].trainable = False
        # Calculate the number of trainable parameters in the model
        trainable_count = sum(tf.keras.backend.count_params(weights) for weights in model.trainable_weights)
        print(f"Trainable parameters: {trainable_count:,}")
        return model

In [6]:
def layer_information( output_classes, dense_units, dense_kernel_regularizers=None, output_kernel_regularizer=None, include_flatten=True ):
    layers = []
    if include_flatten: layers.append({'type': 'flatten', 'params': {}})
    if dense_kernel_regularizers != None:
        for units,lamda in zip(dense_units,dense_kernel_regularizers):
            layers.append( {'type': 'dense', 'params': {'units': units, 'activation': 'relu', 'kernel_regularizer':tf.keras.regularizers.l2(lamda)}} )
    else:
        for units in dense_units:
            layers.append( {'type': 'dense', 'params': {'units': units, 'activation': 'relu'}} )
    if output_kernel_regularizer == None:
        layers.append( {'type': 'dense', 'params': {'units': output_classes, 'activation': 'softmax'}} )
    else: layers.append( {'type': 'dense', 'params': {'units': output_classes, 'activation': 'softmax', 'kernel_regularizer':tf.keras.regularizers.l2(output_kernel_regularizer)}} )

    return layers

###**Loading OPTUNA trained models**

In [7]:
# LOADING OPTUNA TRAINED MODELS
with open(OPTUNA_MODEL_DIRECTORY, "rb") as fout:
        list__ = pkl.load(fout)
[ trainable_weights_list, full_weights_list, init_hyperparameters, weight_sets, grad_sets, optuna_time ] = list__

##**Getting Gradients and Hessian**

In [8]:
# WARNING : "model_" variable is used globally, so do not move the cell without editing the code.
tf.keras.backend.clear_session()
hidden_dense_layers = [16] # Defining new dense layers in the end.
# model_= NeuralNetwork( input_shape, layer_info_).generate_model()
layer_info_ = layer_information( output_shape, hidden_dense_layers, include_flatten=False )
model_= CNN( input_shape, output_shape).generate_model( layer_info_ )
model_.build(input_shape) # Unless .build is called, gradient tape watch list will be empty
model_.set_weights( full_weights_list )

# GETTING INDEXES OF TRAINABLE WEIGHTS ONLY
all_weights = model_.get_weights()
trainable_weights = model_.trainable_weights
index_set = []
for idx in range(len(all_weights)):
    weight = all_weights[idx]
    var_name = model_.weights[idx].name.split(':')[0]
    if var_name in [t.name.split(':')[0] for t in trainable_weights]:
        index_set.append(idx)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Trainable parameters: 32,986


In [9]:
@tf.function
# Layer_Weights_ToRegularize : Only give list of weights to be used on regularization term. (Do not include bias weights)
def loss_function( y_dataset, logits, Layer_Weights_ToRegularize = None,
                  Regularization_Parameters = None, loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) ): # logits = model(x_dataset)

    total_loss = loss(y_dataset, logits)
    total_loss = tf.cast( total_loss, dtype=tf.float32 )

    if Regularization_Parameters == None or Layer_Weights_ToRegularize == None:  return total_loss
    for weight, lamda in zip( Layer_Weights_ToRegularize, Regularization_Parameters ): 

        if not tf.is_tensor(weight): tf.convert_to_tensor(weight)

        regularization  = lamda * tf.reduce_sum(tf.square(weight))   # Element-wise square -> Adding all terms -> Multiply by lamda
        regularization /= tf.cast(tf.size(weight), dtype=tf.float32) # Get number of parameters (N)
        total_loss     += regularization
        
    return total_loss

In [10]:
# Extra functions required for operations during hessian approximation
def compute_outer_product(tensors1, tensors2): # USES CPU RAM

    arr1, arr2 = tensors1[0].reshape(-1), tensors2[0].reshape(-1)
    for tensor1, tensor2 in zip( tensors1[1:], tensors2[1:] ):
        arr1 = np.concatenate((arr1, tensor1), axis=None)
        arr2 = np.concatenate((arr2, tensor2), axis=None)
    return np.outer(arr1, arr2)

def compute_inner_product( tensors1, tensors2 ):

    flattened_vector1 = [tf.reshape(t, [-1]) for t in tensors1]
    flattened_vector1 = tf.concat(flattened_vector1, axis=-1)

    flattened_vector2 = [tf.reshape(t, [-1]) for t in tensors2]
    flattened_vector2 = tf.concat(flattened_vector2, axis=-1)

    flattened_shape = flattened_vector1.shape.as_list()

    matrix1 = tf.linalg.LinearOperatorFullMatrix(
        tf.reshape(flattened_vector1, [flattened_shape[0], 1])
    )
    matrix2 = tf.linalg.LinearOperatorFullMatrix(
        tf.reshape(flattened_vector2, [1,flattened_shape[0]])
    )
    return tf.linalg.matmul(matrix2, matrix1).to_dense()

# COMPUTING VALIDATION LOSS COEFFICIENTS
def gradient_validation( batch_size = 32 ):

    val_df = tf.data.Dataset.from_tensor_slices((x_val,y_val))
    val_df = val_df.shuffle(buffer_size = 1024).batch(batch_size)

    Step_Gradient, Num_batch = [], 0
    for step,(x_t,y_t) in enumerate(val_df):
        with tf.GradientTape(persistent=True) as tape:

            logits = model_(x_t)
            loss_  = loss_function( y_t, logits )

        vars_list = model_.trainable_weights
        grads = tape.gradient(loss_, vars_list)

        if step == 0 : Step_Gradient = grads
        else:
            for idx in range(len(Step_Gradient)):
                Step_Gradient[idx] =  tf.add(Step_Gradient[idx], grads[idx])
        Num_batch = step

    Step_Gradient = [ i/Num_batch for i in Step_Gradient ] 
    return Step_Gradient

# SYMMETRIC RANK-1 HESSIAN APPROXIMATION 
def sr1_hessian_approximation( weight_sets, grad_sets  ):

    trainable_count = sum(tf.keras.backend.count_params(weights) for weights in model_.trainable_weights)
    B_k             = np.identity(trainable_count, dtype = 'float32')

    weight_sets, grad_sets = weight_sets[:50], grad_sets[:50]

    total_iterations = len(weight_sets) - 1
    total_weights    = len(grad_sets[0])

    for iter in range( 1, total_iterations + 1  ):

        y_k, s_k = [], []
        for idx in range( total_weights ):

            y_k.append( (grad_sets[iter][idx] - grad_sets[iter-1][idx]).numpy() )
            s_k.append( weight_sets[iter][idx] - weight_sets[iter-1][idx] )

        arr1, arr2 = y_k[0].reshape(-1), s_k[0].reshape(-1)
        for tensor1, tensor2 in zip( y_k[1:], s_k[1:] ):
            arr1 = np.concatenate((arr1, tensor1), axis=None)
            arr2 = np.concatenate((arr2, tensor2), axis=None)
        del(y_k, s_k)
        Bk_sk = B_k.dot(arr2)
        term0 = arr1 - Bk_sk
        del(Bk_sk)
        TERM2 = np.outer( term0, term0 ) / term0.dot(arr2)
        del(term0)
        B_k  +=  TERM2
    return B_k

# using separate loss expression for regularization loss
def part_loss( regularized_weight_vars, Regularization_Parameters ):

    total_loss = tf.constant(0, dtype = tf.float32)
    for weight, lamda in zip( regularized_weight_vars, Regularization_Parameters ): 
        regularization  = lamda * tf.reduce_sum(tf.square(weight))
        regularization /= tf.cast(tf.size(weight), dtype=tf.float32)
        total_loss     += regularization

    return total_loss

def remaining_approximation( trainable_weights, Regularization_Parameters ):

    # Making the Hyperparameters tf variables (if not)
    for idx in range(len(Regularization_Parameters)):
        Regularization_Parameters[idx] = tf.Variable( Regularization_Parameters[idx] )
    
    # Getting indexes of variables being regularized based on dense layer structure. ( NOTE: This only works if only dense layers are regularized )
    Weights_Index_ToRegularize = []
    for idx in range(len(trainable_weights)):
        if len(trainable_weights[idx].shape) == 2:
            Weights_Index_ToRegularize.append(idx)

    All_Weights = [ tf.Variable(weight) for weight in trainable_weights ]
    weight_vars = [ All_Weights[idx] for idx in Weights_Index_ToRegularize]
    del(trainable_weights)

    with tf.GradientTape(persistent = True) as outer_tape:
        with tf.GradientTape() as inner_tape:
            loss_ = part_loss( weight_vars, Regularization_Parameters )
        grads = inner_tape.gradient(loss_, All_Weights, unconnected_gradients="zero")

    second_derivatives = []
    for g in grads:
        jacob = outer_tape.jacobian( g, Regularization_Parameters, unconnected_gradients ='zero') 
        second_derivatives.append( tf.convert_to_tensor(jacob) )

    # Combining the second-derivative entries
    for idx in range(len(second_derivatives)):
        tensor = second_derivatives[idx]
        if len(tensor.shape) >= 3:
            second_derivatives[idx] = tf.reshape(tensor, (tensor.shape[0], -1))
    # Concatenate matrix tensors row-wise
    second_derivatives = tf.concat(second_derivatives, axis=1)
    return second_derivatives

##**Direction Problem**

In [11]:
# Gurobi optimization
def Bilevel_Descent_Direction( GradUObj, Hessian_Follower, delta ): 

    (Rows_, Columns_)        = Hessian_Follower.shape
    Num_regularization_param = len(init_hyperparameters)
    
    # MODEL AND VARIABLE DECLARATION
    m = gp.Model(env = ENV )

    Model_Variables = m.addMVar( (Columns_), lb = -1, ub = 1, vtype = 'C' )
    # Note: Coefficients are scaled to avoid numerical issues.
    m.setObjective( 1e+6 * (GradUObj @ Model_Variables[:-Num_regularization_param]), 1 )
    m.addConstr( 1e+5 * (Hessian_Follower @ Model_Variables) <= delta*1e+5 )
    m.addConstr( 1e+5 * (Hessian_Follower @ Model_Variables) >= - delta*1e+5 )
    # OUTPUT HANDLING
    try:
        m.optimize()
        return m.X, m.ObjVal, m.Runtime
    except gp.GurobiError:
        m.computeIIS()
        m.write("IIS_System.ilp")
        return "Error in LB : GurobiError :: ", m.status

# Function to combine the hessian data and giving submatrix with random rows.
def random_constraints(percent_rows_used):

    hessian_part1     = sr1_hessian_approximation( weight_sets, grad_sets )
    remaining_columns = remaining_approximation( trainable_weights_list, init_hyperparameters )
    # hessian_full      = tf.concat( [ hessian_part1, tf.transpose(remaining_columns) ], axis = 1 )
    hessian_full      = np.concatenate((hessian_part1, remaining_columns.numpy().T),axis=1)
    del(hessian_part1, remaining_columns)
    num_hyperparams = len(init_hyperparameters)
    total_rows     = hessian_full.shape[0]

    # ======= PRE-PROCESSING ========
    imp_hessian_full = []
    for row in hessian_full:
        if list(row[-num_hyperparams:])!=[0 for i in range(num_hyperparams)]:
            imp_hessian_full.append(row)
    del(hessian_full)

    imp_hessian_full      = np.array(imp_hessian_full)
    Non_zero_rows         =  len(imp_hessian_full)
    percent_rows_remained =  Non_zero_rows/total_rows
    # print( f"\nAfter Pre-processing rows remained :: {percent_rows_remained*100} percent\n" )
    if percent_rows_remained <= percent_rows_used:
        return imp_hessian_full
    else:
        rows_asked     = percent_rows_used*total_rows
        np.random.shuffle(imp_hessian_full)     
        # Rows are shuffled. Each row remains unchanged.
        imp_hessian_full = imp_hessian_full[ : int(rows_asked), : ]

    return imp_hessian_full

def unflatten( full_weight_direction ): # Converts flattened directions into weight shapes
    result = []
    start = 0
    for param_size, shape in zip( layer_wise_params, layer_wise_shapes ):

        end = start + param_size[0]
        flat_list_params = np.array(full_weight_direction[start:end])
        start = end

        # Converting to tensor object just to use tf.reshape() function.
        flat_list_params = tf.convert_to_tensor(flat_list_params)
        flat_list_params = tf.reshape(flat_list_params, list(shape))
        result.append( flat_list_params )
    return result

def loss_value( new_weights, new_hyperparams, full_old_weights, Without_GSS = True ): # At every new point, gives loss value by using separate loss object

    new_hyperparams = [ float(p.numpy()) for p in new_hyperparams ]
    layer_info_ = layer_information( output_shape, hidden_dense_layers, dense_kernel_regularizers=new_hyperparams[:-1], output_kernel_regularizer=new_hyperparams[-1], include_flatten=False )
    # model= NeuralNetwork( input_shape, layer_info_).generate_model()
    model = CNN( input_shape, output_shape).generate_model( layer_info_)

    # Set the new weights as the model's weights. Non-trainable weights fixed as in optuna training.
    for idx, wt in zip(index_set,new_weights):
        full_old_weights[idx] = wt
    model.set_weights(full_old_weights)

    # Getting Scores
    cce = tf.keras.losses.SparseCategoricalCrossentropy() 
    y_pred_val = model.predict(x_val)
    val_loss_unregularized = cce(y_val, y_pred_val).numpy()

    if Without_GSS:
        # Remaining LOSS
        y_pred_train = model.predict(x_train)
        train_loss_unregularized = cce(y_train, y_pred_train).numpy()
        y_pred_test = model.predict(x_test)
        test_loss_unregularized = cce(y_test, y_pred_test).numpy()
        # Accuracy
        sca = tf.keras.metrics.SparseCategoricalAccuracy()
        train_acc = sca(y_train, y_pred_train).numpy()
        val_acc = sca(y_val, y_pred_val).numpy()
        test_acc = sca(y_test, y_pred_test).numpy()
        accuracy = [ train_acc, val_acc, test_acc ]
        loss     = [ train_loss_unregularized, val_loss_unregularized, test_loss_unregularized ]
        return loss, accuracy
    else:
        return val_loss_unregularized

##**Running Code**

In [12]:
# GETTING OBJECTIVE COEFFICIENTS
time1 = datetime.now()

Validation_Coefficients = gradient_validation( batch_size = 128 )
Validation_Coefficients = tf.concat( [tf.reshape(tensor, [-1]) for tensor in Validation_Coefficients], axis=0 ).numpy()

time2 = datetime.now()
coef_time = (time2 - time1).total_seconds()

In [13]:
percentage_of_submatrix = [ 0.01 ]
num_hyperparams         = len(init_hyperparameters)

trials = 1
for trial in range(trials):
    for rows_ in percentage_of_submatrix:
        # GETTING OBJECTIVE COEFFICIENTS AND CONSTRAINT MATRIX
        time1 = datetime.now()
        constraint_matrix = random_constraints(rows_) # Set percentage of hessian to be used in the direction problem
        time2 = datetime.now()
        data_collection_time = (time2 - time1).total_seconds() + coef_time

        # GETTING DIRECTIONS FROM LINEAR PROGRAM
        Directions_ = Bilevel_Descent_Direction( Validation_Coefficients, constraint_matrix, 1e-4)
        linear_problem_runtime = Directions_[-1]

        Directions_ = np.array(Directions_[0])
        Directions_ = Directions_/np.linalg.norm(Directions_)

        time1 = datetime.now()

        layer_wise_shapes = [ val.shape for val in weight_sets[0] ]  # Only trainable layers
        layer_wise_params = [ val.flatten().shape for val in weight_sets[0] ]

        Weight_directions     = unflatten( Directions_[:-num_hyperparams] )
        Hyperparam_directions = Directions_[-num_hyperparams:]

        validation_loss_ = 1e10

        def minimize_function(t,Without_GSS=False):
            hyperparams_t   = [ tf.math.add( i,t*j ) for i,j in zip( init_hyperparameters, Hyperparam_directions  ) ]
            layer_weights_t = []
            for i,j in zip( trainable_weights_list, Weight_directions ):
                new_weight = tf.math.add( tf.convert_to_tensor(i.numpy(), dtype = tf.float64),t*j )
                layer_weights_t.append( new_weight )
            if Without_GSS:
                loss, acc = loss_value( layer_weights_t, hyperparams_t, full_weights_list, Without_GSS )
                return loss, acc
            else:
                loss = loss_value( layer_weights_t, hyperparams_t, full_weights_list, Without_GSS )
                return loss

        def interval_search( init_t = 0, step = 0.1 ):
            a = init_t
            b = init_t + step

            loss_a, loss_b = minimize_function(a), minimize_function(b)
            i=0
            while loss_b < loss_a:
                i+=1
                a,loss_a = b, loss_b
                b = init_t + (2**i) * step
                loss_b = minimize_function(b)
            if i==0:
                return a,b
            elif i==1:
                return init_t,b
            else:
                return init_t + (2**(i-2)) * step, b

        def gss(tol=1e-5):
            a,b = interval_search()
            invphi = (math.sqrt(5) - 1) / 2  # 1 / phi
            invphi2 = (3 - math.sqrt(5)) / 2  # 1 / phi^2

            (a, b) = (min(a, b), max(a, b))
            h = b - a
            if h <= tol:
                return (a, b)

            # Required steps to achieve tolerance
            n = int(math.ceil(math.log(tol / h) / math.log(invphi)))

            c = a + invphi2 * h
            d = a + invphi * h
            yc = minimize_function(c)
            yd = minimize_function(d)

            for k in range(n - 1):
                if yc < yd:  # yc > yd to find the maximum
                    b = d
                    d = c
                    yd = yc
                    h = invphi * h
                    c = a + invphi2 * h
                    yc = minimize_function(c)
                else:
                    a = c
                    c = d
                    yc = yd
                    h = invphi * h
                    d = a + invphi * h
                    yd = minimize_function(d)

            if yc < yd:
                return (a, d)
            else:
                return (c, b)


        interval_ = gss(tol=1e-4 ) 
        time2 = datetime.now()

        optimal_t = (interval_[0] + interval_[1])/2
        loss, accuracy = minimize_function( optimal_t, Without_GSS=True) 
        init_loss, init_accuracy = minimize_function( 0, Without_GSS=True)
        print( "\nOptimal LOSS-> ", loss, "\nAccuracy -> ", accuracy, "\nt_star-> ", optimal_t )
        solution_search_runtime = (time2-time1).total_seconds()
        print("\n Total time taken for final improvement ::", solution_search_runtime, "\n\n")

        # ==================== Saving Results ================================
        DF = { "T_star"            : [ 0, optimal_t ],
              "Training_Loss"      : [ init_loss[0], loss[0] ],
              "Validation_Loss"    : [ init_loss[1], loss[1] ],
              "Testing_Loss"       : [ init_loss[2], loss[2] ],
              "Training_Accuracy"  : [ init_accuracy[0], accuracy[0] ],
              "Validation_Accuracy": [ init_accuracy[1], accuracy[1] ],
              "Testing_Accuracy"   : [ init_accuracy[2], accuracy[2] ],
              "Runtime"            :[ optuna_time.total_seconds(), data_collection_time + linear_problem_runtime + solution_search_runtime ]}
        
        DF = pd.DataFrame.from_dict(DF)
        DF.to_excel(full_output_directory)

        print("\n\n",DF)



Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986
Trainable parameters: 32,986

Optimal LOSS->  [0.79676545, 1.2854482, 1.324376] 
Accuracy ->  [0.76, 0.686, 0.5660909] 
t_star->  0.1668637919568067

 Total time taken for final improvement :: 77.168993 




      T_star  Training_Loss  Validation_Loss  Testing_Loss  Training_Accuracy  \
0  0.000000       0.889145         1.442169 