In [1]:
### Import the required libraries
import numpy as np
import scipy
import matplotlib.pyplot as plt
import cmocean

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore")

import tensorflow as tf
import tensorflow.keras as keras
from keras import metrics
import innvestigate

import os
from os.path import join
import sys

import xarray as xr
import xmitgcm
from xmitgcm import open_mdsdataset
import ecco_v4_py as ecco

import random

# See if GPUs are available
from keras import backend as K
if bool(K._get_available_gpus()):
    print("Running on GPU")
else:
    print("Running on CPU")

# Append to sys.path the absolute path to src/XAIRT
path_list = os.path.abspath('').split('/')
path_src_XAIRT = ''
for link in path_list[:-1]:
    path_src_XAIRT = path_src_XAIRT+link+'/'
sys.path.append(path_src_XAIRT+'/src')

# Now import module XAIRT
from XAIRT import *

### https://stackoverflow.com/questions/36288235/how-to-get-stable-results-with-tensorflow-setting-random-seed ###
### https://keras.io/examples/keras_recipes/reproducibility_recipes/ ###
SEED = 42
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

# Enable eager execution for OI using trainOI_useGradientTape
# tf.compat.v1.disable_eager_execution()
tf.compat.v1.enable_eager_execution()

2024-01-29 12:23:34.174264: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-29 12:23:35.659296: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38221 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:21:00.0, compute capability: 8.0
2024-01-29 12:23:35.662370: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38221 MB memory:  -> device: 1, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:81:00.0, compute capability: 8.0
2024-01-29 12:23:35.664508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/rep

Running on GPU


## Basic example of optimization with Adam

In [2]:
x = tf.Variable(tf.convert_to_tensor(np.array([1.,-1.])))
with tf.GradientTape() as g:
    g.watch(x)
    loss = tf.norm(x)
# This has to be outside the with statement for efficiency, unless you want higher order derivatives.
grads = g.gradient(loss, x)
# Apply gradient using in-built tf optimizer
adam = keras.optimizers.Adam(learning_rate=0.1)
for i in range(10):
    adam.apply_gradients(zip([grads], [x]))
    print(x)

<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.90000045, -0.90000045])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.80000076, -0.80000076])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.70000102, -0.70000102])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.60000124, -0.60000124])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.50000144, -0.50000144])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.40000162, -0.40000162])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.30000179, -0.30000179])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.20000194, -0.20000194])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 0.10000209, -0.10000209])>
<tf.Variable 'Variable:0' shape=(2,) dtype=float64, numpy=array([ 2.23251777e-06, -2.23251777e-06])>


## Optimal input for Classification problem using Adam

In [3]:
N = 1000000
X = np.random.rand(N,2)
y = np.dot(X,np.array([1,2]))[:,np.newaxis]
oneHot = np.zeros([y.shape[0],2])
oneHot[:,0] = ((y[:,0] >= 1.) & (y[:,0] <= 2.))
oneHot[:,1] = ((y[:,0] < 1.) | (y[:,0] > 2.))

np.sum(oneHot[:,0]), np.sum(oneHot[:,1])

(500041.0, 499959.0)

In [4]:
Layers = [{'size': X.shape[1], 'activation': None     , 'use_bias': None},
          {'size': 8         , 'activation': 'relu'   , 'use_bias': True},
          {'size': 8         , 'activation': 'relu'   , 'use_bias': True},
          {'size': 2         , 'activation': 'softmax', 'use_bias': True}]
Losses = [{'kind': 'categorical_crossentropy', 'weight': 1.0}]

NNkwargs = {'losses': Losses, 'metrics': ['mae'],
            'batch_size': 2048, 'epochs': 5, 'validation_split': 0.2,
            'filename': 'model_simpleTests_OI', 'dirname': os.path.abspath(''),
            'random_nn_seed': 42}

# learning rate schedule
def step_decay(epoch):
    initial_lrate = 0.01
    drop = 0.5
    epochs_drop = 25
    lrate = initial_lrate * drop**np.floor((1+epoch)/epochs_drop)
    return lrate

keras.backend.clear_session()
sgd = keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
NNkwargs['optimizer'] = sgd

K = TrainFullyConnectedNN(X, oneHot, layers = Layers, **NNkwargs)
best_model = K.quickTrain(step_decay)
oneHot_NN = best_model.predict(X)

2024-01-29 12:23:37.859015: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.



Epoch 1: val_loss improved from inf to 0.71864, saving model to /home1/07665/shrey911/XAIRT/examples/model_simpleTests_OI.h5

Epoch 2: val_loss improved from 0.71864 to 0.69563, saving model to /home1/07665/shrey911/XAIRT/examples/model_simpleTests_OI.h5

Epoch 3: val_loss improved from 0.69563 to 0.67233, saving model to /home1/07665/shrey911/XAIRT/examples/model_simpleTests_OI.h5

Epoch 4: val_loss improved from 0.67233 to 0.61536, saving model to /home1/07665/shrey911/XAIRT/examples/model_simpleTests_OI.h5

Epoch 5: val_loss improved from 0.61536 to 0.56010, saving model to /home1/07665/shrey911/XAIRT/examples/model_simpleTests_OI.h5


Class 1: $x_1 + 2x_2 \in [1,2]$, Class 2 is the opposite.

Gradient descent might be too slow. Let's try to use the Adam optimizer instead. Note that this requires eager execution to be enabled right at the start of program execution.

In [8]:
x = tf.Variable(tf.convert_to_tensor(np.array([[0.0,0.1]])))
dl = tf.Variable(tf.convert_to_tensor(np.array([[1.,0.]])))
bce = tf.keras.losses.BinaryCrossentropy()
adam = keras.optimizers.Adam(learning_rate=0.01)
sgd = keras.optimizers.SGD(learning_rate=0.01)

for i in range(2000):
    with tf.GradientTape() as g:
        g.watch(x)
        preds = best_model(x)
        loss = bce(dl, preds)
    # This has to be outside the with statement for efficiency, unless you want higher order derivatives.
    grads = g.gradient(loss, x)
    # Apply gradient using in-built tf optimizer

    sgd.apply_gradients(zip([grads], [x]))
    x_numpy = x.numpy()
    print(x_numpy)

[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.  0.1]]
[[0.

In [6]:
model = keras.models.clone_model(best_model)
inp = tf.Variable(tf.convert_to_tensor(np.array([[5.0,10.0]])))
desired_labels = tf.Variable(tf.convert_to_tensor(np.array([[1.,0.]])))
bce = tf.keras.losses.BinaryCrossentropy()
adam = keras.optimizers.Adam(learning_rate=0.01)
sgd = keras.optimizers.SGD(learning_rate=0.01)

for i in range(2000):
    loss_val = TrainOI_useGradientTape(model, inp, desired_labels, bce, sgd)
    if (i+1) % 1 == 0:
        print(f"Iter {i+1}: OI {inp}")

Iter 1: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.9971961 , 10.00151415]])>
Iter 2: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.99439219, 10.0030283 ]])>
Iter 3: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.99158829, 10.00454245]])>
Iter 4: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.98878559, 10.00605606]])>
Iter 5: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.98598289, 10.00756967]])>
Iter 6: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.9831802 , 10.00908328]])>
Iter 7: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.9803775 , 10.01059689]])>
Iter 8: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.97757588, 10.01211004]])>
Iter 9: OI <tf.Variable 'Variable:0' shape=(1, 2) dtype=float64, numpy=array([[ 9.97477568, 10.01362186]])>
Iter 10: OI <tf.Variable 'Va