In [13]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [14]:
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except RuntimeError as e:
        print(e)

In [15]:
class LinearDeepQNetwork(keras.Model):
    def __init__(self,
                 lr,
                 n_actions,
                 input_dims):
        super(LinearDeepQNetwork, self).__init__()
        
        self.fc1 = layers.Dense(128, activation='relu', input_shape=input_dims)
        self.fc2 = layers.Dense( n_actions, activation=None)
        
        self.optimizer = keras.optimizers.Adam(learning_rate=lr)
        self.loss = keras.losses.MeanSquaredError()
    
    def call(self, inputs, training=False, **kwargs):
        with tf.device('/GPU:0'):  

            x = self.fc1(inputs)
            x = self.fc2(x)
            return x

In [16]:
class Agent():
    def __init__(self,
                 lr,
                 n_actions,
                 input_dims,
                 gamma=0.99,
                 initial_epsilon=1,
                 epsilon_decay=0.99,
                 final_epsilon=0.01):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.action_space = [i for i in range(self.n_actions)]
        
        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)
        
    def choose_action(self, obs):
        with tf.device('/GPU:0'):  # or '/CPU:0' if you're using CPU
            if np.random.random() > self.epsilon:
                state = tf.convert_to_tensor([obs], dtype=tf.float32)
                actions = self.Q(state)
                actions = tf.squeeze(actions)
                action = tf.argmax(actions)
                print('All Actions: ', actions)
                print('Selected Action: ', action.numpy())
            else:
                action = np.random.choice(self.action_space)
            
            return action
    
    def decrement_epsilon(self):
        self.epsilon = self.epsilon * self.epsilon_decay \
                       if self.epsilon > self.final_epsilon else self.final_epsilon
        
    @tf.function
    def learn(self, state, action, reward, next_state):
        with tf.GradientTape() as tape:
            states = tf.convert_to_tensor([state], dtype=tf.float32)
            actions = tf.convert_to_tensor([action], dtype=tf.int32)
            rewards = tf.convert_to_tensor([reward], dtype=tf.float32)
            next_states = tf.convert_to_tensor([next_state], dtype=tf.float32)
            
            q_pred = tf.gather_nd(self.Q(states), tf.expand_dims(actions, -1))
            print(f'q_pred: {q_pred.shape}')
            q_next = tf.reduce_max(self.Q(next_states), axis=1)
            print(f'q_next: {q_next.shape}')        
            q_target = rewards + self.gamma * q_next 
            print(f'q_target: {q_target.shape}')
        
            loss = self.Q.loss(q_pred, q_target)
        gradients = tape.gradient(loss, self.Q.trainable_variables)
        self.Q.optimizer.apply_gradients(zip(gradients, self.Q.trainable_variables))
        self.decrement_epsilon()

In [17]:
nadir= Agent(lr=0.1, n_actions=4, input_dims=[128,128,3])
nadir.choose_action([12,12,2])


0

In [18]:
nadir.learn(np.random.random((128,128,3)),2,2,np.random.random((128,128,3))) 

q_pred: (1, 128, 128, 4)
q_next: (1, 128, 4)
q_target: (1, 128, 4)
trainable weights : [<tf.Variable 'linear_deep_q_network_2/dense_4/kernel:0' shape=(3, 128) dtype=float32>, <tf.Variable 'linear_deep_q_network_2/dense_4/bias:0' shape=(128,) dtype=float32>, <tf.Variable 'linear_deep_q_network_2/dense_5/kernel:0' shape=(128, 4) dtype=float32>, <tf.Variable 'linear_deep_q_network_2/dense_5/bias:0' shape=(4,) dtype=float32>]
q_pred: (1, 128, 128, 4)
q_next: (1, 128, 4)
q_target: (1, 128, 4)
trainable weights : [<tf.Variable 'linear_deep_q_network_2/dense_4/kernel:0' shape=(3, 128) dtype=float32>, <tf.Variable 'linear_deep_q_network_2/dense_4/bias:0' shape=(128,) dtype=float32>, <tf.Variable 'linear_deep_q_network_2/dense_5/kernel:0' shape=(128, 4) dtype=float32>, <tf.Variable 'linear_deep_q_network_2/dense_5/bias:0' shape=(4,) dtype=float32>]
