In [51]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [52]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. The setting has to be identical on every visible GPU, so it is
# applied to each device rather than only the first one.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Raised when the GPUs were already initialized before this cell ran;
        # report it and continue with the default allocation strategy.
        print(e)

In [53]:
class LinearDeepQNetwork(keras.Model):
    """Small fully connected Q-network.

    Maps a batch of observations to one Q-value estimate per action using a
    single hidden ReLU layer of 128 units followed by a linear output layer.
    The optimizer and loss are stored on the model so that a caller can run
    its own gradient step with them.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        n_actions: Number of discrete actions, i.e. the output layer width.
        input_dims: Shape of a single observation. Accepted for interface
            compatibility only; the first Dense layer infers its input size
            the first time the model is called.
    """

    def __init__(self,
                 lr,
                 n_actions,
                 input_dims):
        super().__init__()

        self.fc1 = layers.Dense(128, activation='relu')
        self.fc2 = layers.Dense(n_actions, activation=None)
        self.optimizer = keras.optimizers.Adam(learning_rate=lr)
        self.loss = keras.losses.MeanSquaredError()

    def call(self, inputs, training=False, **kwargs):
        """Return the Q-values for `inputs`.

        No explicit device scope is used here: TensorFlow already places the
        ops on a GPU when one is available, and the model stays usable on
        CPU-only or differently configured hosts.
        """
        hidden = self.fc1(inputs)
        return self.fc2(hidden)

In [54]:
class Agent():
    """Epsilon-greedy Q-learning agent backed by a `LinearDeepQNetwork`.

    The agent learns online from single transitions (no replay buffer and no
    target network) and linearly anneals its exploration rate after every
    learning step.

    Args:
        input_dims: Shape of a single observation.
        n_actions: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate forwarded to the Q-network's optimizer.
        initial_epsilon: Starting probability of taking a random action.
        epsilon_decay: Amount subtracted from epsilon after each learn step.
        final_epsilon: Lower bound for epsilon.
    """

    def __init__(self,
                 input_dims,
                 n_actions,
                 gamma=0.99,
                 lr=0.0001,
                 initial_epsilon=1,
                 epsilon_decay=1e-4,
                 final_epsilon=0.01):
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.gamma = gamma
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.action_space = list(range(self.n_actions))

        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)

    def choose_action(self, obs):
        """Pick an action for `obs` with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Returns:
            The chosen action index as a plain Python int.
        """
        if np.random.random() > self.epsilon:
            # Add a leading batch axis: the network expects (batch, features).
            state = tf.convert_to_tensor([obs], dtype=tf.float32)
            q_values = self.Q(state)
            return int(tf.argmax(q_values[0]).numpy())

        return int(np.random.choice(self.action_space))

    def decrement_epsilon(self):
        """Linearly decay epsilon, never going below `final_epsilon`."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

    def learn(self, state, action, reward, next_state, done=False):
        """Run one Q-learning update on a single transition.

        The TD target is `reward + gamma * max_a' Q(next_state, a')`, or just
        `reward` when `done` is true. Only the Q-value of the action that was
        actually taken is regressed towards that target.

        Args:
            state: Observation before the action.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Observation after the action.
            done: Whether `next_state` is terminal. Defaults to False, which
                always bootstraps from `next_state`.
        """
        states = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = tf.convert_to_tensor([action], dtype=tf.int32)
        rewards = tf.convert_to_tensor([[reward]], dtype=tf.float32)
        next_states = tf.convert_to_tensor([next_state], dtype=tf.float32)

        # The target is computed outside the tape so it is treated as a
        # constant: gradients must only flow through the prediction.
        q_next_max = tf.reduce_max(self.Q(next_states), axis=1, keepdims=True)
        bootstrap = 0.0 if done else self.gamma
        q_target = rewards + bootstrap * q_next_max

        with tf.GradientTape() as tape:
            q_values = self.Q(states)
            # Select Q(s, a) for the taken action; without this mask the
            # target would be broadcast against every action's Q-value.
            action_mask = tf.one_hot(actions, int(self.n_actions),
                                     dtype=q_values.dtype)
            q_pred = tf.reduce_sum(q_values * action_mask, axis=1, keepdims=True)
            loss = self.Q.loss(q_target, q_pred)

        gradients = tape.gradient(loss, self.Q.trainable_weights)
        self.Q.optimizer.apply_gradients(zip(gradients, self.Q.trainable_weights))
        self.decrement_epsilon()

In [55]:
if __name__ == '__main__':
    # Train the agent on CartPole, stopping early once the rolling average
    # return over the last `score_window` episodes exceeds `solved_score`.
    nb_episodes = 500
    score_window = 100
    solved_score = 100
    env = gym.make('CartPole-v1')
    # The buffer size is passed positionally so the call does not depend on
    # the keyword name used by the installed Gymnasium version.
    env = gym.wrappers.RecordEpisodeStatistics(env, nb_episodes)
    scores = []
    eps_history = []
    agent = Agent(input_dims=env.observation_space.shape,
                  n_actions=env.action_space.n,
                  lr=0.0001
                  )
    avg_score = 0
    for i in range(nb_episodes):
        score = 0
        done = False
        obs, _ = env.reset()
        while not done:
            action = agent.choose_action(obs)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            score += reward
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
            done = truncated or terminated
        scores.append(score)
        eps_history.append(agent.epsilon)

        # Refresh the rolling average after every episode so the early-stop
        # check below never works with a stale value.
        avg_score = np.mean(scores[-score_window:])

        if i % 100 == 0:
            print('Episode', i, ' avg score %.1f epsilon %.3f'% ( avg_score,agent.epsilon) )

        # Only stop once a full window of episodes backs the average; a
        # single lucky early episode should not end training.
        if len(scores) >= score_window and avg_score > solved_score:
            print('Episode', i, ' avg score %.1f epsilon %.3f'% ( avg_score,agent.epsilon) )
            break
        
            

Episode 0  avg score 27.0 epsilon 0.997
Episode 100  avg score 21.4 epsilon 0.783
Episode 200  avg score 22.8 epsilon 0.555
Episode 300  avg score 22.7 epsilon 0.328
Episode 400  avg score 24.7 epsilon 0.081


In [56]:
import os

# Plot a moving average of the episode returns and save it to figs/rewards.png.
rolling_length = 20

# Episode returns collected in `env.return_queue` during training.
episode_returns = np.array(env.return_queue).flatten()

# Never average over more episodes than were recorded: np.convolve in "valid"
# mode would otherwise swap its operands and return a misleading curve.
window = max(1, min(rolling_length, len(episode_returns)))

# compute and assign a rolling average of the data to provide a smoother graph
reward_moving_average = (
    np.convolve(episode_returns, np.ones(window), mode="valid") / window
)

fig, ax = plt.subplots()
ax.set_title("Episode rewards")
ax.set_xlabel("Episode (start of averaging window)")
ax.set_ylabel(f"Return (moving average over {window} episodes)")
ax.plot(range(len(reward_moving_average)), reward_moving_average)

# Save the plot to the "figs" directory, creating it if necessary.
figs_dir = "figs"
os.makedirs(figs_dir, exist_ok=True)
fig.savefig(os.path.join(figs_dir, "rewards.png"))
plt.show()

In [57]:
import matplotlib
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt

# Watch the agent play a few episodes in a live matplotlib window.
# NOTE: this rebinds `env` to a new environment created without the
# RecordEpisodeStatistics wrapper, so `env.return_queue` is no longer
# available after this cell has run.
env = gym.make('CartPole-v1', render_mode = 'rgb_array')

plt.ion()
fig, ax = plt.subplots(figsize=(8,8))
actions = ['Left','Right']
rewards = 0
num_epochs= 10

try:
    obs, info = env.reset()
    action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
    img = ax.imshow(env.render())

    for step in range(num_epochs):
        obs, info = env.reset()
        done = False
        while not done:
            # Actions are still epsilon-greedy, using the agent's current epsilon.
            action = agent.choose_action(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)

            rewards += reward
            frame = env.render()
            img.set_data(frame)
            # The overlay shows the action that was just taken.
            action_text.set_text(f'Action: {actions[action] }')

            fig.canvas.draw()
            fig.canvas.flush_events()
            done = terminated or truncated
            obs = next_obs
finally:
    # Always release the window and the environment, even if rendering or
    # stepping raised part-way through an episode.
    plt.ioff()  # Turn off interactive mode
    plt.close(fig)
    env.close()

print(f'Average return over {num_epochs} episodes: {rewards / num_epochs:.1f}')