In [63]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import numpy as np
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict # allows access to undefined keys
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt

In [64]:
class LunarLanderAgent:
    def __init__(self,
                 learning_rate: float,
                 initial_epsilon: float,
                 epsilon_decay: float,
                 final_epsilon: float,
                 discount_factor: float = 0.95,
                 discrete_actions: int = 4):
        
        self.lr = learning_rate
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.discount_factor = discount_factor
        self.discrete_actions = discrete_actions
        
        # Initialize Q-table
        self.q_values = defaultdict(lambda: np.zeros(self.discrete_actions))
        
        self.training_error = []
    
    def discretize_state(self, state):
        # Round each value in the state to 1 decimal place
        # Convert to tuple for hashability
        # print(state)
        # Append the original terminated flag (boolean)
        return tuple(np.append(state, state[-1])) 
    
    def choose_action(self, state):
        discretized_state = self.discretize_state(state)
        
        if np.random.random() < self.epsilon:
            return np.random.randint(self.discrete_actions)
        else:
            return int(np.argmax(self.q_values[discretized_state]))
    
    def update_q_values(self, state, action, reward, terminated, next_state):
        state = self.discretize_state(state)
        # print('state shape : ', state[0].shape)
        next_state = self.discretize_state(next_state)
        
        if not terminated:          
            future_q_value = np.max(self.q_values[next_state])
        else:
            future_q_value = 0
        temporal_difference = (reward + (self.discount_factor * future_q_value)) - self.q_values[state][action]
        self.q_values[state][action] += self.lr * temporal_difference
        self.training_error.append(temporal_difference)
        
    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

In [65]:
learning_rate = 1
n_episodes = 100
start_epsilon = 1
epsilon_decay = 0.99
final_epsilon = 0.05

agent = LunarLanderAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    final_epsilon=final_epsilon,
    epsilon_decay=epsilon_decay,
    
    
)

In [66]:
env = gym.make("ALE/Breakout-v5", render_mode='rgb_array')


In [67]:
print(f'action space shape : {env.action_space.n}') # Number of possible actions is 4
print(f'observation space shape : {env.observation_space}') 
#-------------- obesrvation is a tupe of 3 values : --------------
#1) player cards value
#2) dealer's face up card
#3) usable ace for player, equal 1 if ace is considered an 11 without busting

print(f'reward range : {env.reward_range}') # default reward range is set to -inf +inf
# print(f'\nEnv specs : {env.spec}') 
print(f'\nEnv metadata : {env.metadata}') # render_modes adn render_fps

action space shape : 4
observation space shape : Box(0, 255, (210, 160, 3), uint8)
reward range : (-inf, inf)

Env metadata : {'render_modes': ['human', 'rgb_array'], 'obs_types': {'grayscale', 'ram', 'rgb'}}


In [68]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
env = gym.wrappers.TimeLimit(env, max_episode_steps=60)

rewards = 0 
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False
    # play one episode
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward
        # update the agent
        agent.update_q_values(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

100%|██████████| 100/100 [01:43<00:00,  1.03s/it]


In [69]:
q_values = np.array([value for key, value in agent.q_values.items()])
print(np.argmax(q_values,axis=1))


[0 0 0 ... 0 0 0]


In [70]:
print(f'total rewards = {rewards}')


total rewards = 18.0


In [73]:

# Create and wrap the environment
env = gym.make("ALE/Breakout-v5",render_mode='rgb_array')
# env = CustomRewardWrapper(env)

obs, info = env.reset()

plt.ion()
fig, ax = plt.subplots(figsize=(8,8))
action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
img = ax.imshow(env.render())
actions = ['Move Up','Move Right','Move Down','Move Left']
rewards = 0
num_epochs= 10
for step in range(num_epochs):
    obs, info = env.reset()
    done = False
    print(f'step {step}:  obs = {next_obs} , reward = {reward}')
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward
        if reward >10 : 
            print(f'step {step}:  obs = {next_obs} , reward = {reward}')

        frame = env.render()
        img.set_data(frame)
        action_text.set_text(f'Step: {actions[action] }')

        fig.canvas.draw()
        fig.canvas.flush_events()
        done = terminated or truncated
        obs = next_obs

plt.ioff()  # Turn off interactive mode
plt.show()  # Keep the window open after the animation finishes
plt.close()
env.close()

step 0:  obs = [[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]] , reward = 0.0


TclError: can't invoke "update" command: application has been destroyed

In [72]:
print(f'mean episode rewards = {rewards/num_epochs}')

mean episode rewards = 1.2
