In [1]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import numpy as np
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict # allows access to undefined keys
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt

In [2]:
class LunarLanderAgent:
    def __init__(self,
                 learning_rate: float,
                 initial_epsilon: float,
                 epsilon_decay: float,
                 final_epsilon: float,
                 discount_factor: float = 0.95,
                 discrete_actions: int = 4):
        
        self.lr = learning_rate
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.discount_factor = discount_factor
        self.discrete_actions = discrete_actions
        
        # Initialize Q-table
        self.q_values = defaultdict(lambda: np.zeros(self.discrete_actions))
        
        self.training_error = []
    
    def discretize_state(self, state):
        # Round each value in the state to 1 decimal place
        # Convert to tuple for hashability
       
        rounded_state = np.round(state, 1)  # Slice to exclude last element
        # Append the original terminated flag (boolean)
        return tuple(np.append(rounded_state, state[-1])) 
    
    def choose_action(self, state):
        discretized_state = self.discretize_state(state)
        
        if np.random.random() < self.epsilon:
            return np.random.randint(self.discrete_actions)
        else:
            return int(np.argmax(self.q_values[discretized_state]))
    
    def update_q_values(self, state, action, reward, terminated, next_state):
        state = self.discretize_state(state)
        next_state = self.discretize_state(next_state)
        
        if not terminated:          
            future_q_value = np.max(self.q_values[next_state])
        else:
            future_q_value = 0
        temporal_difference = (reward + (self.discount_factor * future_q_value)) - self.q_values[state][action]
        self.q_values[state][action] += self.lr * temporal_difference
        self.training_error.append(temporal_difference)
        
    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

In [3]:
learning_rate = .1
n_episodes = 100_00
start_epsilon = 1
epsilon_decay = 0.999
final_epsilon = 0.05

agent = LunarLanderAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    final_epsilon=final_epsilon,
    epsilon_decay=epsilon_decay,
    
    
)

In [4]:
env = gym.make("LunarLander-v2", render_mode='rgb_array')


In [5]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
env = gym.wrappers.TimeLimit(env, max_episode_steps=60)

rewards = 0 
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False
    # play one episode
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward
        # update the agent
        agent.update_q_values(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

100%|██████████| 10000/10000 [01:47<00:00, 93.17it/s]


In [6]:
q_values = np.array([value for key, value in agent.q_values.items()])
print(np.argmax(q_values,axis=1))


[1 1 1 ... 2 2 2]


In [7]:
print(f'total rewards = {rewards}')


total rewards = -225771.23506791456


In [8]:

# Create and wrap the environment
env = gym.make("LunarLander-v2",render_mode='rgb_array')
# env = CustomRewardWrapper(env)

obs, info = env.reset()

plt.ion()
fig, ax = plt.subplots(figsize=(8,8))
action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
img = ax.imshow(env.render())
actions = ['Move Up','Move Right','Move Down','Move Left']
rewards = 0
num_epochs= 3
for step in range(num_epochs):
    obs, info = env.reset()
    done = False
    print(f'step {step}:  obs = {next_obs} , reward = {reward}')
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward
        if reward >10 : 
            print(f'step {step}:  obs = {next_obs} , reward = {reward}')

        frame = env.render()
        img.set_data(frame)
        action_text.set_text(f'Step: {actions[action] }')

        fig.canvas.draw()
        fig.canvas.flush_events()
        done = terminated or truncated
        obs = next_obs

plt.ioff()  # Turn off interactive mode
plt.show()  # Keep the window open after the animation finishes
plt.close()
env.close()

step 0:  obs = [ 0.31820184  1.0875041   0.30053353 -0.41212827  0.37081236  0.32681724
  0.          0.        ] , reward = 0.7095849306204058
step 1:  obs = [-0.16466483 -0.00452212 -0.06644684 -0.28559577  0.57780063 -3.7748184
  0.          0.        ] , reward = -100
step 2:  obs = [ 8.22419152e-02  1.59128048e-02 -3.84804085e-02  1.71540456e-03
  2.29143596e+00  1.15736976e-01  0.00000000e+00  1.00000000e+00] , reward = -100
step 2:  obs = [ 0.14008875  0.02842799 -0.49921054 -1.1228324   1.8841434   4.565578
  0.          1.        ] , reward = 35.696896095282796
step 3:  obs = [ 0.13448295  0.00990977 -0.39614922  0.0747238   2.0831795   1.236011
  0.          0.        ] , reward = -100
step 4:  obs = [-0.05357037  0.00526329  0.47250214 -0.18862799 -1.2630514  -5.0811567
  1.          0.        ] , reward = -100
step 5:  obs = [-0.3058497  -0.04613066 -0.4588543  -0.20637606 -1.637465    1.226955
  1.          0.        ] , reward = -100
step 5:  obs = [-0.17261915  0.0461478

TclError: can't invoke "update" command: application has been destroyed

In [None]:
print(f'mean episode rewards = {rewards/num_epochs}')

In [None]:
print(f'action space shape : {env.action_space.n}') # Number of possible actions is 4
print(f'observation space shape : {env.observation_space}') 
print(f'observation space numbers : {env.nS}') 
#-------------- obesrvation is a tupe of 3 values : --------------
#1) player cards value
#2) dealer's face up card
#3) usable ace for player, equal 1 if ace is considered an 11 without busting

print(f'reward range : {env.reward_range}') # default reward range is set to -inf +inf
# print(f'\nEnv specs : {env.spec}') 
print(f'\nEnv metadata : {env.metadata}') # render_modes adn render_fps