In [2]:
import time
import flappy_bird_gym


In [3]:
# Creation of the environment
env = flappy_bird_gym.make("FlappyBird-v0")

In [4]:
env.reset() # reset the environment to a new random state
env.render() 

In [5]:
print("Action Space {}". format(env.action_space))
print("State Space {}.". format(env.observation_space))

Action Space Discrete(2)
State Space Box(-inf, inf, (2,), float32).


* Action Space Discrete(2):
This indicates that the action space of your environment consists of two discrete actions.
The actions are typically represented by integers, and in this case, the agent can choose between two distinct actions.
For example, in the context of Flappy Bird, these actions might represent "flap" and "do nothing".
* State Space Box(-inf, inf, (2,), float32):
This indicates that the observation space (or state space) of your environment is continuous, represented by a multidimensional array.
The observation space has two dimensions, as indicated by the (2,).
The values in each dimension can range from negative infinity to positive infinity (-inf to inf), as specified by -inf, inf.
The values within the observation space are represented as floating-point numbers (float32).

In [6]:
from IPython.display import clear_output
from time import sleep

frames = [] # for animation

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        #print(frame['frame'].getvalue())
        print(f"{frame['frame']}")
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
print_frames(frames)

In [7]:
import numpy as np
# q_table = np.zeros([2, np.inf])

In [8]:
import numpy as np

# Discretize the observation space
n_bins = [3000, 3000]  # Number of bins for each dimension
n_actions = 2  # Number of discrete actions

# Define the range for each dimension of the observation space
obs_low = env.observation_space.low
obs_high = env.observation_space.high

# Calculate the bin widths for each dimension
bin_widths = (obs_high - obs_low) / n_bins

# Create the Q-table with the appropriate dimensions
n_states = tuple((n+1 for n in n_bins))
q_table = np.zeros(n_states + (n_actions,))
print("Q-table shape:", q_table.shape)


Q-table shape: (3001, 3001, 2)


In [15]:
import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()
    state= state[0]

    epochs, penalties, reward, = 0, 0, 0
    done = False


    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values
        

        next_state, reward, done, truncate, info = env.step(action) 
  
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
  
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        #print(state[0], action, new_value)
        q_table[state,action] = new_value
        #print(state[0], action, new_value)
        #print(reward)
        
        if reward == -10:
            penalties += 1

        state= next_state
        epochs += 1
        #print(state)
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

: 

In [10]:
q_table

array([[[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       ...,

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]]])

In [None]:
#env.close()

In [None]:

"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
episodes = 100

framesql = [] # for animation

for _ in range(episodes):
    state = env.reset()
    state= state[0]
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1
        
        epochs += 1

    total_penalties += penalties
    total_epochs += epochs
    # Put each rendered frame into dict for animation
    framesql.append({
    'frame': env.render(),
    'state': state,
    'action': action,
    'reward': reward,
    'epoch': epochs   
        }
    )

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.87
Average penalties per episode: 0.0
