In [71]:
import os, sys
import gymnasium as gym
import time

import text_flappy_bird_gym

if __name__ == '__main__':

    # Create the environment.
    env = gym.make('TextFlappyBird-v0', height = 15, width = 20, pipe_gap = 4)

    try:
        # Gymnasium's reset() returns an (observation, info) pair, so unpack
        # it instead of binding the whole tuple to `obs`.
        obs, info = env.reset()

        # Play one episode with a uniformly random policy.
        while True:

            # Select next action
            action = env.action_space.sample()  # for an agent, action = agent.policy(observation)

            # Apply action and get the new observation of the environment.
            # step() returns separate `terminated` and `truncated` flags.
            obs, reward, terminated, truncated, info = env.step(action)

            # Render the game
            os.system("clear")
            sys.stdout.write(env.render())
            time.sleep(0.2)  # pause between frames (about 5 frames per second)

            # Stop when the player is dead OR the episode was cut short;
            # stepping an environment after either flag is set is undefined.
            if terminated or truncated:
                break
    finally:
        # Release the environment even if the loop raised.
        env.close()

[H[2JText Flappy Bird!
Score: 0
----------------------
[                  [32m|[0m ]
[                    ]
[                    ]
[                    ]
[                    ]
[                  [32m|[0m ]
[                  [32m|[0m ]
[      [33m@[0m           [32m|[0m ]
[                  [32m|[0m ]
[                  [32m|[0m ]
[                  [32m|[0m ]
[                  [32m|[0m ]
[                  [32m|[0m ]
[                  [32m|[0m ]
[                  [32m|[0m ]
^^^^^^^^^^^^^^^^^^^^^^
Player Action (Idle)
Distance From Pipe (dx=12,dy=4)
[H[2JText Flappy Bird!
Score: 0
----------------------
[                 [32m|[0m  ]
[                    ]
[                    ]
[                    ]
[                    ]
[                 [32m|[0m  ]
[      [33m@[0m          [32m|[0m  ]
[                 [32m|[0m  ]
[                 [32m|[0m  ]
[                 [32m|[0m  ]
[                 [32m|[0m  ]
[                 [32m|[0m  ]


In [86]:
# Build a fresh environment and look at what reset() hands back.
env = gym.make('TextFlappyBird-v0', height = 15, width = 20, pipe_gap = 4)
# reset() returns an (observation, info) pair; unpack it so `obs` really is
# the observation rather than the whole tuple.
obs, info = env.reset()
# Print the pair together so both the observation and the info dict are visible.
print("initial obs", (obs, info))

initial obs ((13, 1), {'score': 0, 'player': [6, 7], 'distance': 13.038404810405298})


In [87]:
# Draw the current frame of the environment as text.
# `clear` is a shell command; NOTE(review): in the captured notebook output it
# shows up as a raw escape sequence rather than clearing anything.
os.system("clear")
sys.stdout.write(env.render())
time.sleep(0.2) # pause 0.2 s after drawing (about 5 frames per second)

[H[2JText Flappy Bird!
Score: 0
----------------------
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
[                    ]
[                    ]
[                    ]
[      [33m@[0m             ]
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
[                   [32m|[0m]
^^^^^^^^^^^^^^^^^^^^^^
Player Action (Idle)
Distance From Pipe (dx=13,dy=1)


In [91]:
# Advance the environment by one step with action 1 and show the new observation.
# step() returns five values; only `obs` is used here.
# NOTE(review): presumably Gymnasium's (obs, reward, terminated, truncated, info)
# order, i.e. a=reward, b=terminated, c=truncated, d=info -- confirm.
obs, a, b, c, d = env.step(1)
print("obs", obs)

obs (9, -3)


In [4]:
import scipy
import numpy as np

# Auxiliary function
def discount_cumsum(x, discount):
    """
    Compute discounted cumulative sums (returns-to-go) of a vector.

    input:
        vector x,
        [x0,
         x1,
         x2]
        discount: scalar discount factor
    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]

    The result is an array with the same length as x.
    """
    # Import the filter explicitly: a bare `import scipy` is not guaranteed to
    # expose the `scipy.signal` submodule on every SciPy version.
    from scipy.signal import lfilter

    # The recursion y[n] = x[n] + discount * y[n-1], run over the reversed
    # input, accumulates each entry with all the entries that follow it;
    # reversing again restores the original time order.
    reversed_x = x[::-1]
    filtered = lfilter([1], [1, float(-discount)], reversed_x, axis=0)
    return filtered[::-1]

In [5]:
class first_visit_MC_Evaluation():
    """
    First-visit Monte Carlo state-value estimation for integer-indexed states.

    For every state, the return observed at its first occurrence in each
    episode is stored; the value estimate is the mean of the stored returns.
    """

    def __init__(self, size=5, gamma=1):
        """
        size:  number of states; states are used as indices into self.returns.
        gamma: the discount factor applied when computing returns.
        """
        # the discount factor
        self.gamma = gamma
        # size of system
        self.size = size

        # where to save returns: one array of first-visit returns per state
        self.returns = [np.empty(0) for i in range(self.size)]

    # -------------------
    def single_episode_update(self, traj_states, traj_rew):
        """
        Uses a single trajectory to update the values, using first-visit MC.

        traj_states: sequence of integer state indices, one per time step.
        traj_rew:    sequence of rewards, aligned with traj_states.
        """
        # States already credited in this episode (set: O(1) membership).
        visited = set()

        # calculates the returns for each step: DISCOUNTed CUMulative SUM.
        # Use the instance's own discount factor, not a module-level global.
        ret = discount_cumsum(traj_rew, self.gamma)

        for t_step, s in enumerate(traj_states):

            # Only the first occurrence of a state in the episode contributes.
            if s not in visited:
                self.returns[s] = np.append(self.returns[s], ret[t_step])
                visited.add(s)

            # no need to go further: every state has been first-visited.
            # (Compare the COUNT of visited states with size; comparing a
            # shape tuple with an int is always False.)
            if len(visited) == self.size:
                break

    # -------------------
    def estimate_values(self):
        """
        Return an array of length self.size with the mean stored return of
        each state. States with no recorded return yet are NaN.
        """
        n_obs = self.size
        value = np.full(n_obs, np.nan)
        for s in range(n_obs):
            # Skip empty buckets explicitly: np.mean of an empty array also
            # gives NaN but emits a RuntimeWarning.
            if np.size(self.returns[s]) > 0:
                value[s] = np.mean(self.returns[s])
        return value
    
class learning_rate_MC_Evaluation():
    """
    Monte Carlo state-value estimation with a constant learning rate.

    Every occurrence of a state in an episode moves that state's value
    estimate a fraction lr_v of the way towards the observed return.
    """

    def __init__(self, size=5, gamma=1, lr_v=0.1):
        """
        size:  number of states; states are used as indices into self.values.
        gamma: the discount factor applied when computing returns.
        lr_v:  constant step size of the value update.
        """
        # the discount factor
        self.gamma = gamma
        # size of system
        self.size = size
        # learning rate; stored so updates use the value given here rather
        # than whatever module-level `lr_v` happens to exist
        self.lr_v = lr_v
        # current value estimates, one per state
        self.values = np.zeros(self.size)

    # -------------------
    def single_episode_update(self, traj_states, traj_rew):
        """
        Uses a single trajectory to update the values, using constant learning_rate.

        traj_states: sequence of integer state indices, one per time step.
        traj_rew:    sequence of rewards, aligned with traj_states.
        """

        # calculates the returns for each step: DISCOUNTed CUMulative SUM.
        # Use the instance's own discount factor, not a module-level global.
        ret = discount_cumsum(traj_rew, self.gamma)

        for t_step, s in enumerate(traj_states):
            # V(s) <- V(s) + lr_v * (G_t - V(s))
            self.values[s] += self.lr_v * (ret[t_step] - self.values[s])
                

In [6]:
import matplotlib.pyplot as plt
import numpy as np

# Experiment: compare first-visit MC with constant-learning-rate MC under a
# uniformly random policy, averaging the per-episode error over several runs.
#
# NOTE(review): this cell does not run against 'TextFlappyBird-v0' -- it stops
# with AttributeError on `env.observation_size` (see the traceback below).
# Several other calls here also look written for a different environment API;
# they are flagged inline.

# number of episodes per run
n_episodes = 100
# action space =         [  L,   R]
# probabilities of the two actions under the random policy
random_policy = np.array([0.5, 0.5])
# gamma (discount factor)
gamma = 1.0
# learning rate
lr_v = 0.05

# what is the real value?
# NOTE(review): five hard-coded reference values; this looks like ground truth
# for a 5-state problem, not for Flappy Bird -- confirm where they come from.
real_values = [1/6, 2/6, 3/6, 4/6, 5/6]

# error in time
# NOTE(review): not read anywhere in this cell.
empirical_error = np.empty(0)

# create environment
env = gym.make('TextFlappyBird-v0', height = 15, width = 20, pipe_gap = 4)
obs = env.reset()

# per-episode error of each method, accumulated as an average over the runs
empirical_error_MC = np.zeros(n_episodes)
empirical_error_MC2 = np.zeros(n_episodes)

# actions taken, appended across every episode of every run; not read in this cell
traj_act = np.empty(0)

# number of runs for average errors
n_average = 50

# loop over independent runs (each run starts from fresh estimators)
for i_av in range(n_average):
    
    # initialize the algorithm
    # NOTE(review): `env.observation_size` raises AttributeError on this env.
    MC_first_visit = first_visit_MC_Evaluation(env.observation_size, gamma)
    # NOTE(review): lr_v=0.1 here differs from the module-level lr_v = 0.05
    # defined above -- confirm which one is intended.
    MC_learning_rate = learning_rate_MC_Evaluation(env.observation_size, gamma, lr_v=0.1)
    
    # loop over episodes within one run
    for i in range(n_episodes):
        # states and rewards of the current episode
        traj_states = np.empty(0, dtype=int)
        traj_rew = np.empty(0)
        done = False

        env.reset()
        # NOTE(review): `current_state` is not used elsewhere in this notebook;
        # verify the environment actually provides it.
        s = env.current_state

        while not done:
            traj_states = np.append(traj_states, s)
            # NOTE(review): elsewhere the action space is sampled with
            # env.action_space.sample(); check that it is valid as the first
            # argument of np.random.choice.
            a = np.random.choice(env.action_space, p=random_policy)
            traj_act = np.append(traj_act, a)
            # NOTE(review): step() is unpacked into three values here but into
            # five values in the earlier cells of this notebook.
            new_s, r, done = env.step(a)
            traj_rew = np.append(traj_rew, r)
            #print(s, a, new_s, r, done)
            s = new_s

        # feed the finished episode to both estimators
        MC_first_visit.single_episode_update(traj_states, traj_rew)
        MC_learning_rate.single_episode_update(traj_states, traj_rew)

        values = MC_first_visit.estimate_values()
        values2 = MC_learning_rate.values

        # sqrt(d*d) is |d|, so each term is the mean absolute error against
        # real_values; dividing by n_average turns the running sum into an
        # average over runs.
        empirical_error_MC[i] += np.mean(np.sqrt((values-real_values)*(values-real_values))) / n_average
        empirical_error_MC2[i] += np.mean(np.sqrt((values2-real_values)*(values2-real_values))) /n_average
    
    #print(values, values2, empirical_error_MC[-1], empirical_error_MC2[-1]) 
    
# plot the run-averaged error of both methods against the episode index
fig, ax = plt.subplots()  # Create a figure and an axes.
ax.plot(np.arange(n_episodes), empirical_error_MC, label='first-visit MC')
ax.plot(np.arange(n_episodes), empirical_error_MC2, label='MC - learning rate')
ax.legend()

AttributeError: 'TextFlappyBirdEnvSimple' object has no attribute 'observation_size'