# Instalando

In [1]:
%%capture
# %%capture silences the output of this cell (the installers are very verbose).
# Rendering dependencies: pyglet/OpenGL, ffmpeg for video encoding, and xvfb
# plus pyvirtualdisplay to provide an X display.
!pip install pyglet==1.5.1 
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
from pyvirtualdisplay import Display

# Start a non-visible 1400x900 display; presumably required so the environment
# can render frames on a headless machine such as Colab (TODO confirm).
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

In [2]:
%%capture
!pip install gym==0.23.1#0.24 # We install the newest gym version for the Taxi-v3 "rgb_array version"
!pip install pygame
!pip install numpy

!pip install huggingface_hub
!pip install pickle5
!pip install pyyaml==6.0 # avoid key error metadata
!pip install imageio imageio_ffmpeg

In [3]:
import numpy as np
import gym
import random
import imageio
import os

import pickle5 as pickle
from tqdm.notebook import tqdm

# Juguemos al Blackjack

https://www.gymlibrary.dev/environments/toy_text/blackjack/

### Action Space
There are two actions: stick (0), and hit (1).

### Observation Space
The observation consists of a 3-tuple containing: the player’s current sum, the value of the dealer’s one showing card (1-10 where 1 is ace), and whether the player holds a usable ace (0 or 1).

...

### Rewards
win game: +1

lose game: -1

draw game: 0

win game with natural blackjack:

* +1.5 (if natural is True)

* +1 (if natural is False)



In [4]:
# Create the environment registered in gym under the id "Blackjack-v1"
env = gym.make("Blackjack-v1")

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


In [5]:
env.reset()
# Print the structure of the observation space and one randomly sampled observation
print("Espacion de observación", env.observation_space)
print("Muestra", env.observation_space.sample()) # Get a random observation

_____ESPACIO DE OBSERVACIÓN_____ 

Espacion de observación Tuple(Discrete(32), Discrete(11), Discrete(2))
Muestra (28, 10, 1)


In [6]:
#@title
import itertools

# One index range per component of the observation tuple
s = [np.arange(space.n) for space in env.observation_space]

print("Hay", len(list(itertools.product(*s))), "posibles estados")
# Store the per-component sizes as a tuple rather than a generator expression:
# a generator is exhausted after its first use, so re-running the cell that
# builds the Q-table would silently produce a table with no state dimensions.
state_space = tuple(len(component) for component in s)

Hay 704 posibles estados


In [7]:
# Number of discrete actions exposed by the environment
action_space = env.action_space.n
print("Hay", action_space, "acciones posibles")

Hay 2 acciones posibles


# Preparamos la Q-Table

In [8]:
def initialize_q_table(state_space, action_space):
  """Return a zero-filled Q-table.

  Args:
    state_space: iterable with the size of each state dimension.
    action_space: number of actions; becomes the size of the last axis.

  Returns:
    A NumPy array of zeros with shape (*state_space, action_space).
  """
  table_shape = tuple(state_space) + (action_space,)
  return np.zeros(table_shape)

In [9]:
# Build the Q-table for Blackjack and display its shape
# (the state dimensions followed by the action dimension)
Qtable_blackjack = initialize_q_table(state_space, action_space)
Qtable_blackjack.shape

(32, 11, 2, 2)

# Políticas de explotación y exploración

In [11]:
def greedy_policy(Qtable, state):
  """Exploitation: return the index of the highest-valued action for `state`.

  `state` is used directly as an index into `Qtable`; the argmax is taken over
  whatever that lookup returns (the action values of the state).
  """
  action_values = Qtable[state]
  return action_values.argmax()

In [10]:
def epsilon_greedy_policy(Qtable, state, epsilon):
  """Choose an action with an epsilon-greedy rule.

  A number is drawn uniformly from [0, 1]. If it is greater than `epsilon` the
  highest-valued action for `state` is returned (exploitation); otherwise an
  action is drawn uniformly at random (exploration).

  Args:
    Qtable: array whose last axis indexes the actions.
    state: index into `Qtable` selecting the action values of the state.
    epsilon: exploration threshold.

  Returns:
    The chosen action as a plain Python int in both branches.
  """
  if random.uniform(0, 1) > epsilon:
    # Exploitation: action with the highest value in the given state
    return int(np.argmax(Qtable[state]))
  # Exploration: the last axis of the Q-table indexes the actions. Sampling
  # without a `size` argument yields a scalar instead of a 1-element array, so
  # both branches return the same kind of value.
  return int(np.random.choice(Qtable.shape[-1]))

# Entrenamiento

In [None]:
def reinforcement_learning(Qtable,state,action,new_state,learning_rate,reward,gamma,done=False):
  """Apply one Q-learning update in place and return the table.

  Q(s,a) := Q(s,a) + lr * [reward + gamma * max_a' Q(s',a') - Q(s,a)]

  Args:
    Qtable: array of action values, indexed as Qtable[state][action].
    state: index of the state the action was taken in.
    action: index of the action taken.
    new_state: index of the state reached.
    learning_rate: step size of the update.
    reward: reward received for the transition.
    gamma: discount factor applied to the value of `new_state`.
    done: when True the transition ended the episode, so the value of
      `new_state` is not bootstrapped. Defaults to False, which keeps the
      previous behaviour.

  Returns:
    The same `Qtable` object, updated.
  """
  # A terminal transition has no future return to bootstrap from
  future_value = 0.0 if done else np.max(Qtable[new_state])
  td_target = reward + gamma * future_value
  current_value = Qtable[state][action]
  Qtable[state][action] = current_value + learning_rate * (td_target - current_value)
  return Qtable

In [14]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
  """Train `Qtable` with Q-learning and an exponentially decaying epsilon.

  Note: besides its parameters, this function reads the module-level names
  `learning_rate` and `gamma`; they must be defined before it is called.

  Args:
    n_training_episodes: number of episodes to play.
    min_epsilon: lower bound of the exploration rate.
    max_epsilon: exploration rate at episode 0.
    decay_rate: exponential decay rate of epsilon per episode.
    env: environment exposing `reset()` and `step(action)`.
    max_steps: maximum number of steps per episode.
    Qtable: table of action values to update.

  Returns:
    The updated Q-table.
  """
  for episode in tqdm(range(n_training_episodes)):
    # Decay epsilon (less and less exploration as training progresses)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    # Reset the environment
    state = env.reset()
    state = state[:2] + (int(state[2]),)# third component is cast to int so the state can index the table

    for _step in range(max_steps):
      # Choose action At with the epsilon-greedy policy
      action = epsilon_greedy_policy(Qtable, state, epsilon)

      # Take action At and observe Rt+1 and St+1
      new_state, reward, done, info = env.step(bool(action))# the action is passed to the environment as a boolean
      new_state = new_state[:2] + (int(new_state[2]),)# same int cast as for the initial state

      # Update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)].
      # When the episode has ended there is no future return, so a zero
      # discount is passed to drop the bootstrap term. Otherwise the update
      # would add the value stored for whatever observation the environment
      # returns on the final step (NOTE(review): for "stick" this appears to be
      # the unchanged player state — confirm against the environment).
      step_gamma = 0.0 if done else gamma
      Qtable=reinforcement_learning(Qtable,state,action,new_state,learning_rate,reward,step_gamma)
      # If the game is over, move on to the next episode
      if done:
        break

      # The current state is now St+1
      state = new_state
  return Qtable

In [12]:
# Training parameters
# (learning_rate and gamma are read as module-level names inside train)
n_training_episodes = 10000  # Total number of training episodes
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Number of test episodes

# Environment parameters
max_steps = 99               # Maximum steps per episode
gamma = 0.95                 # Discounting rate

# Exploration parameters
max_epsilon = 1.0             # Exploration rate at the start
min_epsilon = 0.05            # Minimum exploration rate
decay_rate = 0.0005           # Exponential decay rate for exploration

In [15]:
# Train the agent; the name is rebound to the table returned by train
Qtable_blackjack = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_blackjack)

  0%|          | 0/10000 [00:00<?, ?it/s]

# Evaluación

In [16]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q):
  """Play `n_eval_episodes` greedy episodes and summarise the total rewards.

  Args:
    env: environment exposing `reset()` and `step(action)`.
    max_steps: maximum number of steps per episode.
    n_eval_episodes: number of episodes to play.
    Q: table of action values, indexed by state.

  Returns:
    (mean, standard deviation) of the per-episode total reward.
  """
  returns_per_episode = []
  for _episode in tqdm(range(n_eval_episodes)):
    observation = env.reset()
    episode_return = 0

    for _step in range(max_steps):
      # Third component is cast to int so the observation can index the table
      observation = observation[:2] + (int(observation[2]),)
      # Greedy choice: action with the highest value for this state
      best_action = np.argmax(Q[observation][:])
      observation, reward, finished, info = env.step(bool(best_action))
      episode_return += reward

      if finished:
        break
    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)

In [17]:
# Evaluate the trained Q-table and report mean +/- standard deviation of the episode reward
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_blackjack)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=-0.20 +/- 0.96


In [18]:
def record_video(env, Qtable, out_directory, fps=1):
  """Play one greedy episode and save its rendered frames as a video.

  Args:
    env: environment exposing `reset(seed=...)`, `step(action)` and
      `render(mode='rgb_array')`.
    Qtable: table of action values, indexed by state.
    out_directory: path handed to `imageio.mimsave` as the output target.
    fps: frames per second of the saved video.
  """
  finished = False
  # Reset with a random seed in [0, 500]
  state = env.reset(seed=random.randint(0,500))
  frames = [env.render(mode='rgb_array')]
  while not finished:
    # Third component is cast to int so the state can index the table
    state = state[:2] + (int(state[2]),)
    # Greedy choice: action with the highest value for this state
    chosen_action = np.argmax(Qtable[state][:])
    # The returned observation directly becomes the current state
    state, reward, finished, info = env.step(bool(chosen_action))
    frames.append(env.render(mode='rgb_array'))
  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [32]:
# Record a few videos
import uuid
for _ in range(5):
  # Each recording is written to the current directory under a uuid1-based name
  video_path=f'./{uuid.uuid1()}.mp4'
  record_video(env, Qtable_blackjack, video_path, 1)

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  "The argument mode in render method is deprecated; "


In [33]:
import glob, io, base64
from IPython import display as ipythondisplay
from IPython.display import HTML

def show_video():
  """Display every `*.mp4` file in the current directory inline.

  Files are shown in order of modification time (oldest first). Each one is
  embedded in the page as a base64 data URI under a heading with its file name.
  """
  mp4list = sorted(glob.glob('*.mp4'), key=os.path.getmtime)
  for mp4 in mp4list:
    # Open read-only; the context manager closes the handle after reading
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<h1>{1}</h1><video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'),mp4)))

In [34]:
# Render all recorded videos in the notebook
show_video()

Basado en:


https://colab.research.google.com/github/huggingface/deep-rl-class/blob/main/unit2/unit2.ipynb#scrollTo=ZNPG0g_UGCfh