<a href="https://colab.research.google.com/github/PTrillat/Reinforcement-Learning/blob/main/RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classe des environnements
Par compatibilité avec Gym, ces classes créeront un objet avec deux méthodes :
 - etat = jeu.reset() -> np.ndarray
 - etat, gain, fin, _ = jeu.step(action) -> Tuple[np.ndarray, float, bool, Any]

In [40]:
from collections import deque
import tqdm

In [41]:
import numpy as np
from numpy import random as rd


class Jeu2048(np.ndarray):
  """2048 board stored as an integer ndarray, exposing a Gym-like API.

  The array itself holds the tile values (0 means an empty cell).

  Attributes:
    p1: probability that only ONE tile is spawned after a move; otherwise a
      second tile is spawned when a free cell remains.
    p2: probability that a spawned tile is a 2 (otherwise it is a 4).
  """

  def __new__(cls, m, n, p1, p2):
    """Create an empty m x n board with spawn probabilities p1 and p2."""
    new = np.zeros((m, n), dtype=int).view(cls)
    new.p1 = p1
    new.p2 = p2
    return new

  def __array_finalize__(self, obj):
    """Propagate p1/p2 to views and copies derived from an existing board."""
    # NumPy calls this for every view, slice, copy or ufunc result; without
    # it such derived boards would not carry the spawn probabilities.
    if obj is None:
      return
    self.p1 = getattr(obj, 'p1', None)
    self.p2 = getattr(obj, 'p2', None)

  def reset(self):
    """Clear the board, spawn the initial tile(s) and return the state."""
    self.fill(0)
    self._peupler()
    return self._etat()

  def step(self, action):
    """Play one move and return (state, reward, done, info).

    action: 0 = left, 1 = right, 2 = up, anything else = down.
    The reward is the merge score reported by the move; new tiles are only
    spawned when the move actually changed the board.
    """
    if action == 0:
      depl, fusion = self._gauche()
    elif action == 1:
      depl, fusion = self._droite()
    elif action == 2:
      depl, fusion = self._haut()
    else:
      depl, fusion = self._bas()
    if depl:
      self._peupler()
    # NOTE: a penalty for illegal (no-op) moves was tried and disabled:
    #   else: fusion = -1000
    return self._etat(), fusion, self._fin(), 'ich bin ein kartoffel'

  def _fin(self):
    """Return True when no move in any direction can change the board."""
    # Work on a copy: a move that reports "no displacement" leaves the copy
    # untouched, so the same copy can be probed in every direction; the first
    # successful move short-circuits the remaining probes.
    copie = self.copy()
    return not (copie._gauche()[0] or copie._droite()[0]
                or copie._haut()[0] or copie._bas()[0])

  def _etat(self):
    """Return the flattened board as log2 of the tiles (empty cells -> 0)."""
    # Empty cells are replaced by 1 so that log2 yields 0 instead of -inf.
    return np.log2(self + (self == 0)).flatten()

  def _poser_tuile(self, cases_vides):
    """Pop a random free cell from cases_vides and put a 2 or a 4 there."""
    i, j = cases_vides.pop(rd.randint(len(cases_vides)))
    self[i, j] = 2 if rd.rand() < self.p2 else 4

  def _peupler(self):
    """Spawn one tile, plus a second one with probability 1 - p1.

    Does nothing when the board has no free cell.
    """
    cases_vides = list(self._cases_vides())
    if not cases_vides:
      # rd.randint(0) would raise ValueError on a full board.
      return
    self._poser_tuile(cases_vides)
    if not cases_vides or rd.rand() < self.p1:
      return
    self._poser_tuile(cases_vides)

  def _cases_vides(self):
    """Yield the (row, column) index of every empty cell, in row-major order."""
    for i, j in np.argwhere(np.asarray(self) == 0):
      yield (int(i), int(j))

  def _gauche(self):
    """Slide and merge every row to the left, in place.

    Returns (depl, fusion): depl is True when at least one tile moved or
    merged; fusion accumulates the pre-merge value of each merged tile.
    """
    depl, fusion = False, 0
    m, n = self.shape
    for i in range(m):
      # j is the destination slot being filled, k scans the cells to its right.
      j, k = 0, 1
      while j < k and k < n:
        if self[i, k] == 0:  # [j=?,...,k=0,...] nothing to move, keep scanning
          k += 1
        elif self[i, j] == 0:  # [j=0,...,k=2,...] slide the tile into the gap
          depl = True
          self[i, j] = self[i, k]
          self[i, k] = 0
          k += 1
        elif self[i, j] == self[i, k]:  # [j=2,...,k=2,...] merge equal tiles
          depl = True
          fusion += self[i, j]
          self[i, j] *= 2
          self[i, k] = 0
          # A merged tile cannot merge again during the same move.
          j += 1
          k += 1
        elif j + 1 == k:  # [j=2,k=4,...] adjacent and different: both advance
          j += 1
          k += 1
        else:  # [j=2,...,k=4,...] different with a gap: next slot is the target
          j += 1
    return depl, fusion

  def _droite(self):
    """Slide right: a left slide applied to the column-reversed view."""
    # The reversed slice is a view sharing memory, so the board is updated.
    return self[:, ::-1]._gauche()

  def _haut(self):
    """Slide up: a left slide applied to the transposed view."""
    return self.T._gauche()

  def _bas(self):
    """Slide down: an up slide applied to the row-reversed view."""
    return self[::-1, :]._haut()

from gym.envs.classic_control.cartpole import CartPoleEnv

class Zavatta(CartPoleEnv):
  """CartPole variant whose reward is 1 / (1 + s^T M s) for the new state s."""

  # Weight matrix of the quadratic form applied to the state in the reward.
  M = np.diag([1, 1, 1, 1])

  def step(self, action):
    """Run the parent step and replace its reward by the shaped one.

    The state, done flag and info returned by the parent are passed through
    unchanged.
    """
    observation, _, finished, extras = super().step(action)
    quadratic = observation @ Zavatta.M @ observation
    shaped_reward = 1/(1 + quadratic)
    return observation, shaped_reward, finished, extras

import gym

# Le plagiat j'adore ça

In [42]:
import tensorflow as tf
from typing import Any, List, Sequence, Tuple

# Wrap OpenAI Gym's `env.step` call as an operation in a TensorFlow function.
# This would allow it to be included in a callable TensorFlow graph.

# Alternative configuration (CartPole variant):
#env = Zavatta(); dim_state = 4; dim_action = 2; max_steps = 1000
# Active configuration: 4x4 game of 2048 with four possible moves.
env = Jeu2048(4, 4, 0.9, 0.9)
dim_state = env.size
dim_action = 4
max_steps = 700

def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Step the module-level `env` and return (state, reward, done) as arrays.

  The state is cast to float32, the reward to a float32 array and the done
  flag to an int32 array; the fourth value returned by `env.step` is dropped.
  """
  observation, gain, finished, _ = env.step(action)
  observation = observation.astype(np.float32)
  gain = np.array(gain, np.float32)
  finished = np.array(finished, np.int32)
  return (observation, gain, finished)

def tf_env_step(action: tf.Tensor) -> List[tf.Tensor]:
  """Wrap `env_step` with `tf.numpy_function` so it can be called on tensors.

  Returns the state (float32), reward (float32) and done flag (int32).
  """
  output_dtypes = [tf.float32, tf.float32, tf.int32]
  return tf.numpy_function(env_step, [action], output_dtypes)

In [43]:
from tensorflow.keras import layers

class ActorCritic(tf.keras.Model):
  """Actor-critic network: a shared trunk feeding an actor and a critic head.

  The trunk is two ReLU dense layers; each head is two dense layers created
  without an activation argument.
  """

  def __init__(
      self,
      common_1: int,
      common_2: int,
      actor_1: int,
      actor_2: int,
      critic_1: int,
      critic_2: int):
    """Build the layers; each argument is the unit count of the same-named layer."""
    super().__init__()
    # Shared trunk.
    self.common_1 = layers.Dense(common_1, activation="relu")
    self.common_2 = layers.Dense(common_2, activation="relu")
    # Actor head.
    self.actor_1 = layers.Dense(actor_1)
    self.actor_2 = layers.Dense(actor_2)
    # Critic head.
    self.critic_1 = layers.Dense(critic_1)
    self.critic_2 = layers.Dense(critic_2)

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Return (actor output, critic output) for a batch of inputs."""
    shared = self.common_1(inputs)
    shared = self.common_2(shared)
    actor_out = self.actor_2(self.actor_1(shared))
    critic_out = self.critic_2(self.critic_1(shared))
    return actor_out, critic_out

"""
from tensorflow.keras import layers

class ActorCritic(tf.keras.Model):
  def __init__(
      self, 
      num_common_1: int,
      num_common_2: int,
      num_common_3: int,
      num_actor_1: int,
      num_actor_2: int,
      num_actor_3: int,
      num_critic_1: int,
      num_critic_2: int,
      num_critic_3: int):
    super().__init__()
    self.common_1 = layers.Dense(num_common_1, activation="relu")
    self.common_2 = layers.Dense(num_common_2, activation="relu")
    self.common_3 = layers.Dense(num_common_3, activation="relu")
    self.actor_1 = layers.Dense(num_actor_1, activation="relu")
    self.actor_2 = layers.Dense(num_actor_2, activation="relu")
    self.actor_3 = layers.Dense(num_actor_3, activation="relu")
    self.critic_1 = layers.Dense(num_critic_1, activation="relu")
    self.critic_2 = layers.Dense(num_critic_2, activation="relu")
    self.critic_3 = layers.Dense(num_critic_3, activation="relu")

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    x = self.common_3(self.common_2(self.common_1(inputs)))
    return self.actor_3(self.actor_2(self.actor_1(x))), self.critic_3(self.critic_2(self.critic_1(x)))
"""

"""
from tensorflow.keras.layers import InputLayer, Dense

class ActorCritic(tf.keras.Model):
  
  def __init__(
      self,
      num_common: Tuple[int, ...], 
      num_actor: Tuple[int, ...], 
      num_critic: Tuple[int, ...]):
    
    super().__init__()
    state, num_common, middle = num_common[0], num_common[1:], num_common[-1]
    
    input_common = InputLayer(input_shape=(state,))
    for num in num_common:
      input_common = Dense(units=num, )

    units = [32, 16, 8]

    for unit in range(len(units)):
        inp =  tf.layers.dense(inp, units=units[unit], kernel_initializer=tf.initializers.he_uniform(),activation=tf.nn.relu,name="hidden" + str(unit + 1))
        inp = tf.layers.batch_normalization(inputs=inp, name="bn"+str(unit + 1))


    out = tf.layers.dense(inp, units=1, kernel_initializer=tf.initializers.he_uniform(), activation=None, name="out")  

    self.common = Sequential([Input(shape = (state, ))] + [Dense(num, activation='relu') for num in num_common])
    self.actor = Sequential([Input(shape = (middle, ))] + [Dense(num, activation='relu') for num in num_actor])
    self.critic = Sequential([Input(shape = (middle, ))] + [Dense(num, activation='relu') for num in num_critic])

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    x = self.common(inputs)
    return self.actor(x), self.critic(x)
"""

'\nfrom tensorflow.keras.layers import InputLayer, Dense\n\nclass ActorCritic(tf.keras.Model):\n  \n  def __init__(\n      self,\n      num_common: Tuple[int, ...], \n      num_actor: Tuple[int, ...], \n      num_critic: Tuple[int, ...]):\n    \n    super().__init__()\n    state, num_common, middle = num_common[0], num_common[1:], num_common[-1]\n    \n    input_common = InputLayer(input_shape=(state,))\n    for num in num_common:\n      input_common = Dense(units=num, )\n\n    units = [32, 16, 8]\n\n    for unit in range(len(units)):\n        inp =  tf.layers.dense(inp, units=units[unit], kernel_initializer=tf.initializers.he_uniform(),activation=tf.nn.relu,name="hidden" + str(unit + 1))\n        inp = tf.layers.batch_normalization(inputs=inp, name="bn"+str(unit + 1))\n\n\n    out = tf.layers.dense(inp, units=1, kernel_initializer=tf.initializers.he_uniform(), activation=None, name="out")  \n\n    self.common = Sequential([Input(shape = (state, ))] + [Dense(num, activation=\'relu\') f

In [44]:
def run_episode(
    initial_state: tf.Tensor,  
    model: tf.keras.Model, 
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Runs a single episode to collect training data.

  Starting from `initial_state`, repeatedly queries `model`, samples an action
  from its logits and applies it through `tf_env_step`, for at most
  `max_steps` steps or until the environment reports done.

  Returns:
    (action_probs, values, rewards): one entry per executed step, holding the
    probability of the sampled action, the critic output and the reward.
  """

  # Dynamically sized buffers, one entry written per step.
  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)

  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)
  
    # Run the model to get the action logits and the critic value
    action_logits_t, value = model(state)
  
    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store the probability of the action chosen (not its logarithm)
    action_probs = action_probs.write(t, action_probs_t[0, action])
  
    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(action)
    # tf.numpy_function outputs carry no static shape; restore the known one
    state.set_shape(initial_state_shape)
  
    # Store reward
    rewards = rewards.write(t, reward)

    # Stop as soon as the environment signals the end of the episode
    if tf.cast(done, tf.bool):
      break

  # Convert the per-step buffers into dense tensors
  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()
  
  return action_probs, values, rewards

In [45]:
def get_expected_return(
    values: tf.Tensor,
    rewards: tf.Tensor, 
    gamma: float,
    lamed: float) -> tf.Tensor:
  """Compute expected returns per timestep.

  Walking the episode backwards, each return is the blend
  `(1 - lamed) * value + lamed * (reward + gamma * next_return)`, so
  `lamed = 1` gives the plain discounted sum of rewards and `lamed = 0`
  gives the critic value itself. The result is standardized (zero mean,
  unit standard deviation) before being returned.

  Args:
    values: critic outputs, one per timestep.
    rewards: rewards, one per timestep.
    gamma: discount factor applied to the next return.
    lamed: blending weight between the critic value and the discounted return.
  """

  n = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=n)

  # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
  values = tf.cast(values[::-1], dtype=tf.float32)
  rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  discounted_sum = tf.constant(0.0)
  discounted_sum_shape = discounted_sum.shape
  for i in tf.range(n):
    value = values[i]
    reward = rewards[i]
    discounted_sum = (1-lamed)*value + lamed*(reward + gamma * discounted_sum)
    # Re-assert the scalar shape on the loop-carried value
    discounted_sum.set_shape(discounted_sum_shape)
    returns = returns.write(i, discounted_sum)
  # Undo the reversal so that returns[t] corresponds to timestep t
  returns = returns.stack()[::-1]

  # Standardize; the small epsilon avoids a division by zero
  return (returns - tf.math.reduce_mean(returns)) / (tf.math.reduce_std(returns) + 1e-10)

In [46]:
# Sum-reduced Huber loss used for the critic term of the combined loss.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,
    values: tf.Tensor,
    returns: tf.Tensor) -> tf.Tensor:
  """Return the sum of the actor loss and the critic loss.

  The actor term is minus the sum of log(action_probs) * (returns - values);
  the critic term is the Huber loss between `values` and `returns`.
  """
  # Critic: how far the value estimates are from the computed returns.
  critic_term = huber_loss(values, returns)

  # Actor: log-probabilities weighted by the advantage.
  log_probs = tf.math.log(action_probs)
  weighted = log_probs * (returns - values)
  actor_term = -tf.math.reduce_sum(weighted)

  return actor_term + critic_term

In [47]:
@tf.function
def train_step(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    gamma: float,
    lamed: float,
    max_steps_per_episode: int) -> tf.Tensor:
  """Play one episode, apply one gradient update, return the episode reward.

  The episode, the returns and the loss are all computed under the gradient
  tape; the gradients of the loss with respect to the model's trainable
  variables are then applied by `optimizer`.
  """
  with tf.GradientTape() as tape:
    # Collect the trajectory of one episode.
    probs, critic_values, rewards = run_episode(
        initial_state, model, max_steps_per_episode)

    # Targets for the critic / weights for the actor.
    targets = get_expected_return(critic_values, rewards, gamma, lamed)

    # Give every tensor a trailing axis of size 1 before computing the loss.
    probs = tf.expand_dims(probs, 1)
    critic_values = tf.expand_dims(critic_values, 1)
    targets = tf.expand_dims(targets, 1)

    loss = compute_loss(probs, critic_values, targets)

  # Back-propagate and update the parameters.
  variables = model.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return tf.math.reduce_sum(rewards)

In [None]:
%%time

# Training script: builds the model and optimizer, then runs one training
# step (one full episode) per iteration while tracking the episode rewards.
model = ActorCritic(200, 200, 50, dim_action, 50, 1)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

max_episodes = 10000

# Discount factor for future rewards
gamma = 0.99
# Blending weight passed to get_expected_return
lamed = 0.999

# Keep the rewards of the most recent episodes (window length is max_steps)
serie_gains = deque(maxlen=max_steps)
# Running mean of the episode rewards over all episodes so far
lissage_gain = 0

with tqdm.trange(1,max_episodes) as t:
  for i in t:
    initial_state = tf.constant(env.reset(), dtype=tf.float32)
    gain_episode = train_step(initial_state, model, optimizer, gamma, lamed, max_steps)
    
    gain_episode = float(gain_episode)
    # Incremental mean update; valid because i starts at 1
    lissage_gain += (gain_episode-lissage_gain)/i
    serie_gains.append(gain_episode)
      
    t.set_description(f'Episode {i}')
    t.set_postfix(episode_reward=gain_episode, running_reward=lissage_gain)

    # Early-stopping criterion, currently disabled
    #if lissage_gain > 195: break

# NOTE(review): printed unconditionally once the loop ends, whether or not any
# reward threshold was reached.
print(f'\nSolved at episode {i}: average reward: {lissage_gain:.2f}!')

Episode 1353:  14%|█▎        | 1353/9999 [05:19<52:47,  2.73it/s, episode_reward=332, running_reward=538]

In [None]:
import matplotlib.pyplot as plt

# Plot the stored episode rewards together with their cumulative mean.
plt.figure()
plt.plot(serie_gains, label='gains instantanés')
# Cumulative sum divided by 1..N gives the mean of the first k stored rewards
plt.plot(np.cumsum(serie_gains)/range(1,1+len(serie_gains)), label='gains moyens')
plt.legend()
plt.grid()
plt.show()

In [None]:
import matplotlib.animation as animation

fig = plt.figure() # initialise la figure
plt.axis('square')
plt.xlim(-2.4, 2.4)
plt.ylim(-1, 1)
line, = plt.plot([], [])

state = tf.constant(env.reset(), dtype=tf.float32)
def animate(i):
  """Advance the environment by one greedy step and redraw the pole.

  Reads and rebinds the global `state`. The new state is unpacked into four
  components, of which the first (x) and third (theta) position the line.
  """
  global state
  state = tf.expand_dims(state, 0)
  actor_out, _ = model(state)
  # Greedy choice: index of the largest actor output.
  best_action = np.argmax(np.squeeze(actor_out))
  state, _, _, _ = env.step(best_action)
  x, _, theta, _ = state
  tip_x = x+np.sin(theta)
  tip_y = np.cos(theta)
  line.set_data([x,tip_x], [0,tip_y])
  return line,

anim = animation.FuncAnimation(fig, animate, frames=300, blit=True, interval=20, repeat=True)

from IPython.display import HTML
HTML(anim.to_html5_video())

In [None]:
# Roll out one episode with the trained model, always taking the
# highest-scoring action, and print the environment after every step.
initial_state = tf.constant(env.reset(), dtype=tf.float32)
initial_state_shape = initial_state.shape
state = initial_state
for t in tf.range(max_steps):

    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)
  
    # Run the model to get the action logits and the critic value
    action_logits_t, value = model(state)
  
    # Pick the greedy action (argmax of the logits) instead of sampling
    action = tf.math.argmax(action_logits_t, 1)[0]
    
    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(action)
    # tf.numpy_function outputs carry no static shape; restore the known one
    state.set_shape(initial_state_shape)

    print(env)

    # Stop as soon as the environment signals the end of the episode
    if tf.cast(done, tf.bool):
      break