<a href="https://colab.research.google.com/github/R-hab/ETH_FAKE/blob/main/Smart-taxi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [3]:
!pip install -U colabgymrender
!pip install imageio==2.4.1
!pip install --upgrade AutoROM
# AutoROM --accept-license
!pip install gym[atari,accept-rom-license]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colabgymrender
  Downloading colabgymrender-1.1.0.tar.gz (3.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: colabgymrender
  Building wheel for colabgymrender (setup.py) ... [?25l[?25hdone
  Created wheel for colabgymrender: filename=colabgymrender-1.1.0-py3-none-any.whl size=3130 sha256=03391950faa528594a2cb357d59ef213d5c298562d95fbad3eb99247c7e57c1f
  Stored in directory: /root/.cache/pip/wheels/34/dc/eb/4d1d8ef70b7d696391f62606424619637bf61d6bd43f7d2298
Successfully built colabgymrender
Installing collected packages: colabgymrender
Successfully installed colabgymrender-1.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imageio==2.4.1
  Downloading imageio-2.4.1.tar.gz (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31

In [7]:
# !pip install gym[atari,accept-rom-license]==0.21.0


In [1]:
!pip install pyvirtualdisplay

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0


In [2]:
import gym
import numpy as np
import PIL.Image
from collections import namedtuple, deque

import tensorflow as tf
from pyvirtualdisplay import Display
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

import random

Display(visible=False, size=(840, 480)).start()
tf.random.set_seed(0)

env = gym.make("Taxi-v3")
env.reset()

MEMORY_SIZE = 100_000
GAMMA = 0.95
ALPHA = 0.001
NUM_STEPS_FOR_UPDATE = 4

experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

num_states = env.observation_space.n
num_actions = env.action_space.n

q_network = Sequential([
    Input(shape=num_states),
    Dense(64, activation="relu"),
    Dense(64, activation="relu"),
    Dense(num_actions, activation="linear")
])

target_q_network = Sequential([
    Input(shape=num_states),
    Dense(64, activation="relu"),
    Dense(64, activation="relu"),
    Dense(num_actions, activation="linear")
])

optimizer = Adam(learning_rate=ALPHA)

def compute_loss(experiences, gamma, q_network, target_q_network):
  states, actions, rewards, next_states, done_vals = experiences
  max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
  y_targets = rewards + (gamma * max_qsa * (1-done_vals))
  q_values = q_network(states)
  q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
  loss = MSE(y_targets, q_values)

  return loss

def update_target_network(q_network, target_q_network):
  TAU=1e-3
  for target_weights, q_network_weights in zip(target_q_network.weights, q_network.weights):
    target_weights.assign(TAU * q_network_weights + (1.0-TAU) * target_weights)

@tf.function
def agent_learn(experiences, gamma, q_network, target_q_network, optimizer):
  with tf.GradientTape() as tape:
    loss = compute_loss(experiences, gamma, q_network, target_q_network)
  gradients = tape.gradient(loss, q_network.trainable_variables)
  optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))
  update_target_network(q_network, target_q_network)

def get_action(q_values, epsilon=0):
  if random.random() > epsilon:
    return np.argmax(q_values.numpy()[0])
  else:
    return random.choice(np.arange(6))

def check_update_conditions(j, NUM_STEPS_FOR_UPDATE, memory_buffer):
  if(j+1) % NUM_STEPS_FOR_UPDATE == 0 and len(memory_buffer) > 64:
    return True
  else:
    return False

def get_experiences(memory_buffer):
    experiences = random.sample(memory_buffer, k=64)
    states = tf.convert_to_tensor(np.array([e.state for e in experiences if e is not None]),dtype=tf.float32)
    actions = tf.convert_to_tensor(np.array([e.action for e in experiences if e is not None]), dtype=tf.float32)
    rewards = tf.convert_to_tensor(np.array([e.reward for e in experiences if e is not None]), dtype=tf.float32)
    next_states = tf.convert_to_tensor(np.array([e.next_state for e in experiences if e is not None]),dtype=tf.float32)
    done_vals = tf.convert_to_tensor(np.array([e.done for e in experiences if e is not None]).astype(np.uint8),
                                     dtype=tf.float32)
    return (states, actions, rewards, next_states, done_vals)

def get_new_epsilon(epsilon):
  E_MIN = 0.01
  E_DECAY = 0.05
  return max(E_MIN, E_DECAY * epsilon)

def get_one_hot_encoding(state, next_state):

  state_arr = np.zeros(500)
  next_state_arr = np.zeros(500)

  state_arr[state] = 1
  next_state_arr[next_state] = 1
  
  return state_arr, next_state_arr


from gym.envs.toy_text.frozen_lake import generate_random_map

def train():

  NUM_EPISODES = 50000
  MAX_TIMESTEPS = 1000

  memory_buffer = deque(maxlen=MEMORY_SIZE)
  target_q_network.set_weights(q_network.get_weights())

  epsilon = 1.0

  points_history = []

  for i in range(NUM_EPISODES):

    state = env.reset()
    state, _ = get_one_hot_encoding(state, 0)
    total_points = 0

    for j in range(MAX_TIMESTEPS):

      state_qn = np.expand_dims(state, axis=0)
      q_values = q_network(state_qn)
      action = get_action(q_values, epsilon)
      next_state, reward, done, _ = env.step(action)

      _, next_state = get_one_hot_encoding(0, next_state)

      memory_buffer.append(experience(state, action, reward, next_state, done))

      update = check_update_conditions(j, NUM_STEPS_FOR_UPDATE, memory_buffer)

      if update:
        experiences = get_experiences(memory_buffer)
        agent_learn(experiences, GAMMA, q_network, target_q_network, optimizer)

      state = next_state.copy()
      total_points += reward

      if done:
        break

    points_history.append(total_points)
    avg_points = np.mean(points_history[-100:])

    epsilon = get_new_epsilon(epsilon)

    print(f"\rEpisode {i+1} | Total point average of the last {100} episodes: {avg_points:.2f}", end="")

    if (i+1) % 100 == 0:
        print(f"\rEpisode {i+1} | Total point average of the last {100} episodes: {avg_points:.2f}")

    if(avg_points >= 8):
      print(f"Environment solved in {i+1} episodes!")
      q_network.save('taxiye_model.h5')
      break

train()

  deprecation(
  deprecation(


Episode 100 | Total point average of the last 100 episodes: -211.16
Episode 200 | Total point average of the last 100 episodes: -214.76
Episode 300 | Total point average of the last 100 episodes: -205.31
Episode 400 | Total point average of the last 100 episodes: -205.76
Episode 500 | Total point average of the last 100 episodes: -205.58
Episode 600 | Total point average of the last 100 episodes: -205.67
Episode 700 | Total point average of the last 100 episodes: -204.75
Episode 800 | Total point average of the last 100 episodes: -204.31
Episode 900 | Total point average of the last 100 episodes: -206.39
Episode 1000 | Total point average of the last 100 episodes: -205.77
Episode 1100 | Total point average of the last 100 episodes: -203.46
Episode 1200 | Total point average of the last 100 episodes: -204.77
Episode 1300 | Total point average of the last 100 episodes: -187.59
Episode 1400 | Total point average of the last 100 episodes: -184.47
Episode 1500 | Total point average of the l






In [3]:
import gym
import numpy as np
import tensorflow as tf
import logging
import imageio

In [4]:
logging.getLogger().setLevel(logging.ERROR)
env = gym.make("Taxi-v3")
q_network = tf.keras.models.load_model('taxiye_model.h5')

In [8]:
def get_one_hot_encoding(state, next_state):

  state_arr = np.zeros(500)
  next_state_arr = np.zeros(500)

  state_arr[state] = 1
  next_state_arr[next_state] = 1
  
  return state_arr, next_state_arr

def create_video(filename, env, q_network, fps=30):
  video = imageio.get_writer(filename, fps=fps)
  done = False
  state = env.reset()
  frame = env.render(mode="rgb_array")
  video.append_data(frame)
  while not done:
    state, _ = get_one_hot_encoding(state, 0)
    state = np.expand_dims(state, axis=0)
    q_values = q_network(state)
    action = np.argmax(q_values.numpy()[0])
    state, _, done, _ = env.step(action)
    frame = env.render(mode="rgb_array")
    video.append_data(frame)
    for k in range(20):
      video.append_data(frame)

filename = "taxi.mp4"

create_video(filename, env, q_network)

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  self._proc.stdin.write(im.tostring())
