<a href="https://colab.research.google.com/github/ProfessorQu/Reinforcement-Learning/blob/main/Q_Learning_with_Taxi_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Q-Learning with Taxi-v3
This is a modified version of [this notebook](https://colab.research.google.com/gist/simoninithomas/466c81aa1c2a07dd14793240c6d033c5/q-learning-with-taxi-v3.ipynb#scrollTo=WlJYOh0yBHZO), which is a notebook created by Thomas Simonini. I am currently following his tutorials so I thought I would give my own spin on his notebook.

## Implementation
I basically moved everything into one class: TaxiAgent, so that it is easier to have multiple agents, train more, etc.
I also removed some unnecessary code and changed the incredibly complicated formula:

```python
epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate*episode)
```

And simply replaced it with:

```python
if self.epsilon <= self.min_epsilon:
  self.epsilon = self.min_epsilon
else:
  self.epsilon *= self.epsilon_decay
```

If you want to check out the course on your own it's available here: [Deep Reinforcement Learning Course](https://simoninithomas.github.io/deep-rl-course/)

In [2]:
import numpy as np
import gym

In [3]:
class TaxiAgent(object):
  """
  Tabular Q-learning agent with an ε-greedy exploration policy.

  The agent stores a Q-table of shape (state_space, action_space). It expects
  an environment whose `reset()` returns a state index and whose
  `step(action)` returns `(new_state, reward, done, info)`.
  """

  def __init__(self, state_space, action_space,
               learning_rate, gamma, epsilon, min_epsilon, epsilon_decay):
    """
    Initialize all the hyperparameters and the Q-table

    state_space:   number of discrete states (rows of the Q-table)
    action_space:  number of discrete actions (columns of the Q-table)
    learning_rate: step size used in the Q-table update
    gamma:         discount factor applied to the next state's best Q-value
    epsilon:       initial probability of picking a random action
    min_epsilon:   lower bound that epsilon is never allowed to go below
    epsilon_decay: factor epsilon is multiplied by at the start of each episode
    """

    self.action_space = action_space

    self.learning_rate = learning_rate
    self.gamma = gamma

    self.epsilon = epsilon
    self.min_epsilon = min_epsilon
    self.epsilon_decay = epsilon_decay

    self.Q = np.zeros((state_space, action_space))

  def _choose_action(self, state):
    """
    Choose an action with the ε-greedy algorithm:
    a uniformly random action with probability epsilon,
    otherwise the action with the highest Q-value for this state
    """

    if np.random.uniform(0, 1) < self.epsilon:
      return np.random.randint(self.action_space)

    return np.argmax(self.Q[state])

  def train(self, env, episodes, steps):
    """
    Train the agent with env,
    For episodes amount of episodes,
    With each episode having a maximum of steps amount of steps

    Updates self.Q and self.epsilon in place; returns nothing.
    """
    # Start the episodes
    for _ in range(episodes):
      state = env.reset()

      # Decay epsilon once per episode. Clamping AFTER the multiplication
      # guarantees epsilon never drops below min_epsilon, not even for a
      # single episode.
      self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

      # Start the episode
      for _ in range(steps):
        # Get an action
        action = self._choose_action(state)

        # Get the s', r, T, and info (new state, reward, terminal, and info)
        new_state, reward, done, _ = env.step(action)

        # Update the Q-table towards the one-step bootstrapped target
        target = reward + self.gamma * np.max(self.Q[new_state])
        self.Q[state, action] += self.learning_rate * (
            target - self.Q[state, action])

        # If the episode is terminal, stop
        if done:
          break

        state = new_state

  def test(self, env, episodes, steps):
    """
    Test the agent on env,
    For episodes amount of episodes,
    With each episode having a maximum of steps amount of steps

    Always acts greedily with respect to the Q-table, renders every step,
    closes env when finished and prints the average reward per episode.
    """
    total_reward = 0

    # Start the episodes
    for episode in range(episodes):
      state = env.reset()
      episode_reward = 0

      print("=" * 20)
      print(f"EPISODE {episode}")
      print("=" * 20)

      # Start the episode
      for _ in range(steps):
        # Render
        env.render()

        # Choose the greedy action
        action = np.argmax(self.Q[state])

        # Get the s', r, T, and info (new state, reward, terminal, and info)
        new_state, reward, done, _ = env.step(action)
        episode_reward += reward

        # If the episode is terminal, stop
        if done:
          break

        state = new_state

      # Count the episode whether it terminated or ran out of steps, so
      # unfinished episodes are not silently dropped from the average
      total_reward += episode_reward

    env.close()

    # Average over the number of episodes that were run (not the last loop
    # index); report NaN when no episode was run at all
    average_reward = total_reward / episodes if episodes > 0 else float("nan")
    print(f"Score over time: {average_reward}")

In [4]:
# Build the Taxi-v3 environment and draw its current grid
env = gym.make("Taxi-v3")
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+



In [9]:
# Build the Q-learning agent, sized to the environment's discrete
# state/action spaces, with every hyperparameter spelled out by name
agent = TaxiAgent(
    state_space=env.observation_space.n,
    action_space=env.action_space.n,
    learning_rate=0.01,
    gamma=0.99,
    epsilon=1.0,
    min_epsilon=0.001,
    epsilon_decay=0.1,
)

In [18]:
# Run Q-learning: 25000 episodes, each capped at 1000 steps
agent.train(env, episodes=25000, steps=1000)

In [19]:
# Evaluate the greedy policy: 100 rendered episodes, each capped at 200 steps
agent.test(env, episodes=100, steps=200)

[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
|R: | : :[35mG[0m|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
EPISODE 52
+---------+
|[35mR[0m: | : :[43mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|[35mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+--