In [1]:
# In order to force the reload of modules
# %load_ext autoreload
# %autoreload 2

# In order to make the import of local modules
import sys
sys.path.append('../..')

import abc
from collections import *
from dataclasses import dataclass
import enum
import gym
import heapq
import numpy as np
import pandas as pd
import random
from typing import *

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.tensorboard import SummaryWriter

from ml.rl.core import *

%matplotlib inline
import imageio
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
"""
Showing information about the environment
"""


with gym.make("MountainCar-v0") as env:
    print("observation shape:", env.reset().shape)
    print("action space:", {env.action_space.sample() for _ in range(100)})

observation shape: (2,)
action space: {0, 1, 2}


In [12]:
"""
Helpers to try out some random agents
"""


class PolicyAgent(Agent):
    def __init__(self, net):
        self.net = net
        
    def get_action(self, state: np.ndarray):
        action_values = self.net(torch.FloatTensor(state))
        _, i = torch.max(action_values, dim=-1)
        return i.item()


def try_agent(agent: Agent, show=True):
    with gym.make("MountainCar-v0") as env:
        try_agent_on(env, agent, show=show)


with gym.make("MountainCar-v0") as env:
    try_agent_on(env, RandomAgent(env))

Total reward -200.00


In [4]:
"""
Cross Entropy Method:
- Start with a random policy
- Play N episodes with the current policy
- Take the episodes above a reward boundary (typically percentile 70th)
- Train on these "Elite" episodes (throw away the uninteresting ones)

The problem here is that it will never make progress, because we never have episodes
with something else than -200 as reward. We could change the way the reward is done
but this is not the case here.

DOOMED TO FAIL
"""


pass

In [5]:
"""
IMITATION LEARNING
"""


pass # TODO

In [23]:
"""
Neural net to learn the policy
"""


class QValueNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(2, 100), nn.ReLU(), nn.Linear(100, 3))

    def forward(self, xs):
        return self.fc(xs)
    
    def clone(self):
        cloned = QValueNet()
        cloned.load_state_dict(self.state_dict())
        return cloned

    
"""
Q-Learning training loop
"""


class ProgressMonitor:
    def __init__(self):
        self.writer = SummaryWriter(comment='mountain-car-v0')
        self.episode = 0
        self.success_count = 0
    
    def track(self, values: Dict[str, Union[int, float]]):
        for name, value in values.items():
            self.writer.add_scalar(name, value, self.episode)
        self.episode += 1
    
    def close(self):
        self.writer.close()


def epsilon_greedy_action(env: gym.Env, net: nn.Module, state: torch.FloatTensor,
                          epsilon: float) -> Tuple[int, float]:
    action_values = net(state)
    if np.random.rand(1) < epsilon:
        action = env.action_space.sample()
        return action, action_values[action].item()
    else:
        action_value, action = torch.max(action_values, dim=-1)
        return action.item(), action_value.item()

    
def get_next_state_value(target_net: nn.Module, next_state: torch.FloatTensor):
    action_values = target_net(next_state)
    max_q, _ = torch.max(action_values, dim=-1)
    return max_q.item()


def sample_minibatch(net: nn.Module, replay_buffer):
    observations, action_indices, target_values = replay_buffer.sample(size=8)
    current_values = net(observations)
    action_indices = action_indices.unsqueeze(dim=-1)
    current_values = torch.gather(current_values, dim=-1, index=action_indices).squeeze(dim=-1)
    return current_values, target_values


def train_q_learning(
    env: gym.Env, net: nn.Module,
    episodes: int, discount: float,
    learning_rate: float, learning_rate_decay: float,
    weight_decay: float,
    epsilon: float, epsilon_decay: float):
    
    net.train()
    target_net = net.clone()
    target_net.eval()
        
    count_success = 0
    furthest_distance = float('-inf')
    monitor = ProgressMonitor()
    
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=learning_rate_decay)
    
    replay_buffer = PrioritizedReplayBuffer(max_size=1_000)

    for episode_nb in prange(episodes):
        episode_loss = 0.
        episode_reward = 0.
        
        done = False
        state = torch.FloatTensor(env.reset())
        while not done:            
            action, action_value = epsilon_greedy_action(env, net, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.FloatTensor(next_state)
            
            # Compute the target action value
            target_value = reward + 0. if done else discount * get_next_state_value(target_net, next_state)
            replay_buffer.add(state, action, prev_value=action_value, value=target_value)
            
            # Sample a mini-batch from the replay buffer and apply the bellman update
            if len(replay_buffer) >= 500:
                current_values, target_values = sample_minibatch(net, replay_buffer)
                loss = criterion(current_values, target_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                episode_loss += loss.item()
            
            # Moving to next state
            episode_reward += reward
            state = next_state
        
        # Switching the model every now and then
        if (episode_nb + 1) % 100 == 0:
            target_net = net.clone()
        
        # Update the hyper-parameters on success
        if episode_reward > -200.:
            count_success += 1
            epsilon *= epsilon_decay
            scheduler.step()

        # Record history
        furthest_distance = max(furthest_distance, state[0])
        monitor.track({
            'Episode/reward': episode_reward,
            'Episode/successes_ratio': count_success / (episode_nb + 1),
            'Episode/furthest': furthest_distance,
            'Training/loss': episode_loss,
            'Training/epsilon': epsilon})
    
    monitor.close()

    
"""
Training
"""

with gym.make("MountainCar-v0") as env:
    q_net = QValueNet()
    train_q_learning(
        env, net=q_net, episodes=2000, discount=0.99,
        learning_rate=1e-3, learning_rate_decay=0.9, weight_decay=0.0,
        epsilon=0.3, epsilon_decay=0.95)

[A2000/2000 (100.00%) - 6.36 it/s352/2000 (67.60%) - 5.00 it/s1469/2000 (73.45%) - 4.99 it/s


In [28]:
"""
Trying the Q learning agent
"""


with gym.make("MountainCar-v0") as env:
    try_agent_on(env, PolicyAgent(q_net))

Total reward -150.00
