In [7]:
# In order to force the reload of modules
# %load_ext autoreload
# %autoreload 2

# Make local modules importable
import sys
sys.path.append('../..')

import abc
from collections import *
from dataclasses import dataclass
import enum
import gym
import heapq
import numpy as np
import pandas as pd
import random
from typing import *

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.tensorboard import SummaryWriter

from ml.rl.core import *

%matplotlib inline
import imageio
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
"""
Showing information about the environment
"""


with gym.make("MountainCar-v0") as env:
    # The first observation of a fresh episode tells us the observation shape.
    first_observation = env.reset()
    print("observation shape:", first_observation.shape)
    # Sampling the action space many times lists the distinct actions it yields.
    seen_actions = set(env.action_space.sample() for _ in range(100))
    print("action space:", seen_actions)

observation shape: (2,)
action space: {0, 1, 2}


In [3]:
def try_agent(agent: Agent, show=True):
    """
    Play a single MountainCar-v0 episode, letting `agent` pick every action.

    agent: object exposing `get_action(env, observation)`.
    show:  when true, render the environment at each step and print the
           total reward once the episode is over.
    """
    with gym.make("MountainCar-v0") as env:
        observation = env.reset()
        episode_return = 0.0
        if show:
            env.render()
        finished = False
        while not finished:
            chosen_action = agent.get_action(env, observation)
            observation, step_reward, finished, _ = env.step(chosen_action)
            episode_return += step_reward
            if show:
                env.render()
        if show:
            print("Total reward {0:.2f}".format(episode_return))

In [4]:
"""
Random agent
"""

class RandomAgent(Agent):
    """Agent that ignores the observation and draws its action from the action space."""

    def get_action(self, env, state):
        # `state` is deliberately unused: the choice is delegated to the space sampler.
        sampled_action = env.action_space.sample()
        return sampled_action


# Baseline: play one rendered episode with randomly sampled actions.
try_agent(RandomAgent())

Total reward -200.00


In [7]:
"""
Cross Entropy Method:
- Start with a random policy
- Play N episodes with the current policy
- Take the episodes above a reward boundary (typically the 70th percentile)
- Train on these "Elite" episodes (throw away the uninteresting ones)

The problem here is that it can never make progress: in practice every episode ends
with the same reward of -200, so no episode stands out as "elite" and there is nothing
to learn from. Reshaping the reward would help, but we do not do that here.

DOOMED TO FAIL
"""

pass

In [5]:
"""
IMITATION LEARNING
"""

pass # TODO

In [52]:
"""
With Deep Q-Learning
"""


class PolicyNet(nn.Module):
    """
    Small bias-free MLP mapping an observation to one value per action.

    The defaults (2 inputs, 50 hidden units, 3 outputs) reproduce the original
    hard-coded architecture, so `PolicyNet()` is unchanged and the state-dict
    keys ('fc.0.weight', 'fc.2.weight') stay the same.
    """

    def __init__(self, observation_size: int = 2, hidden_size: int = 50, action_count: int = 3):
        """
        observation_size: number of features in an observation.
        hidden_size:      width of the single hidden layer.
        action_count:     number of actions, i.e. number of outputs.
        """
        super().__init__()
        # Remembered so that clone() can rebuild a network of the same shape.
        self.observation_size = observation_size
        self.hidden_size = hidden_size
        self.action_count = action_count
        self.fc = nn.Sequential(
            nn.Linear(observation_size, hidden_size, bias=False),
            nn.ReLU(),
            nn.Linear(hidden_size, action_count, bias=False))

    def forward(self, xs):
        """Return the action values for `xs` (last dimension = observation_size)."""
        return self.fc(xs)

    def clone(self):
        """
        Return an independent copy with the same architecture and weights.

        The copy is a freshly constructed module, so it starts in training
        mode regardless of the mode of `self`.
        """
        cloned = PolicyNet(self.observation_size, self.hidden_size, self.action_count)
        cloned.load_state_dict(self.state_dict())
        return cloned
        

    
class PolicyAgent:
    """Greedy agent: always plays the action with the highest predicted value."""

    def __init__(self, sarsa_net):
        """sarsa_net: callable mapping a state tensor to one value per action."""
        self.sarsa_net = sarsa_net

    def get_action(self, env, state):
        """
        Return the index of the highest-valued action for `state`.

        `state` may be a tensor, a NumPy array (as returned by the gym
        environment) or a plain sequence of floats; it is converted to a
        float32 tensor before being fed to the network. `env` is unused.
        """
        # Environments hand out NumPy observations, which the network cannot consume directly.
        state = torch.as_tensor(state, dtype=torch.float32)
        # Pure inference: no need to record a computation graph.
        with torch.no_grad():
            action_values = self.sarsa_net(state)
        _, best_index = torch.max(action_values, dim=-1)
        return best_index.item()

    
"""
Training utilities: replay buffer, epsilon-greedy action selection and training loop
"""


class PrioritizedReplayBuffer:
    """
    Bounded FIFO of (observation, action, target value) transitions, sampled
    with probability proportional to the absolute error recorded when each
    transition was added.
    """

    def __init__(self, max_size: int):
        """max_size: capacity; once full, the oldest transitions are evicted."""
        # Both deques share the same maxlen so transitions and priorities stay aligned.
        self.fifo = deque(maxlen=max_size)
        self.weights = deque(maxlen=max_size)

    def add(self, observation, action, prev_value, value):
        """
        Store a transition.

        observation: state tensor (stacked with the others when sampling).
        action:      index of the action taken.
        prev_value:  value predicted for that action when it was taken.
        value:       target value; |value - prev_value| is the sampling priority.
        """
        self.fifo.append((observation, action, value))
        self.weights.append(abs(value - prev_value))

    def sample(self, size) -> Tuple[torch.Tensor, torch.LongTensor, torch.FloatTensor]:
        """
        Draw `size` transitions with replacement, proportionally to priority.

        Returns the stacked observations, the actions (LongTensor) and the
        target values (FloatTensor).
        """
        priorities = self.weights
        # random.choices rejects weights whose total is zero (ValueError on
        # Python >= 3.9). That happens when every stored prediction matched
        # its target exactly; sample uniformly in that case.
        if self.fifo and sum(priorities) == 0:
            priorities = None

        observations = []
        actions = []
        values = []
        for observation, action, value in random.choices(self.fifo, weights=priorities, k=size):
            observations.append(observation)
            actions.append(action)
            values.append(value)
        return torch.stack(observations), torch.LongTensor(actions), torch.FloatTensor(values)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.fifo)
    
def epsilon_greedy_action(policy: nn.Module, state, epsilon, action_space=None) -> Tuple[int, torch.Tensor, torch.Tensor]:
    """
    Pick an action epsilon-greedily from the values predicted by `policy`.

    policy:       callable mapping `state` to one value per action.
    state:        input forwarded unchanged to `policy`.
    epsilon:      probability of exploring instead of taking the best action.
    action_space: optional object with a `sample()` method used for exploration.
                  When omitted, the exploratory action is drawn uniformly among
                  the indices of the policy's outputs, so the function no
                  longer depends on a global `env` being defined.

    Returns (action index, value of that action as a 0-dim tensor, all action values).
    """
    action_values = policy(state)
    if np.random.rand() < epsilon:
        if action_space is not None:
            action = action_space.sample()
        else:
            action = int(np.random.randint(action_values.shape[-1]))
        return action, action_values[action], action_values

    best_value, best_action = torch.max(action_values, dim=-1)
    return best_action.item(), best_value, action_values
    

def train_sarsa_learning(
    env, policy: nn.Module, episodes: int, discount: float,
    learning_rate: float, weight_decay: float,
    epsilon: float, epsilon_decay: float,
    replay_size: int = 1_000, warmup_size: int = 500,
    batch_size: int = 8, sync_every: int = 100):
    """
    Train `policy` in place with a replay buffer and a frozen target network.

    Despite the name, the bootstrap target uses the maximum value of the next
    state as predicted by the frozen copy (Q-learning style target).

    env:            gym-style environment (`reset()` / `step(action)`).
    policy:         network to optimise; must also provide `clone()`.
    episodes:       number of episodes to play.
    discount:       discount factor applied to the bootstrapped value.
    learning_rate:  SGD learning rate.
    weight_decay:   SGD weight decay.
    epsilon:        initial exploration probability.
    epsilon_decay:  factor applied to epsilon after every successful episode
                    (an episode whose total reward is above -200).
    replay_size:    capacity of the replay buffer.
    warmup_size:    number of stored transitions required before optimising.
    batch_size:     number of transitions sampled per optimisation step.
    sync_every:     number of episodes between refreshes of the frozen copy.

    Per-episode metrics are written to TensorBoard; nothing is returned.
    """
    policy.train()

    count_success = 0
    furthest_distance = float('-inf')

    criterion = nn.MSELoss()
    optimizer = optim.SGD(policy.parameters(), lr=learning_rate, weight_decay=weight_decay)
    replay_buffer = PrioritizedReplayBuffer(max_size=replay_size)

    # Frozen copy used to compute the bootstrap targets; refreshed periodically.
    frozen_policy = policy.clone()
    frozen_policy.eval()

    writer = SummaryWriter(comment='mountain-car-v0')
    try:
        # `prange` is not defined in the visible cells; presumably a
        # progress-reporting range from ml.rl.core -- TODO confirm.
        for episode in prange(episodes):
            episode_loss = 0.
            episode_reward = 0.
            episode_min_x = float('inf')
            episode_max_x = float('-inf')

            done = False
            state = torch.FloatTensor(env.reset())
            while not done:
                # Plain floats are logged rather than 0-dim tensors.
                position = state[0].item()
                episode_min_x = min(episode_min_x, position)
                episode_max_x = max(episode_max_x, position)

                action, action_value, _ = epsilon_greedy_action(policy, state, epsilon)
                next_state, reward, done, _ = env.step(action)
                next_state = torch.FloatTensor(next_state)

                # Compute the target value.
                # NOTE(review): `done` is also true when the episode is cut by a
                # time limit; such transitions are treated as terminal here, as
                # in the original code -- confirm this is intended.
                if not done:
                    # Targets are constants: no graph is needed for the frozen network.
                    with torch.no_grad():
                        next_action_values = frozen_policy(next_state)
                    max_q, _ = torch.max(next_action_values, dim=-1)
                    max_q = min(max(-200, max_q.item()), 0) # Clamp to avoid explosion?
                    target_value = reward + discount * max_q
                else:
                    target_value = reward
                replay_buffer.add(state, action, action_value.item(), target_value)

                # One optimisation step on a mini-batch once the buffer is warm.
                if len(replay_buffer) >= warmup_size:
                    observations, action_indices, target_values = replay_buffer.sample(size=batch_size)
                    current_values = policy(observations)
                    action_indices = action_indices.unsqueeze(dim=-1)
                    # Keep only the value of the action that was actually taken.
                    current_values = torch.gather(current_values, dim=-1, index=action_indices).squeeze(dim=-1)
                    optimizer.zero_grad()
                    loss = criterion(current_values, target_values)
                    loss.backward()
                    optimizer.step()
                    episode_loss += loss.item()

                # Moving to next state
                episode_reward += reward
                state = next_state

            # Switching the model every now and then
            if (episode + 1) % sync_every == 0:
                frozen_policy = policy.clone()
                frozen_policy.eval()

            # Criteria of success
            if episode_reward > -200.:
                count_success += 1
                epsilon *= epsilon_decay

            # Record history
            furthest_distance = max(furthest_distance, state[0].item())
            writer.add_scalar('Episode/reward', episode_reward, episode)
            writer.add_scalar('Episode/successes_ratio', count_success / (episode + 1), episode)
            writer.add_scalar('Episode/furthest', furthest_distance, episode)
            writer.add_scalar('Episode/distance_lo', episode_min_x, episode)
            writer.add_scalar('Episode/distance_hi', episode_max_x, episode)
            writer.add_scalar('Training/loss', episode_loss, episode)
            writer.add_scalar('Training/epsilon', epsilon, episode)
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

    
"""
Run the training
"""

# Train a freshly initialised network on MountainCar-v0.
q_net = PolicyNet()
with gym.make("MountainCar-v0") as env:
    train_sarsa_learning(
        env,
        policy=q_net,
        episodes=2000,
        discount=0.99,
        learning_rate=1e-1,
        weight_decay=0.0,
        epsilon=0.3,
        epsilon_decay=0.95,
    )

[A2000/2000 (100.00%) - 5.61 it/s
