# Deep Learning with PyTorch


Author: [Anand Saha](http://teleported.in/)

### 6. Reinforcement Learning - Balance cartpole using DQN

### Exploring Gym environment

* To install: https://gym.openai.com/docs/
* In short, run `pip install gym`

In [None]:
# Test if importable
import gym

In [None]:
# Print every entry currently held in gym's environment registry
from gym import envs

for registered in envs.registry.all():
    print(registered)

In [None]:
# Let's try to load the cartpole environment.
# `env` stays bound at notebook level: the next cell prints its action and
# observation spaces.
import gym
env = gym.make('CartPole-v0')
env.reset()   # begin an episode; the returned initial observation is not kept here
env.render()  # draw the current frame (presumably needs a display -- confirm on headless machines)

In [None]:
# Inspect the action space and the state (observation) space.
# A Discrete space holds a fixed range of non-negative integers, so the valid actions here are 0 and 1.
# A Box space is an n-dimensional box, so each valid observation is an array of 4 numbers.

print(env.action_space, env.observation_space, sep="\n")

In [None]:
# Play 20 episodes with randomly sampled actions, at most 100 steps each
import gym
env = gym.make('CartPole-v0')
for i_episode in range(20):
    observation = env.reset()
    for steps_taken in range(1, 101):
        # env.render()
        print(observation)
        # Sample an action from the action space and advance the simulation one step
        observation, reward, done, info = env.step(env.action_space.sample())
        if not done:
            continue
        print("Episode finished after {} timesteps".format(steps_taken))
        break

### Balancing the cartpole

Adapted from https://github.com/keon/deep-q-learning/blob/master/dqn.py

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import gym
import numpy as np
from collections import deque
import random

**Build the neural network that maps a state to one Q-value estimate per action**

In [2]:
class DQN(nn.Module):
    """Fully connected network: input -> 120 ReLU units -> output.

    Args:
        input_dim: Width of the input vector.
        output_dim: Width of the output vector (the agent in this notebook
            passes the number of actions).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.linear1 = nn.Linear(input_dim, 120)
        self.linear2 = nn.Linear(120, output_dim)

    def forward(self, input):
        """Return ``linear2(relu(linear1(input)))`` without a final activation."""
        hidden = F.relu(self.linear1(input))
        return self.linear2(hidden)

**Build the agent**

In [3]:
class DQNAgent:
    """Epsilon-greedy agent that trains a ``DQN`` from a replay memory.

    Transitions ``(state, action, reward, next_state, done)`` are kept in a
    bounded deque; ``replay`` samples from it and fits the network, one
    transition at a time, towards ``reward + gamma * max Q(next_state)``.

    Args:
        state_size: Width of an observation vector (network input size).
        action_size: Number of discrete actions (network output size).
    """

    # Default file used by save() and load().
    DEFAULT_CHECKPOINT = "./6.cartpole.pth"

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # appending beyond 2000 drops the oldest entry
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        # Our DQN
        self.model = DQN(state_size, action_size)
        self.criteria = nn.MSELoss()
        self.opt = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Observation with a leading batch axis of one, e.g. shape
                ``(1, state_size)``; only row 0 of the network output is used.

        Returns:
            int: A random action with probability ``epsilon``, otherwise the
            index of the largest network output.
        """
        # Strict '<': rand() lies in [0, 1), so epsilon == 0 never explores.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            q_values = self.model(torch.as_tensor(state, dtype=torch.float32))
        return int(np.argmax(q_values.numpy()[0]))  # returns action

    def replay(self, batch_size):
        """Train on ``batch_size`` sampled transitions, then decay epsilon once.

        Each sampled transition gets its own optimiser step. The regression
        target equals the current prediction except at the taken action, so
        only that output contributes to the loss.

        Raises:
            ValueError: From ``random.sample`` when ``batch_size`` exceeds the
                number of stored transitions.
        """
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best next-state value; no gradient needed.
                with torch.no_grad():
                    next_q = self.model(
                        torch.as_tensor(next_state, dtype=torch.float32))
                target = reward + self.gamma * next_q[0].max().item()

            self.opt.zero_grad()
            out = self.model(torch.as_tensor(state, dtype=torch.float32))
            # One forward pass serves as both prediction and target template.
            target_q = out.detach().clone()
            target_q[0, int(action)] = target
            loss = self.criteria(out, target_q)
            loss.backward()
            self.opt.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, path=DEFAULT_CHECKPOINT):
        """Write the network weights (``state_dict``) to ``path``."""
        torch.save(self.model.state_dict(), path)

    def load(self, path=DEFAULT_CHECKPOINT):
        """Load network weights from ``path`` into the existing model.

        The weights are copied into ``self.model`` in place rather than into a
        freshly built network, so ``self.opt`` keeps pointing at the parameters
        that are actually used and training can continue after loading.
        """
        self.model.load_state_dict(torch.load(path))

**Train it**

In [None]:
# Train the agent on CartPole-v1, then write its weights to disk.
EPISODES = 3000
batch_size = 32

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False

for episode in range(EPISODES):
    # Observations are kept as (1, state_size) rows throughout
    state = np.reshape(env.reset(), [1, state_size])

    for step in range(500):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        if done:
            # The step that ends the episode is stored with a penalty
            reward = -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        if not done:
            continue
        print("episode: {}/{}, score: {}, e: {:.2}"
              .format(episode, EPISODES, step, agent.epsilon))
        break

    # One replay pass per episode, once the memory holds more than a batch
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

agent.save()

**Use the trained Agent**

In [4]:
# Run one rendered episode with the saved weights and no exploration.
import time
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
agent.epsilon = 0. # Important: with zero exploration the agent follows the network's argmax
agent.load()

try:
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    for t in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        # Recorded the same way as in training; nothing in this cell reads the memory back
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        time.sleep(0.1)  # slow the loop down so the rendering can be followed
        if done:
            print("score: {}, e: {:.2}"
                  .format(t, agent.epsilon))
            break
finally:
    # Release the render window even if the episode is interrupted
    env.close()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
score: 226, e: 0.0


**Homework**

* Run for more episodes
* Alter the DQN
* Try out a different environment
* Play with the hyperparameters (batch size, memory length, etc.)