# [08] Deep Q-Learning in CartPole

### Imports & Constants

In [1]:
import gym
import ptan
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


# --- Hyperparameters ---------------------------------------------------------
HIDDEN_SIZE = 128     # width of the network's hidden layer
BATCH_SIZE = 16       # experiences sampled from the replay buffer per update
TGT_NET_SYNC = 10     # sync the target network every this many loop steps
GAMMA = 0.9           # discount factor used in the Bellman target
REPLAY_SIZE = 1_000   # capacity of the replay buffer
LR = 0.001            # learning rate handed to the optimizer
EPS_DECAY = 0.99      # epsilon is multiplied by this after each training update

<br>

### Network & Helper Function

In [3]:
class Net(nn.Module):
    '''Fully connected Q-network: Linear -> ReLU -> Linear, one output per action.'''

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # The attribute name `net` and the layer order are kept so that the
        # parameter names remain `net.0.*` and `net.2.*`.
        layers = [
            nn.Linear(in_features=obs_size, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Cast to float32 first so inputs of other dtypes (e.g. float64)
        # can be fed to the linear layers.
        as_float = x.float()
        return self.net(as_float)


@torch.no_grad()
def unpack_batch(batch, net, gamma):
    '''
    Converts a batch of experiences into the tensors needed for a Q-learning
    update.

    Each element of `batch` must expose the attributes `state`, `action`,
    `reward` and `last_state`; `last_state is None` marks a transition that
    ended its episode.

    Arguments:
    - `batch`: iterable of experiences (see above)
    - `net`: callable mapping a tensor of states to per-action Q-values;
      it is evaluated without gradient tracking
    - `gamma`: discount factor applied to the best next-state Q-value

    Returns a tuple of:
    - `states_v`: tensor of the starting states
    - `actions_v`: int64 tensor of the actions taken
    - Bellman targets: `reward + gamma * max_a Q(last_state, a)`, where the
      Q-term is zeroed for terminal transitions
    '''
    # Function-scope import: the rest of the file does not import numpy.
    import numpy as np

    states = []
    actions = []
    rewards = []
    done_masks = []
    last_states = []

    # Split the experiences into per-field lists
    for exp in batch:
        is_terminal = exp.last_state is None
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        done_masks.append(is_terminal)
        # Terminal transitions have no next state. The start state is used as
        # a placeholder so the batch stays rectangular; its Q-value is masked
        # out below.
        last_states.append(exp.state if is_terminal else exp.last_state)

    # Convert to tensors. Stacking through one numpy array avoids the slow
    # element-by-element path taken for a list of arrays.
    states_v = torch.as_tensor(np.asarray(states))
    last_states_v = torch.as_tensor(np.asarray(last_states))
    # Explicit dtypes: int64 indices for `gather`, float32 rewards so the
    # targets do not get promoted to float64 by float64 reward values.
    actions_v = torch.as_tensor(actions, dtype=torch.int64)
    rewards_v = torch.as_tensor(rewards, dtype=torch.float32)
    # An explicit bool tensor is always interpreted as a mask, whereas a plain
    # Python list of bools can be read as integer indices.
    done_mask_v = torch.as_tensor(done_masks, dtype=torch.bool)

    # Q-values of the next states and the best one per transition
    last_state_q_v = net(last_states_v)
    best_last_q_v = torch.max(last_state_q_v, dim=1)[0]

    # No future value after the episode has ended
    best_last_q_v = best_last_q_v.masked_fill(done_mask_v, 0.0)

    # Return states, actions, and the Bellman approximation
    return states_v, actions_v, best_last_q_v * gamma + rewards_v

<br> 

### Main

In [9]:
# Directories 
model_dir = 'models/'
rec_dir = 'recordings/'

# Environment info 
# The Monitor wrapper is pointed at `rec_dir`; `force=True` presumably lets it
# overwrite recordings from an earlier run (confirm against the gym docs).
env = gym.make("CartPole-v0")
env = gym.wrappers.Monitor(env, rec_dir, force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

# Network & Target Network 
net = Net(obs_size, HIDDEN_SIZE, n_actions)
tgt_net = ptan.agent.TargetNet(net)

# Selector: epsilon-greedy on top of argmax, starting fully random (epsilon=1)
selector = ptan.actions.ArgmaxActionSelector()
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1, selector=selector)

# Agent 
agent = ptan.agent.DQNAgent(net, selector)

# Experience Source 
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)

# Replay Buffer 
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)

optimizer = optim.Adam(net.parameters(), LR)
step = 0  # Loop iterations so far (incremented even while the buffer warms up)
episode = 0 
solved = False 

try:
    while True:
        step += 1

        # Pull one new experience from the experience source into the buffer
        buffer.populate(1)

        # Report every episode that finished since the last iteration;
        # an episode reward above 150 counts as solving the environment
        for reward, steps in exp_source.pop_rewards_steps():
            episode += 1
            print("Step %d: episode %d done, reward=%.3f, epsilon=%.2f" % (step, episode, reward, selector.epsilon))
            solved = reward > 150

        # If environment is solved 
        if solved:
            print("Environment solved!")
            break

        # Warm-up: do not train until the buffer holds at least two batches
        if len(buffer) < 2*BATCH_SIZE:
            continue

        # Sample `BATCH_SIZE` experiences from the buffer & unpack them 
        # (targets are computed with the target network, not the live one)
        batch = buffer.sample(BATCH_SIZE)
        states_v, actions_v, tgt_q_v = unpack_batch(batch, tgt_net.target_model, GAMMA)

        # Get Q-values of the actions that were actually taken
        optimizer.zero_grad()
        q_v = net(states_v)
        q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

        # Calculate loss & backpropagate 
        loss_v = F.mse_loss(q_v, tgt_q_v)
        loss_v.backward()
        optimizer.step()

        # Decay epsilon in action selector 
        # Note: the decay is multiplicative, so epsilon approaches but never
        # reaches 0 (after 500 training updates it is 0.99**500 ~= 0.0066)
        selector.epsilon *= EPS_DECAY

        # Sync target network weights every `TGT_NET_SYNC` loop steps
        if step % TGT_NET_SYNC == 0:
            tgt_net.sync()
finally:
    # Always close the (Monitor-wrapped) environment, including on errors or
    # a manual interrupt, so it can release what it holds open.
    env.close()

Step 15: episode 1 done, reward=14.000, epsilon=1.00
Step 28: episode 2 done, reward=13.000, epsilon=1.00
Step 43: episode 3 done, reward=15.000, epsilon=0.90
Step 77: episode 4 done, reward=34.000, epsilon=0.64
Step 94: episode 5 done, reward=17.000, epsilon=0.54
Step 109: episode 6 done, reward=15.000, epsilon=0.46
Step 126: episode 7 done, reward=17.000, epsilon=0.39
Step 136: episode 8 done, reward=10.000, epsilon=0.35
Step 146: episode 9 done, reward=10.000, epsilon=0.32
Step 160: episode 10 done, reward=14.000, epsilon=0.28
Step 171: episode 11 done, reward=11.000, epsilon=0.25
Step 183: episode 12 done, reward=12.000, epsilon=0.22
Step 195: episode 13 done, reward=12.000, epsilon=0.19
Step 207: episode 14 done, reward=12.000, epsilon=0.17
Step 218: episode 15 done, reward=11.000, epsilon=0.15
Step 227: episode 16 done, reward=9.000, epsilon=0.14
Step 237: episode 17 done, reward=10.000, epsilon=0.13
Step 247: episode 18 done, reward=10.000, epsilon=0.12
Step 257: episode 19 done

<br>