# [07] Deep Q-Learning in Pong

### Imports & Constants 

In [1]:
import time, datetime, collections, os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from lib import wrappers, dqn_model


# --- Environment ---------------------------------------------------------

# Gym id of the environment to train on
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"

# Mean reward boundary
MEAN_REWARD_BOUND = 19


# --- Learning ------------------------------------------------------------

# Discount factor used for the Bellman approximation
GAMMA = 0.99

# Batch size to be sampled from the replay buffer
BATCH_SIZE = 32

# Maximum size of the replay buffer
REPLAY_SIZE = 10000

LEARNING_RATE = 1e-4

# Number of frames we wait for, before starting training, to fill the replay buffer
REPLAY_START_SIZE = 10000

# Model sync frequency in frames (from training model --> to target model).
# The target model is used to get the value of the next state.
SYNC_TARGET_FRAMES = 1000


# --- Exploration ---------------------------------------------------------

# Start by selecting all actions randomly
EPSILON_START = 1.0

# Over (roughly) the first `EPSILON_DECAY_LAST_FRAME` frames, epsilon decays
# from `EPSILON_START` down to `EPSILON_FINAL`.
EPSILON_DECAY_LAST_FRAME = 150000

# Final epsilon: a random action is taken on 1% of the steps
EPSILON_FINAL = 0.01

<br> 

### Sanity-Checking

##### Calculating Conv2D Dimensions
<img src='images/conv2d_dimension_calculation.png' style="width:700px;height:400px;">

##### Testing `__init__`

In [2]:
# Sanity check for `DQN.__init__`: build the wrapped environment, read the
# observation shape and the number of actions from it, and print the resulting
# network so its layers can be inspected.
# Note: `env`, `input_shape`, `n_actions` and `model` are reused by later cells.
env = wrappers.make_env(DEFAULT_ENV_NAME)
input_shape = env.observation_space.shape  # printed further below as (4, 84, 84)
n_actions = env.action_space.n  # becomes `out_features` of the last Linear layer
model = dqn_model.DQN(input_shape, n_actions)
print(model)

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


##### Testing `_get_conv_out`

In [3]:
# Rebuild the convolutional stack by hand (same layers as printed for
# `model.conv`) to check how many features it produces for one observation.
# Note: `conv` is reused by the `forward` test cell.
conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), # (4, 84, 84) --> (32, 20, 20)
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), # (32, 20, 20) --> (64, 9, 9)
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), # (64, 9, 9) --> (64, 7, 7)
            nn.ReLU()
        )
# Push a dummy all-zero batch of size 1 through the stack to read the output size
o = conv(torch.zeros(1, *input_shape))
# Despite its name, this is the number of elements in the conv output
# (1 * 64 * 7 * 7 = 3136), i.e. the input size of the first Linear layer
num_parameters = int(np.prod(o.size()))

print(input_shape)
print(o.size())
print(num_parameters)

(4, 84, 84)
torch.Size([1, 64, 7, 7])
3136


##### Testing `forward`

In [4]:
# Sanity check for `DQN.forward`: rebuild the fully connected head by hand
# (same layers as printed for `model.fc`) and push one real observation
# through `conv` and `fc`, printing the shape after every stage.
fc = nn.Sequential(
    nn.Linear(3136, 512),  # 3136 --> 512
    nn.ReLU(),
    nn.Linear(512, n_actions),  # 512 --> 6
)

state = env.reset()

# Add a leading batch dimension: (4, 84, 84) --> (1, 4, 84, 84).
# `np.asarray` is used instead of `np.array(..., copy=False)`: on NumPy >= 2.0
# `copy=False` raises ValueError whenever a copy is unavoidable, which is
# always the case when the input is a Python list.
state_a = np.asarray([state])
state_v = torch.tensor(state_a)

# Flatten everything except the batch dimension before the linear layers
conv_out = conv(state_v).view(state_v.size()[0], -1)  # (1, 4, 84, 84) --> (1, 3136)
fc_out = fc(conv_out)  # (1, 3136) --> (1, 6)

print(state.shape)
print(state_v.shape)
print(conv_out.shape)
print(fc_out.shape)

(4, 84, 84)
torch.Size([1, 4, 84, 84])
torch.Size([1, 3136])
torch.Size([1, 6])


<br> 

### Experience Replay Buffer 

In [5]:
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'],
)


class ExperienceBuffer:
    '''
    Fixed-capacity experience replay buffer.
    Sampling from it at random breaks the correlation between subsequent steps in the environment.
    '''

    def __init__(self, capacity):
        # Bounded deque: once `capacity` items are stored, appending drops the oldest one
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        '''Adds `experience` to the replay buffer'''

        self.buffer.append(experience)

    def sample(self, batch_size):
        '''
        Draws `batch_size` distinct experiences uniformly at random from the buffer.
        Requirement: batch_size <= len(self.buffer)

        Returns a 5-tuple of arrays:
        states, actions, rewards (float32), dones (uint8), next_states
        '''

        # Random positions in [0, len(self.buffer)); no replacement, so no duplicates
        picked_positions = np.random.choice(len(self.buffer), batch_size, replace=False)
        picked = [self.buffer[position] for position in picked_positions]

        # Transpose the list of experience tuples into one sequence per field
        states, actions, rewards, dones, next_states = zip(*picked)

        batch = (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )
        return batch

##### Testing `ExperienceBuffer`

In [6]:
# Walk through the body of `ExperienceBuffer.sample` on a tiny deque of
# placeholder strings so every intermediate value can be read off the output.
buffer = collections.deque(maxlen=100)
buffer.extend([
    ("s0", "a0", "r0", "d0", "s'0"),
    ("s1", "a1", "r1", "d1", "s'1"),
    ("s2", "a2", "r2", "d2", "s'2"),
    ("s3", "a3", "r3", "d3", "s'3"),
])

batch_size = 2

# Distinct random positions, then the experiences stored at those positions
indices = np.random.choice(len(buffer), batch_size, replace=False)
sampled = [buffer[idx] for idx in indices]

# Transpose: list of 5-tuples --> five tuples, one per field
states, actions, rewards, dones, next_states = zip(*sampled)

print(indices)
print(sampled)
print(states, actions, rewards, dones, next_states)
print(*(np.array(field) for field in (states, actions, rewards, dones, next_states)))

[2 3]
[('s2', 'a2', 'r2', 'd2', "s'2"), ('s3', 'a3', 'r3', 'd3', "s'3")]
('s2', 's3') ('a2', 'a3') ('r2', 'r3') ('d2', 'd3') ("s'2", "s'3")
['s2' 's3'] ['a2' 'a3'] ['r2' 'r3'] ['d2' 'd3'] ["s'2" "s'3"]


<br> 

### Agent

In [7]:
class Agent:
    '''
    Plays the environment one step at a time with an epsilon-greedy policy
    and stores every transition in the experience replay buffer.
    '''

    def __init__(self, env, exp_buffer):
        self.env = env  # Environment
        self.exp_buffer = exp_buffer  # Replay buffer
        self._reset()  # Initialise `state` and `total_reward`

    def _reset(self):
        '''Resets the environment and the accumulated episode reward'''

        # Use the agent's own environment (not a notebook-global `env`)
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        '''
        Takes a single step in the environment and records it in the buffer.

        net: Network used to estimate the Q-values of the current state
        epsilon: Probability of taking a random action instead of the greedy one
        device: Device the state tensor is moved to before it is fed to `net`

        Returns the total reward of the episode if this step ended it, else None.
        '''

        done_reward = None

        # Epsilon-greedy policy
        if np.random.random() < epsilon:

            # Take a random action
            action = self.env.action_space.sample()

        else:

            # Get the Q-values for all possible actions and choose the best one.
            # `np.asarray` instead of `np.array(..., copy=False)`: the latter raises
            # ValueError on NumPy >= 2.0 because a list input always needs a copy.
            state_a = np.asarray([self.state])  # State with a leading batch dimension
            state_v = torch.tensor(state_a).to(device)  # State tensor
            q_vals_v = net(state_v)  # Q-values (tensor)
            _, act_v = torch.max(q_vals_v, dim=1)  # Index of the highest Q-value (tensor)
            action = int(act_v.item())  # Extract the action index

        # Take step
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        # Save experience
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)

        # Prepare for next step
        self.state = new_state

        # If the episode is over, report its total reward and start a new one
        if is_done:
            done_reward = self.total_reward
            self._reset()

        return done_reward

##### Testing `Agent`

In [8]:
# Walk through the greedy branch of `Agent.play_step` on a freshly reset
# environment, using the (untrained) `model` created in the `__init__` test.
env = wrappers.make_env(DEFAULT_ENV_NAME)
state = env.reset()

# Add a leading batch dimension before converting to a tensor.
# `np.asarray` instead of `np.array(..., copy=False)`: the latter raises
# ValueError on NumPy >= 2.0 because a list input always needs a copy.
state_a = np.asarray([state])
state_v = torch.tensor(state_a)
q_vals_v = model(state_v)  # Q-values for every action, shape (1, n_actions)
_, act_v = torch.max(q_vals_v, dim=1)  # Index of the highest Q-value
action = int(act_v.item())

# Note on the labels (kept unchanged so they match the recorded output):
# 'Action Shape' / 'Action Tensor Shape' show the shape of the batched STATE,
# and 'Highest Q-Value' shows the INDEX of the highest Q-value, not its value.
print('Action Shape: \t\t', state_a.shape)
print('Action Tensor Shape: \t', state_v.shape)
print('Q-Values: \t\t', q_vals_v.data)
print('Highest Q-Value: \t', act_v)
print('Best Action: \t\t', action)

Action Shape: 		 (1, 4, 84, 84)
Action Tensor Shape: 	 torch.Size([1, 4, 84, 84])
Q-Values: 		 tensor([[-0.0343,  0.0217, -0.0394, -0.0175,  0.0236,  0.0226]])
Highest Q-Value: 	 tensor([4])
Best Action: 		 4


<br> 

### Calculate Loss

For steps that are not at the end of the episode: 
$ L = \Big( Q(s,a) - \big(r + \gamma \max_{a' \in A} \hat{Q}(s',a') \big) \Big)^2 $  
For final steps: 
$ L = \big( Q(s, a) - r \big)^2 $

In [9]:
def calc_loss(batch, net, tgt_net, device="cpu"):
    '''
    Calculates the mean squared Bellman error for a batch of experiences

    batch:
    - Tuple `(states, actions, rewards, dones, next_states)` of array-likes
      with one entry per sampled experience

    net:
    - Network we're training
    - Used to calculate gradients

    tgt_net:
    - Target Network
    - Is periodically synced with `net`
    - Calculates values for the next states (this doesn't affect the gradients)

    device:
    - Device all tensors are moved to before the networks are evaluated

    Returns the loss as a scalar tensor.
    '''

    # Extract `states`, `actions`, `rewards`, `dones`, and `next_states`
    states, actions, rewards, dones, next_states = batch

    # Convert to tensors and move to `device`.
    # `np.asarray` replaces `np.array(..., copy=False)`, which raises ValueError on
    # NumPy >= 2.0 whenever the input cannot be used without a copy (e.g. a list).
    states_v = torch.tensor(np.asarray(states)).to(device)
    next_states_v = torch.tensor(np.asarray(next_states)).to(device)
    # `gather` requires an int64 index; NumPy integer arrays may be int32
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    # Non-zero `dones` entries mark transitions that ended an episode
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Get the Q-values of the actions that were actually taken
    # `actions_v.unsqueeze(-1)`: Tensor of indices of elements to be chosen
    # `squeeze(-1)`: Remove unnecessary dimensions
    # Note: `gather` keeps all gradients
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # No gradients are recorded here, so nothing flows into `tgt_net`
    with torch.no_grad():

        # Calculate maximum Q-values for next states, along action-dimension (1)
        # Note: `.max` calculates max values and their indices, but we only want max values, so we index [0]
        next_state_values = tgt_net(next_states_v).max(1)[0]

        # A final step has no next state, so its target is the reward alone:
        # zero out the next-state value wherever the episode was done
        next_state_values[done_mask] = 0.0

    # Bellman approximation
    expected_state_action_values = next_state_values * GAMMA + rewards_v

    # Mean squared error loss
    return nn.functional.mse_loss(state_action_values, expected_state_action_values)

`state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)`:

<img src="images/gather.png" style="height:200px">

##### Testing `calc_loss`

In [10]:
import pprint

# Walk through the tensor-conversion part of `calc_loss` on a tiny buffer of
# integer placeholders so every intermediate value can be read off the output.
buffer = ExperienceBuffer(100)
for step in range(4):
    # state == action == reward == step, never done, next state == step + 1
    buffer.append(Experience(step, step, step, False, step + 1))
batch = buffer.sample(2)

states, actions, rewards, dones, next_states = batch

states_v = torch.tensor(np.array(states, copy=False))
next_states_v = torch.tensor(np.array(next_states, copy=False))
actions_v = torch.tensor(actions)
rewards_v = torch.tensor(rewards)
done_mask = torch.BoolTensor(dones)

pprint.pprint(batch)
for label, value in (
    ('\nStates: \t', states_v),
    ('Next States: \t', next_states_v),
    ('Actions: \t', actions_v),
    ('Rewards: \t', rewards_v),
    ('Dones: \t\t', done_mask),
):
    print(label, value)

# `unsqueeze(-1)` turns the (batch,) index vector into the (batch, 1) shape used with `gather`
actions_index = actions_v.unsqueeze(-1)
print('\nactions_v.shape: \t\t', actions_v.shape)
print('actions_v.unsqueeze(-1).shape: \t', actions_index.shape)

(array([2, 1]),
 array([2, 1]),
 array([2., 1.], dtype=float32),
 array([0, 0], dtype=uint8),
 array([3, 2]))

States: 	 tensor([2, 1])
Next States: 	 tensor([3, 2])
Actions: 	 tensor([2, 1])
Rewards: 	 tensor([2., 1.])
Dones: 		 tensor([False, False])

actions_v.shape: 		 torch.Size([2])
actions_v.unsqueeze(-1).shape: 	 torch.Size([2, 1])


<br> 

### Main

In [11]:
### GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment
env = wrappers.make_env(DEFAULT_ENV_NAME)
env_name = env.unwrapped.spec.id

# TensorBoard
# NOTE(review): the ':' characters in the timestamp are not valid in Windows paths
log_dir = os.path.join('runs', datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S"))
writer = SummaryWriter(log_dir)

# Checkpoint directory; created up front so `torch.save` cannot fail on a missing folder
model_dir = 'models/'
os.makedirs(model_dir, exist_ok=True)

# Recording
# rec_dir = "./recordings/07_recording"
# recording = gym.wrappers.monitoring.video_recorder.VideoRecorder(env, base_path=rec_dir)

net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)  # (Training) Network: Calculates gradients
tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)  # Target Network: Calculates Q-values for the next states
# Start with identical weights, so the first training step does not depend on
# `REPLAY_START_SIZE` happening to be a multiple of `SYNC_TARGET_FRAMES`
tgt_net.load_state_dict(net.state_dict())
print(net, ' \n')

buffer = ExperienceBuffer(REPLAY_SIZE)  # Replay buffer
agent = Agent(env, buffer)  # Agent
epsilon = EPSILON_START  # Initial epsilon

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []  # Total reward of every finished episode
frame_idx = 0  # Number of frames (environment steps) played so far
ts_frame = 0  # Value of `frame_idx` when the previous episode ended (for the speed calculation)
ts = time.time()  # Time at which the previous episode ended (for the speed calculation)
best_m_reward = None  # Best mean reward

# `try`/`finally` makes sure the TensorBoard writer is flushed and the environment
# is closed even when training is interrupted or fails
try:
    while True:

        # Renders every single frame; NOTE(review): presumably needs a display
        # and slows training down, consider disabling for long runs
        env.render()
        # recording.capture_frame()

        # Increment number of iterations
        frame_idx += 1

        # Decrease `epsilon` linearly from `EPSILON_START`, then keep it constant at `EPSILON_FINAL`
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        # Take step using current network
        reward = agent.play_step(net, epsilon, device=device)

        # If this step is the last step in the episode
        if reward is not None:

            # Keep track of reward given
            total_rewards.append(reward)

            # Calculate speed (as number of frames per second)
            speed = (frame_idx - ts_frame) / (time.time() - ts)

            # Remember frame counter and time at the end of this episode
            ts_frame = frame_idx
            ts = time.time()

            # Mean reward for last 100 episodes
            m_reward = np.mean(total_rewards[-100:])

            # Print & Save to TensorBoard
            print("Frame #%d: \tdone %d games, \treward %.3f, \tepsilon %.2f, \tspeed %.2f f/s" % (frame_idx, len(total_rewards), m_reward, epsilon, speed))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", m_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            # If the mean reward for the last 100 episodes is a new maximum
            if best_m_reward is None or best_m_reward < m_reward:

                # Save model (the mean reward in the file name is rounded)
                torch.save(net.state_dict(), model_dir + env_name + "_%.0f.dat" % m_reward)

                if best_m_reward is not None:
                    print("\nBest reward updated %.3f -> %.3f\n" % (best_m_reward, m_reward))

                # Save mean reward
                best_m_reward = m_reward

            # If the mean reward exceeds the boundary, stop training
            if m_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        # If buffer ISN'T large enough for training, skip training
        if len(buffer) < REPLAY_START_SIZE:
            continue

        # Every `SYNC_TARGET_FRAMES`, sync parameters from main network to target network
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        # Zero gradients, sample batch from buffer, calculate loss, optimize & minimize loss (for `net`)
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()

finally:
    writer.close()
    env.close()
    # recording.close()

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)  

Frame #944: 	done 1 games, 	reward -21.000, 	epsilon 0.99, 	speed 232.34 f/s
Frame #1872: 	done 2 games, 	reward -20.500, 	epsilon 0.99, 	speed 274.93 f/s

Best reward updated -21.000 -> -20.500

Frame #2662: 	done 3 games, 	reward -20.667, 	epsilon 0.98, 	speed 301.69 f/s
Frame #3593: 	done 4 games, 	reward -20.750, 	epsilon 0.98, 	speed 498.88 f/s
Frame #4550: 	done 5 games, 	reward -20.600, 	epsilon 0.97, 	speed 490.56 f/s
Frame #5490: 	done 6 games, 	reward -20.667, 	epsilon 0.96, 	speed 421.10 f/s
Frame #6252: 	done 7 games, 	reward -20.714, 	epsilon 0.96, 	speed 41

<br>

### Final Screen of Training

<img src="recordings/training_final_screen.png" style="height:300px">

<br> 

### TensorBoard

In [None]:
# Shell command: runs TensorBoard's `dev upload` sub-command on the log
# directory `07_dqn/runs`, under the experiment name `07_dqn_pong`.
# NOTE(review): the training cell writes its logs to `runs/` relative to the
# working directory, whereas this command reads `07_dqn/runs` - confirm which
# directory the command is meant to be run from.
# NOTE(review): this relies on the hosted TensorBoard.dev service - confirm it
# is still available before re-running.
!tensorboard dev upload --logdir='07_dqn/runs' --name='07_dqn_pong'

<img src="runs/epsilon.png" style="height:300px">

<img src="runs/reward_100.png" style="height:300px">

<img src="runs/reward.png" style="height:300px">

<img src="runs/speed.png" style="height:300px">

<br>