In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import time
import tensorwatch as tw

import gym
import math
import random
import cv2
import numpy as np
import collections
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from wrappers import make_env

# Switch for notebook-specific behaviour: selects the tqdm flavour and enables
# the matplotlib setup below.
NOTEBOOK_MODE = True
if NOTEBOOK_MODE:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm


# Pong environment with gym's wrappers stripped via `.unwrapped`.
# NOTE: later cells rebind `env` to a wrapped 'PongNoFrameskip-v4' environment.
env = gym.make('Pong-v0').unwrapped
if NOTEBOOK_MODE:
    # set up matplotlib to open viewing window
    is_ipython = 'inline' in matplotlib.get_backend()
    if is_ipython:
        from IPython import display

    # Switch matplotlib to interactive mode.
    plt.ion()

# Use the GPU when torch can see one, otherwise fall back to the CPU.
# NOTE: a later cell does `from settings import device`, which rebinds this name.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import gym
import argparse
import math
import os
import tensorwatch as tw

import torch
import torch.optim as optim
import torch.nn.functional as F

# from apex import amp # playing around with mixed-precision training

# Local Imports
from models import Qnet
from wrappers import make_env
from memory import ReplayBuffer
from helpers import saveTrainedGameplay, get_state
from settings import device

In [None]:
# Show the action meanings reported by the unwrapped environment (cell output).
env.unwrapped.get_action_meanings()

In [None]:
# Preprocessing pipeline applied to each rendered frame: tensor -> PIL image,
# resize to 64 with bicubic resampling, then back to a tensor.
# `Image.BICUBIC` is used instead of the `Image.CUBIC` alias: both name the same
# filter, but the alias was deprecated and later removed from Pillow.
resize = T.Compose([T.ToPILImage(),
                    T.Resize(64, interpolation=Image.BICUBIC),
                    T.ToTensor()])

def get_screen():
    """Render the current frame and return it as a preprocessed batch of one.

    The rendered RGB array is moved to channel-first order, the background
    values (72, 74, 144) are zeroed, every remaining non-zero value is set to
    213, and the result is scaled by 1/255 as float32. The frame is then passed
    through the module-level `resize` pipeline, given a leading batch axis and
    moved to `device`.

    Returns:
        A tensor with a leading batch dimension of 1, on `device`.
    """
    # channel, h, w layout
    frame = env.render(mode='rgb_array').transpose((2, 0, 1))

    # erase background: these three intensity values are treated as background
    for background_value in (72, 74, 144):
        frame[frame == background_value] = 0
    # everything that survived becomes a single foreground intensity
    frame[frame != 0] = 213

    scaled = np.ascontiguousarray(frame, dtype=np.float32) / 255
    resized = resize(torch.from_numpy(scaled))

    # batch, channel, h, w layout
    return resized.unsqueeze(0).to(device)

# Visual sanity check of get_screen(): show one preprocessed frame.
if NOTEBOOK_MODE:
    env.reset()
    # run game for a bit to load the ball and opponent paddle
    for i in range(50):
        env.step(0)
    plt.figure()
    # get_screen() returns (1, C, H, W); drop the batch axis and move channels
    # last so imshow receives an (H, W, C) array.
    plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
               interpolation='none')
    plt.show()

In [None]:
# Small replay buffer for interactive experiments.
memory = ReplayBuffer(size=100)

# Transition layout expected by put():
#memory.put((state,action,reward,next_state,done_mask))

In [None]:
import sys
#memory.put((state,1,0.0,next_state,0.0))

# Test Memory Replay Buffer

In [16]:
%load_ext autoreload
%autoreload 2
from memory import ReplayBuffer
import gym
from helpers import get_state
from wrappers import make_env
# Wrapped Pong environment plus a 100000-entry replay buffer for the test below.
env = gym.make('PongNoFrameskip-v4')
env = make_env(env)
memory = ReplayBuffer(size=100000)
# Seed `state` / `next_state` with the reset observation and the observation
# after one step of action 0.
state = get_state(env.reset())
next_state = get_state(env.step(0)[0])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Fill the replay buffer with 20000 dummy transitions (action 1, reward 0.0,
# done_mask 0.0) so that sampling can be exercised below.
for i in range(20000):
    if i % 100 == 0:
        # Restart the game every 100 steps and take `state` from the fresh
        # observation, so a stored transition never pairs the last state of one
        # episode with the first state of the next.
        state = get_state(env.reset())
    next_state = get_state(env.step(1)[0])
    memory.put((state,1,0.0,next_state,0.0))
    state = next_state

In [22]:
import torch
# Fall back to the CPU when no CUDA device is available instead of failing on
# the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Repeatedly sample minibatches of 32 and move every component to `device`.
for i in range(10000):
    s,a,r,s_prime,done_mask = memory.sample(32)
    s = torch.Tensor(s).to(device)
    a = torch.LongTensor(a).to(device)
    r = torch.Tensor(r).to(device)
    s_prime = torch.Tensor(s_prime).to(device)
    done_mask = torch.Tensor(done_mask).to(device)

In [28]:
# Peek at the first state of a freshly sampled 10-item minibatch (cell output).
torch.as_tensor(memory.sample(10)[0][0])

tensor([[[ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         ...,
         [ 87,  87,  87,  ..., 236, 236, 236],
         [ 87,  87,  87,  ..., 236, 236, 236],
         [ 87,  87,  87,  ..., 236, 236, 236]],

        [[ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         ...,
         [ 87,  87,  87,  ..., 236, 236, 236],
         [ 87,  87,  87,  ..., 236, 236, 236],
         [ 87,  87,  87,  ..., 236, 236, 236]],

        [[ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         ...,
         [ 87,  87,  87,  ..., 236, 236, 236],
         [ 87,  87,  87,  ..., 236, 236, 236],
         [ 87,  87,  87,  ..., 236, 236, 236]],

        [[ 52,  87,  87,  ..., 236, 236, 236],
         [ 52,  87,  87,  ..., 236, 236, 236],
         [ 5

# Test Prioritized Memory Replay Buffer

In [None]:
%load_ext autoreload
%autoreload 2
from memory import PrioritizedReplayBuffer
import gym
from helpers import get_state
from wrappers import make_env
# Wrapped Pong environment plus a prioritized replay buffer (alpha = 1).
env = gym.make('PongNoFrameskip-v4')
env = make_env(env)
memory = PrioritizedReplayBuffer(size=10000, alpha = 1)

# Fill the buffer with 10000 transitions produced by uniformly random actions.
# Reward and done_mask are dummy values (0.0 and 1.0).
state = get_state(env.reset())
for _ in range(10000):
    a = np.random.choice([0,1,2,3])
    obs, _reward, done, _info = env.step(a)
    next_state = get_state(obs)
    memory.put((state,a,0.0,next_state,1.0))
    # Stepping an environment after it reported `done` is not valid, so start a
    # new episode whenever the current one ends.
    state = get_state(env.reset()) if done else next_state

In [None]:
# Draw a prioritized minibatch of 4; besides the usual transition fields the
# sampler also returns per-sample weights and the sampled indices.
# The second argument (0.8) is presumably the importance-sampling exponent
# (beta) — TODO confirm against PrioritizedReplayBuffer.sample.
s,a,r,s_prime,done_mask,weights,idxes = memory.sample(4, 0.8)
print(weights)
print(idxes)

In [None]:
from models import Qnet
import torch
import torch.nn.functional as F
# Stand-alone check of the Double-DQN target / loss on the minibatch sampled in
# the previous cell.
h, w = 84, 84
gamma = 0.98
device='cpu'
q = Qnet(h,w, in_channels = 4, n_actions = 4).to(device)
q_target = Qnet(h,w, in_channels = 4, n_actions = 4).to(device)

# Load policy weights into target network
q_target.load_state_dict(q.state_dict())

s = torch.Tensor(s).to(device)
a = torch.LongTensor(a).to(device)
r = torch.Tensor(r).to(device)
s_prime = torch.Tensor(s_prime).to(device)
done_mask = torch.Tensor(done_mask).to(device)

q_out = q(s)
# collect output from the chosen action dimension
q_a = q_out.gather(1,a)

# Double DQN: the online network picks the next action, the target network
# evaluates it.
argmax_q = q(s_prime).argmax(1).unsqueeze(1)
q_prime = q_target(s_prime).gather(1,argmax_q)
# Mask the bootstrap term with done_mask, matching the DDQN update used
# elsewhere in this notebook (assumes done_mask has the same shape as r —
# TODO confirm against the sampler).
target = r + gamma * q_prime * done_mask

TD_error = target-q_a
print(TD_error)
loss = F.smooth_l1_loss(q_a, target)
print(loss)

In [None]:
# Per-sample weights from the prioritized sampler as a float tensor.
# NOTE: this rebinds `w`, which an earlier cell used for the frame width.
w = torch.Tensor(weights)

In [None]:
# Hand-picked weights that override the sampled ones for the experiments below.
w = torch.Tensor([0.03,0.5,0.2,0.8])

In [None]:
# Weighted mean of the squared TD errors (cell output).
((TD_error**2).view(-1) * w).mean()

In [None]:
# Huber loss computed after scaling both target and prediction by the weights
# (cell output).
F.smooth_l1_loss(target.view(-1)*w, q_a.view(-1)*w)

In [None]:
# Weighted mean of the signed TD errors, using the sampled `weights` rather
# than the manual `w` (cell output).
(TD_error.view(-1) * torch.Tensor(weights)).mean()

In [None]:
%%timeit
# Times the NumPy route: cast to uint8, reverse the axes with transpose(), add
# a leading axis, then build a FloatTensor.
# NOTE(review): `x` is not defined anywhere in the visible notebook — it must
# come from kernel state. transpose() reverses all axes, which for a 3-D input
# is a different layout from the permute((2,0,1)) variant timed in the next cell.
torch.FloatTensor(np.expand_dims(np.uint8(x).transpose(), 0))

In [None]:
%%timeit
# Times the torch route: ByteTensor, move the last axis first, add a leading
# axis, then convert to float.
# NOTE(review): `x` is not defined anywhere in the visible notebook.
torch.ByteTensor(x).permute((2,0,1)).unsqueeze(0).float()

In [None]:
from pympler.asizeof import asizeof

In [None]:
# Shallow size of the buffer container only; sys.getsizeof does not follow
# references to the stored transitions.
sys.getsizeof(memory.buffer)

In [None]:
# Size of the whole replay memory object as measured by pympler's asizeof.
asizeof(memory)

In [None]:
# Size of the first field of the first stored transition, per pympler.
asizeof(memory.buffer[0][0])

In [None]:
# Type of the first field of the first stored transition (cell output).
memory.buffer[0][0].type()

In [None]:
# Raw storage of one stored state: bytes per element times element count.
# NOTE: this rebinds `a`, which other cells use for the action batch.
a = memory.buffer[0][0]
a.element_size() * a.nelement() # 4 * 1 * 4 * 84 * 84

In [None]:
# Extrapolate to 100000 entries with two such tensors each, expressed in
# units of 1e9 bytes.
a.element_size() * a.nelement() * 2 * 100000 / 1e+9

In [None]:
# Same estimate as the cell above, written out from its factors:
# 4 bytes per element, a 1 x 4 x 84 x 84 tensor, two tensors per entry,
# 100000 entries, reported in units of 1e9 bytes.
bytes_per_element = 4
elements_per_state = 1 * 4 * 84 * 84
states_per_entry = 2
n_entries = 100000
buffer_cost = bytes_per_element * elements_per_state * states_per_entry * n_entries / 1e9
print("%sGB" % buffer_cost)

In [None]:
def train(q, q_target, memory, optimizer, batch_size, gamma):
    r"""Run one optimisation step of the policy network on a sampled minibatch.

    The loss is the Huber loss between the Q-value of the action actually taken
    and the one-step bootstrapped target built from the target network:

        loss(x, y) = \frac{1}{n} \sum_i z_i, where
            z_i = 0.5 (x_i - y_i)^2     if |x_i - y_i| < 1
            z_i = |x_i - y_i| - 0.5     otherwise

    Args:
        q: Policy network being optimised.
        q_target: Target network used to evaluate the next state; it is not
            updated here.
        memory: Replay buffer whose ``sample(batch_size)`` returns
            ``(s, a, r, s_prime, done_mask)``.
        optimizer: Optimizer holding the parameters of ``q``.
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the bootstrapped value.

    Returns:
        None. The parameters of ``q`` are updated in place.
    """
    s, a, r, s_prime, done_mask = memory.sample(batch_size)

    # collect output from the chosen action dimension
    q_a = q(s).gather(1, a)

    # The target is a constant with respect to the optimisation step, so build
    # it without autograd: no graph is kept through the target network and no
    # gradients accumulate on its parameters.
    with torch.no_grad():
        # most reward we get in next state s_prime
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        # done_mask zeroes the bootstrap term for terminal transitions
        target = r + gamma * max_q_prime * done_mask

    # how much is our policy different from the true target
    loss = F.smooth_l1_loss(q_a, target)

    optimizer.zero_grad()

    #with amp.scale_loss(loss, optimizer) as scaled_loss: # playing around with mixed-precision training
        #scaled_loss.backward()
    loss.backward()
    optimizer.step()



#def main(num_episodes, episode_start = 1, saved_model = None, save_loc = 'checkpoints/tmp/'):
#watcher = tw.Watcher()
num_episodes = 100
episode_start = 1
# NOTE(review): saved_model and save_loc are assigned but never read in this
# cell — no checkpoint is loaded or written here.
saved_model = 'checkpoints/4channel/target_bot_1500.pt'
save_loc = None
# Model parameters
learning_rate = 1e-4 # 0.0001 matches paper
gamma = 0.98
buffer_limit = 10 ** 5 # paper uses 1M last frames, but this is expensive, so we try 10x less
batch_size = 32

# Epsilon Decay Parameters
eps_start = 1
eps_end = 0.01
decay_factor = 10 ** 5

# Exponential decay from eps_start towards eps_end as a function of the number
# of frames seen so far.
epsilon_decay = lambda x: eps_end + (eps_start - eps_end) * \
    math.exp(-1. * x / decay_factor)

env = gym.make('PongNoFrameskip-v4')
env = make_env(env)
h, w = 84, 84


# Initialize the policy (q) network, target network, and experience replay buffer
q = Qnet(h,w, in_channels = 4, n_actions = 4).to(device)
q_target = Qnet(h,w, in_channels = 4, n_actions = 4).to(device)
memory = ReplayBuffer(buffer_limit)


# Load policy weights into target network
q_target.load_state_dict(q.state_dict())
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

save_interval = 250  # NOTE(review): not used in this cell
print_interval = 1
update_target_interval = 1000 # every 1000 frames
score = 0.0


#[q, q_target], optimizer = amp.initialize([q, q_target], optimizer, opt_level="O1") #playing around with mixed-precision training
total_frames = 0
best_episode_score = -100
for episode in tqdm(range(episode_start,episode_start + num_episodes)):
    # anneal 100% to 1% over training; epsilon is refreshed once per episode
    # from the running frame count
    epsilon = epsilon_decay(total_frames)

    # Reset Environment for each game
    state = get_state(env.reset())
    episode_score = 0
    done = False
    # The loop condition ends the episode; no explicit break is needed.
    while not done:
        total_frames += 1
        action = q.sample_action(state.to(device), epsilon)

        obs, reward, done, info = env.step(action)

        next_state = get_state(obs)

        # 0.0 on terminal transitions so the bootstrap term is dropped in train()
        done_mask = 0.0 if done else 1.0
        memory.put((state,action,reward,next_state,done_mask))

        state = next_state

        score += reward
        episode_score += reward

        # Start learning only once the buffer holds more than 10000 transitions
        if memory.size() > 10000:
            train(q, q_target, memory, optimizer, batch_size, gamma)
        # Periodically copy the policy weights into the target network
        if total_frames%update_target_interval == 0:
            q_target.load_state_dict(q.state_dict())

    if episode_score > best_episode_score:
        best_episode_score = episode_score

    if episode%print_interval==0 and episode!=0:
        # Average over the episodes actually played in this run, so the figure
        # stays correct when episode_start is not 1.
        episodes_played = episode - episode_start + 1
        print("n_episode : {}, Total Frames : {}, Average Score : {:.1f}, Episode Score : {:.1f}, Best Score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
            episode, total_frames, score/episodes_played, episode_score, best_episode_score, memory.size(), epsilon*100))

# Microsoft Tensorwatch Watcher for Visualizing Training (disabled).
# Kept as comments rather than a string literal so the cell does not echo it
# as its output.
# watcher.observe(
#     episode = episode,
#     episode_score = episode_score,
#     total_score = score,
#     buffer_size = memory.size(),
#     epsilon = epsilon,
#     frames = total_frames,
# )

In [None]:
# Inline copy of one DQN optimisation step, with the loss printed for inspection.
#train(q, q_target, memory, optimizer, batch_size, gamma)
s,a,r,s_prime,done_mask = memory.sample(batch_size)

q_out = q(s)
# collect output from the chosen action dimension
q_a = q_out.gather(1,a)

# The target is a constant for this update, so build it without autograd; this
# avoids keeping a graph through the target network.
with torch.no_grad():
    # most reward we get in next state s_prime
    max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
    target = r + gamma * max_q_prime * done_mask

# how much is our policy different from the true target
loss = F.smooth_l1_loss(q_a, target)

optimizer.zero_grad()

#with amp.scale_loss(loss, optimizer) as scaled_loss: # playing around with mixed-precision training
    #scaled_loss.backward()
print(loss)
loss.backward()
optimizer.step()

# DQN Training

In [None]:
from DQN import DQN
import gym
from wrappers import make_env
# Build the wrapped Pong environment and hand it to the DQN class; the argument
# to run() is presumably the number of episodes — TODO confirm against DQN.run.
env = gym.make('PongNoFrameskip-v4')
env = make_env(env)
dqn = DQN(env, save_location = "checkpoints/pong/")
dqn.run(100)

# Double DQN Training

In [None]:
# Inline Double-DQN optimisation step on the minibatch already held in
# s / a / r / s_prime / done_mask.
#s,a,r,s_prime,done_mask = memory.sample(batch_size)
# Q_out is the observed transitions given the current network
q_out = q(s)
# collect output from the chosen action dimension
q_a = q_out.gather(1,a)

# DDQN Update. Neither the action selection nor the target evaluation needs
# gradients, so both run without autograd.
with torch.no_grad():
    # action the online network would pick in s_prime
    argmax_q = q(s_prime).argmax(1).unsqueeze(1)
    # value the target network assigns to that action
    q_prime = q_target(s_prime).gather(1,argmax_q)
    # bootstrapped target; done_mask drops the future term on terminal steps
    target = r + gamma * q_prime * done_mask

# how much is our policy different from the true target
loss = F.smooth_l1_loss(q_a, target)
optimizer.zero_grad()
print(loss)
#with amp.scale_loss(loss, optimizer) as scaled_loss: # playing around with mixed-precision training
    #scaled_loss.backward()
loss.backward()
optimizer.step()

In [None]:
# Shape check for the Double-DQN action selection: argmax over dim 1, with a
# trailing axis added so it can be used as a gather index.
argmax_q = q(s_prime).argmax(1).unsqueeze(1)
print(argmax_q.shape)
q_prime = q_target(s_prime).gather(1,argmax_q)

In [None]:
# record trained agent gameplay

frames = []

env.reset()
done = False
# The network input is the difference between two consecutive preprocessed
# screens; both are the same frame at the start.
last_s = get_screen()
current_s = get_screen()
s = last_s - current_s
# epsilon = 0 is passed to sample_action so the agent acts without exploration
epsilon = 0.0
# The loop condition ends the episode; no explicit break is needed.
while not done:
    # NOTE(review): the +2 offset presumably maps the network's action index
    # onto the environment's action ids — confirm against the action meanings.
    a = q.sample_action(s, epsilon) + 2

    # use environment's frame instead of preprocessed get_screen()
    next_frame, _, done, info = env.step(a)
    frames.append(next_frame)
    last_s = current_s
    current_s = get_screen()
    s_prime = last_s - current_s

    s = s_prime

In [None]:
# save game to video 
height, width = frames[0].shape[:2] 

writer = cv2.VideoWriter_fourcc('M','J','P','G')
fps = 30
video_file = 'playback.avi'
out = cv2.VideoWriter(video_file, writer, fps, (width,height))
for frame in frames:
    out.write(frame)

out.release()

In [19]:
%load_ext autoreload
%autoreload 2
from DDQN import DDQN
import gym
from wrappers import make_env
# Build the wrapped Pong environment and the Double-DQN agent that will train on it.
env = gym.make('PongNoFrameskip-v4')
env = make_env(env)
ddqn = DDQN(env, save_location = "checkpoints/pong/")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Start training. The recorded output below shows a 100-step progress bar and
# one log line per episode; the run was stopped manually (KeyboardInterrupt).
ddqn.run(100)


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:01<02:01,  1.23s/it][A

n_episode : 1, Total Frames : 957, Average Score : -20.0, Episode Score : -20.0, Best Score : -20.0, n_buffer : 957, eps : 100.0%



  2%|▏         | 2/100 [00:25<13:23,  8.20s/it][A

n_episode : 2, Total Frames : 1892, Average Score : -19.5, Episode Score : -19.0, Best Score : -19.0, n_buffer : 1892, eps : 99.1%



  3%|▎         | 3/100 [00:55<23:35, 14.60s/it][A

n_episode : 3, Total Frames : 2957, Average Score : -19.7, Episode Score : -20.0, Best Score : -19.0, n_buffer : 2957, eps : 98.1%



  4%|▍         | 4/100 [01:25<30:53, 19.30s/it][A

n_episode : 4, Total Frames : 4031, Average Score : -19.5, Episode Score : -19.0, Best Score : -19.0, n_buffer : 4031, eps : 97.1%


KeyboardInterrupt: 

In [1]:
import torch

In [2]:
def epsilon_decay(x):
    """Return the exploration rate after `x` frames.

    The value decays exponentially from `eps_start` towards `eps_end` with time
    constant `decay_factor`. All three, and `math`, are read from the module
    namespace at call time.
    """
    span = eps_start - eps_end
    return eps_end + span * math.exp(-1. * x / decay_factor)

In [6]:
import math
# Probe the decay schedule far into training (1e6 frames).
# NOTE(review): eps_end (1) is larger than eps_start (0.4) here, the reverse of
# the training cell (eps_start = 1, eps_end = 0.01). With these values the
# schedule rises towards 1, which matches the recorded output of about 0.99997.
# Confirm whether the two values were swapped by accident.
eps_end = 1
eps_start = 0.4
decay_factor = 100000
epsilon_decay(1e6)

0.9999727600421425