## Setup

In [0]:
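#!pip install torch
#!pip install gym
# Necessary for Pong-ramDeterministic-v4 (Atari environments, used in the training cell below)
#!pip install 'gym[atari]'
# Only necessary for LunarLander-v2 (Box2D environments)
#!apt install swig   # Only if the installation of Box2D fails
#!pip install 'gym[box2d]'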

In [6]:
from collections import deque
from datetime import datetime
import random
import numpy as np
import gym
#import Box2D
import torch
import torch.nn as nn
import torch.nn.grad as grad
import torch.optim as optim
import torch.nn.functional as func

# Set to False to force CPU execution even when a CUDA device is present.
use_gpu = True

# Decide the compute device once; the model layers and the batch tensors
# in the cells below are all created on `available_device`.
cuda_selected = torch.cuda.is_available() and use_gpu
available_device = torch.device('cuda') if cuda_selected else torch.device('cpu')
print("Using cuda" if cuda_selected else "Using cpu")

Using cuda


## Class definitions

In [0]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of transitions for experience replay.

    Transitions are kept in a `deque` with `maxlen=capacity`, so once the
    buffer is full every `add` silently discards the oldest transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most `capacity` transitions."""
        self.buffer = deque(maxlen = capacity)

    def sample(self, k):
        """Draw `k` distinct stored transitions uniformly at random.

        Returns:
            A 2-D NumPy array of dtype `object` with shape (k, n_fields):
            one row per transition, one column per transition field, so
            `batch[:, j]` selects field j of every sampled transition.

        Raises:
            ValueError: if `k` exceeds the number of stored transitions
                (raised by `random.sample`).
        """
        picked = random.sample(self.buffer, k)
        n_fields = len(picked[0]) if picked else 0

        # A transition mixes arrays with scalars (e.g. [state, action, ...]).
        # Passing such ragged rows straight to np.array() is an error on
        # recent NumPy versions, so the object array is filled cell by cell.
        batch = np.empty((len(picked), n_fields), dtype=object)
        for row, transition in enumerate(picked):
            for col, field in enumerate(transition):
                batch[row, col] = field
        return batch

    def add(self, new_sample):
        """Append one transition, evicting the oldest one when at capacity."""
        self.buffer.append(new_sample)

    def count(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)
    
class DQN(nn.Module):
    """Fully connected Q-network with one to three hidden ReLU layers.

    The instance also owns its Adam optimizer (`self.optimizer`) and its
    loss function (`self.loss_fct`, a SmoothL1Loss).
    """

    def __init__(self, n_input, n_hidden_1, n_hidden_2, n_hidden_3, n_output, learning_rate):
        """Build the layer stack and the optimizer.

        Args:
            n_input: width of the input layer.
            n_hidden_1: width of the first hidden layer (always used).
            n_hidden_2: width of the second hidden layer; it is only left
                out when both `n_hidden_2` and `n_hidden_3` are <= 0.
            n_hidden_3: width of the third hidden layer; values <= 0 leave
                it out.
            n_output: width of the output layer.
            learning_rate: learning rate passed to Adam.
        """
        super().__init__()

        # Same precedence as a three-way if/elif/else on the hidden sizes:
        # a positive n_hidden_3 selects all three hidden layers.
        if n_hidden_3 > 0:
            hidden_widths = [n_hidden_1, n_hidden_2, n_hidden_3]
        elif n_hidden_2 > 0:
            hidden_widths = [n_hidden_1, n_hidden_2]
        else:
            hidden_widths = [n_hidden_1]

        # Linear -> ReLU pairs, created front to back; the activation that
        # would follow the output layer is dropped again.
        widths = [n_input] + hidden_widths + [n_output]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out).to(available_device))
            stack.append(nn.ReLU())
        del stack[-1]
        self.layers = nn.Sequential(*stack)

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.loss_fct = nn.SmoothL1Loss()

    def forward(self, x):
        """Run `x` through the layer stack and return the network output."""
        return self.layers(x)

    def loss(self, q_outputs, q_targets):
        """Return the SmoothL1 loss between predictions and targets.

        Both arguments are converted with `.float()` before the comparison.
        """
        predictions = q_outputs.float()
        targets = q_targets.float()
        return self.loss_fct(predictions, targets)

    def update_params(self, new_params, tau):
        """Blend this network's state towards `new_params`.

        Every entry of the state dict becomes
        `(1 - tau) * own_value + tau * new_params[key]`.

        Args:
            new_params: mapping with the same keys as `self.state_dict()`.
            tau: interpolation weight given to `new_params`.
        """
        blended = self.state_dict()
        for name in blended.keys():
            own_value = blended[name]
            blended[name] = (1-tau) * own_value + tau * new_params[name]
        self.load_state_dict(blended)

## Training


In [0]:
# Execution flags.
#   run_in_colab:  mount Google Drive and keep the checkpoints there;
#                  otherwise use the local paths of the else-branch.
#   load_networks: when True, the cell later restores both networks from the
#                  checkpoint files and prefills the replay buffer with the
#                  restored policy instead of random actions.
run_in_colab = True
load_networks = False

# policy_net_path / target_net_path are the files the networks' state dicts
# are loaded from and saved to.
if(run_in_colab):
    # Mounting Drive triggers the interactive authorization prompt.
    from google.colab import drive
    drive.mount('/content/gdrive')
    policy_net_path = "/content/gdrive/My Drive/Colab Notebooks/pong-ram-policy.pt"
    target_net_path = "/content/gdrive/My Drive/Colab Notebooks/pong-ram-target.pt"
else:
    # NOTE(review): absolute paths into one user's home directory; they have
    # to be edited before the notebook can run on any other machine.
    # NOTE(review): these files are named "lunar-*" although the environment
    # created below is Pong and the Colab files are named "pong-ram-*" --
    # presumably a leftover from an earlier experiment; confirm before
    # loading them into this network.
    policy_net_path = "/home/philipp/Dokumente/AAAUniversitaet/Deep-Learning/Reinforcement-learning/lunar-policy.pt"
    target_net_path = "/home/philipp/Dokumente/AAAUniversitaet/Deep-Learning/Reinforcement-learning/lunar-target.pt"

# Setup environment
# NOTE(review): env.seed(), the 4-tuple returned by env.step() and the bare
# observation returned by env.reset() used throughout this cell look like the
# pre-0.26 gym API -- confirm the installed gym version.
env = gym.make("Pong-ramDeterministic-v4")

# Set seeds
# Every random number source used in this notebook is seeded with the same
# value: NumPy, Python's `random`, the environment, its action space sampler
# and PyTorch (CPU and CUDA).
seed = 42
np.random.seed(seed)  # NumPy global generator (epsilon-greedy draw in the training loop)
random.seed(seed)  # Python `random` module (used by ReplayBuffer.sample)
env.seed(seed)
env.action_space.np_random.seed(seed)  # generator behind env.action_space.sample()
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # seeds every CUDA device (multi-GPU)
# Switch off cuDNN auto-tuning and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# Set hyperparameters

# -- Optimisation --
num_epochs = 5000000              # iterations of the training loop
batch_size = 64                   # transitions per gradient step (previously tried: 32)
learning_rate = 0.0001            # passed to the networks' Adam optimizer
gamma = 0.99                      # discount factor (previously tried: 0.95)
target_update_frequency = 5000    # iterations between hard target-network copies (previously tried: 1000)
# tau = 0.01                      # only needed for the soft target update (DQN.update_params)

# -- Replay buffer --
replay_buffer_capacity = 1000000  # maximum number of stored transitions (previously tried: 100000)
replay_init_size = 10000          # transitions collected before training starts

# -- Epsilon-greedy exploration: linear decay from `epsilon` to `epsilon_final` --
epsilon = 1.0
epsilon_final = 0.05
epsilon_final_reached = 100000    # iterations until epsilon_final is reached (previously tried: 50000)
epsilon_decay = (epsilon - epsilon_final) / epsilon_final_reached

# -- Reporting and checkpointing (all counted in training-loop iterations) --
validation_frequency = 10000      # interval between progress reports
save_frequency = 100000           # interval between checkpoint saves
do_validation = False             # also run greedy validation episodes at every report



# Define policy and target networks
# The input width is the first dimension of the observation space and the
# output width is the number of discrete actions (one Q-value per action).
n_input = env.observation_space.shape[0]
n_output = env.action_space.n
n_hidden_1, n_hidden_2, n_hidden_3 = 512, 256, 64

# Both networks share one architecture; the target network starts out as an
# exact copy of the freshly initialised policy network.
network_args = (n_input, n_hidden_1, n_hidden_2, n_hidden_3, n_output, learning_rate)
policy_net = DQN(*network_args)
target_net = DQN(*network_args)
target_net.load_state_dict(policy_net.state_dict())


if load_networks:
    # Resume from the saved checkpoints. map_location remaps the stored
    # tensors onto the device selected for this session, so a checkpoint
    # written on a GPU machine can also be restored on a CPU-only one.
    print("Loading saved networks from file")
    policy_net.load_state_dict(torch.load(policy_net_path, map_location=available_device))
    target_net.load_state_dict(torch.load(target_net_path, map_location=available_device))

# Prefill the replay buffer with `replay_init_size` transitions so that the
# first training batches can be sampled. With restored networks the (somewhat)
# working policy acts greedily; otherwise actions are drawn at random.
print("Prefilling replay buffer")
replay_buffer = ReplayBuffer(replay_buffer_capacity)
s = env.reset()
for i in range(replay_init_size):
    if load_networks:
        with torch.no_grad():
            s_tensor = torch.as_tensor(s, device = available_device).float()
            a = policy_net(s_tensor).argmax().item()
    else:
        a = env.action_space.sample()
    s1, r, done, _ = env.step(a)
    # Transition layout used everywhere in this notebook:
    # [state, action, next_state, reward, done]
    replay_buffer.add([s, a, s1, r, done])
    s = s1
    if done:
        s = env.reset()

# Start training
print("Starting training")


def _mean_or_nan(values):
    """Return the mean of `values`, or NaN (without a NumPy warning) if it is empty."""
    return np.mean(values) if len(values) > 0 else float('nan')


# One entry per finished training episode.
losses, rewards, episode_duration = [], [], []
# Accumulators of the training episode that is currently in progress.
episode_loss, episode_reward, episode_it = 0, 0, 0
# Number of finished episodes at the time of the previous progress report.
completed_at_last_validation = 0
s = env.reset()
starttime = datetime.now()
try:
    # Every iteration performs one gradient step and then one environment step.
    for i in range(num_epochs):
        # ---- One gradient step on a random minibatch ----
        # Columns of `batch`: state, action, next state, reward, done flag.
        batch = replay_buffer.sample(batch_size)
        ss = torch.as_tensor(np.stack(batch[:,0]), device = available_device).float()
        aa = torch.as_tensor(np.stack(batch[:,1]), device = available_device)
        ss1 = torch.as_tensor(np.stack(batch[:,2]), device = available_device).float()
        rr = torch.as_tensor(np.stack(batch[:,3]), device = available_device)
        ddone = torch.as_tensor(np.stack(batch[:,4]), device = available_device)

        policy_net.optimizer.zero_grad()
        Q = policy_net(ss)
        # Q-value of the action that was actually taken in each transition.
        q_policy = Q[range(len(aa)), aa]

        with torch.no_grad():
            # TD target: reward plus the discounted best target-network value
            # of the next state; (~ddone) removes the bootstrap term for
            # transitions that ended an episode.
            q_target = rr + gamma * target_net(ss1).max(dim=1)[0] * (~ ddone)

        loss = policy_net.loss(q_policy, q_target)
        loss.backward()
        policy_net.optimizer.step()

        # ---- Hard update of the target network ----
        # (Soft-update alternative: target_net.update_params(policy_net.state_dict(), tau))
        if (i+1) % target_update_frequency == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # ---- Linear epsilon decay, never dropping below epsilon_final ----
        if epsilon > epsilon_final:
            epsilon = max(epsilon_final, epsilon - epsilon_decay)

        # ---- One epsilon-greedy environment step, stored in the buffer ----
        if np.random.uniform() < epsilon:
            a = env.action_space.sample()
        else:
            with torch.no_grad():
                s_tensor = torch.as_tensor(s, device = available_device).float()
                a = policy_net(s_tensor).argmax().item()

        s1, r, done, _ = env.step(a)
        replay_buffer.add([s, a, s1, r, done])
        s = s1

        episode_it += 1
        episode_loss += loss.item()
        episode_reward += r

        if done:
            episode_duration.append(episode_it)
            losses.append(episode_loss/episode_it)
            rewards.append(episode_reward)
            episode_loss, episode_reward, episode_it = 0, 0, 0
            s = env.reset()

        # ---- Periodic progress report (optionally with greedy validation) ----
        if (i+1) % validation_frequency == 0:
            recent_rewards = rewards[completed_at_last_validation:]
            recent_losses = losses[completed_at_last_validation:]
            recent_durations = episode_duration[completed_at_last_validation:]

            if do_validation:
                # Validation keeps its own counters so that it cannot leak
                # into the statistics of the training episodes.
                validation_rewards, validation_duration = [], []
                val_reward, val_it = 0, 0
                val_s = env.reset()
                while len(validation_rewards) < 10:
                    with torch.no_grad():
                        s_tensor = torch.as_tensor(val_s, device = available_device).float()
                        a = policy_net(s_tensor).argmax().item()
                    val_s, r, done, _ = env.step(a)
                    val_reward += r
                    val_it += 1
                    if done:
                        validation_duration.append(val_it)
                        validation_rewards.append(val_reward)
                        val_reward, val_it = 0, 0
                        val_s = env.reset()

                # Validation ran on the training environment, so the training
                # episode that was in progress is lost. Continue from the
                # freshly reset state and discard ALL partial statistics of
                # the interrupted episode (loss included), so the next
                # recorded episode is not mixed with it.
                s = val_s
                episode_loss, episode_reward, episode_it = 0, 0, 0

                print("%i: Episodes completed: %d \t Mean training reward: %5.2f \t Mean validation reward: %5.2f \t Mean normalized loss: %5.2f \t Mean training duration: %5.2f \t Mean validation duration: %5.2f" %
                    (i+1, len(recent_rewards), _mean_or_nan(recent_rewards), _mean_or_nan(validation_rewards), _mean_or_nan(recent_losses), _mean_or_nan(recent_durations), _mean_or_nan(validation_duration)))
            else:
                print("%i: Episodes completed: %d \t Mean training reward: %5.2f \t Mean normalized loss: %5.2f \t Mean training duration: %5.2f" %
                    (i+1, len(recent_rewards), _mean_or_nan(recent_rewards), _mean_or_nan(recent_losses), _mean_or_nan(recent_durations)))

            completed_at_last_validation = len(rewards)

        # Save the networks intermittently
        if (i+1) % save_frequency == 0:
            torch.save(policy_net.state_dict(), policy_net_path)
            torch.save(target_net.state_dict(), target_net_path)
except KeyboardInterrupt:
    # Interrupting the cell ends training early; the final save below still runs.
    print('Training interrupted early.')

torch.save(policy_net.state_dict(), policy_net_path)
torch.save(target_net.state_dict(), target_net_path)

endtime = datetime.now()
print("Finished training. Completed %d episodes in %s." % (len(rewards), str(endtime - starttime)))


# Run tests:
# Play `nr_tests` complete episodes with the greedy policy (no exploration)
# and report the mean episode reward and the mean episode length.
try:
    print("\nRunning tests")
    k, nr_tests = 0, 10
    test_rewards, test_duration = [], []
    episode_reward, episode_it = 0, 0
    s = env.reset()
    while k < nr_tests:
        with torch.no_grad():
            s_tensor = torch.as_tensor(s, device = available_device).float()
            a = policy_net.forward(s_tensor).argmax().item()
        s1, r, done, _ = env.step(a)
        s = s1
        episode_it += 1
        episode_reward += r
        if not done:
            continue

        # Episode finished: record it and start the next one.
        test_rewards.append(episode_reward)
        test_duration.append(episode_it)
        episode_reward, episode_it = 0, 0
        done = False
        k += 1
        s = env.reset()
except KeyboardInterrupt:
    # Episodes completed before the interrupt are still reported below.
    print('Testing interrupted early.')    

print("Mean test reward: %5.2f \t Mean test duration: %5.2f" % (np.mean(test_rewards), np.mean(test_duration)))

env.close()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Prefilling replay buffer
Starting training
10000: Episodes completed: 11 	 Mean training reward: -20.55 	 Mean normalized loss:  0.05 	 Mean training duration: 908.91
20000: Episodes completed: 11 	 Mean training reward: -20.36 	 Mean normalized loss:  0.01 	 Mean training duration: 900.91
30000: Episodes completed: 11 	 Mean training reward: -20.73 	 Mean normalized loss:  0.01 	 Mean training duration: 846.45
40000: Episodes completed: 12 	 Mean training re

## Some old timing test

from datetime import datetime
batch_size = 64

# Timing comparison between two ways of storing transitions:
#   1. NumPy/Python objects in the buffer, converted to tensors on
#      `available_device` at sampling time;
#   2. tensors created on `available_device` before they are stored.
#
# NOTE(review): `episode_length` and `ReplayBuffer2` are not defined anywhere
# in the visible notebook; they have to be provided before this code can run.
# NOTE(review): the training cell above ends with env.close(); presumably the
# environment has to be re-created before it is stepped again -- confirm.

# ---- Variant 1: fill the buffer with plain Python/NumPy transitions ----
replay_buffer = ReplayBuffer(replay_buffer_capacity)
starttime = datetime.now()
# Number of transitions stored so far. It must start at zero here; without
# the reset the loop would continue from whatever value `i` had before.
i = 0
while i < replay_buffer_capacity:
    s = env.reset()
    done = False
    for j in range(episode_length):
        a = env.action_space.sample()
        s1, r, done, _ = env.step(a)
        replay_buffer.add([s,a,s1,r,done])
        s = s1
        i = i+1
        if done or i >= replay_buffer_capacity:
            break

print("Creation time on CPU: ", datetime.now() - starttime)
starttime = datetime.now()

for it in range(10000):
    batch = replay_buffer.sample(batch_size)
    ss = torch.as_tensor(np.stack(batch[:,0]), device = available_device)
    aa = torch.as_tensor(np.stack(batch[:,1]), device = available_device)
    ss1 = torch.as_tensor(np.stack(batch[:,2]), device = available_device)
    rr = torch.as_tensor(np.stack(batch[:,3]), device = available_device)
    ddone = torch.as_tensor(np.stack(batch[:,4]), device = available_device)

print("Sampling time with copy to GPU: ", datetime.now() - starttime)

# ---- Variant 2: convert every field to a tensor on the device first ----
replay_buffer = ReplayBuffer2(replay_buffer_capacity)
starttime = datetime.now()
# Reset the counter again so that both variants store the same number of
# transitions and their timings are comparable.
i = 0
while i < replay_buffer_capacity:
    s = env.reset()
    done = False
    for j in range(episode_length):
        a = env.action_space.sample()
        s1, r, done, _ = env.step(a)
        s = torch.as_tensor(s, device = available_device)
        a = torch.as_tensor(a, device = available_device)
        s1 = torch.as_tensor(s1, device = available_device)
        r = torch.as_tensor(r, device = available_device)
        done = torch.as_tensor(done, device = available_device)
        replay_buffer.add([s,a,s1,r,done])
        s = s1
        i = i+1
        # `done` is a tensor at this point, hence .item() for the truth test.
        if done.item() or i >= replay_buffer_capacity:
            break

print("Creation time on GPU: ", datetime.now() - starttime)
starttime = datetime.now()

for it in range(10000):
    batch = replay_buffer.sample(batch_size)
    # Transpose the list of transitions into one list per field.
    batch = [list(x) for x in zip(*batch)]
    ss = torch.stack(batch[0])
    aa = torch.stack(batch[1])
    ss1 = torch.stack(batch[2])
    rr = torch.stack(batch[3])
    ddone = torch.stack(batch[4])

print("Sampling time directly on GPU: ", datetime.now() - starttime)