In [None]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]
!pip3 install ipdb

!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1



In [None]:
import gym
import math
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from gym import wrappers

from IPython.display import clear_output

In [None]:
# For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display 
import glob
import base64, io
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from gym.wrappers import Monitor



In [None]:
"""
Q-network class
"""
class QNetwork(nn.Module):
  """
  Fully connected Q-network: two hidden ReLU layers followed by a linear
  output layer with one value per action.

  num_states:    size of the input vector
  num_actions:   size of the output vector
  hidden_layers: sequence whose first two entries are the hidden-layer widths
  seed:          value passed to torch.manual_seed before the layers are built
  """

  def __init__(self, num_states, num_actions, hidden_layers, seed):
    super().__init__()
    # Seed torch's global RNG first so the layer initialization below is reproducible.
    self.seed = torch.manual_seed(seed)
    first_width = hidden_layers[0]
    second_width = hidden_layers[1]
    # Layers are created in fc1 -> fc2 -> fc3 order; the order determines
    # which random numbers each layer's initial weights receive.
    self.fc1 = nn.Linear(num_states, first_width)
    self.fc2 = nn.Linear(first_width, second_width)
    self.fc3 = nn.Linear(second_width, num_actions)

  def forward(self, X):
    """Return the output of fc3 for the batch X (no activation on the last layer)."""
    hidden = F.relu(self.fc1(X))
    hidden = F.relu(self.fc2(hidden))
    q_values = self.fc3(hidden)
    return q_values

In [None]:
# Data structure for one stored transition
Experience = namedtuple('Experience',
                        ('state', 'action', 'next_state', 'reward', 'done'))


class ReplayMemory:
  """
  Replay Memory to store experiences

  Fixed-capacity FIFO buffer of Experience tuples: once `mem_capacity`
  entries are stored, appending a new one discards the oldest.
  """

  def __init__(self, num_actions, mem_capacity, rand_seed, device=None):
    """
    num_actions:  number of actions (stored on the instance, not used by this class)
    mem_capacity: maximum number of experiences kept
    rand_seed:    seed applied to Python's global `random` module, which drives sampling
    device:       torch device for sampled tensors; when None, the notebook-level
                  `device` variable is looked up each time sample_exp runs
    """
    self.num_actions = num_actions
    self.memory = deque(maxlen = mem_capacity)
    # random.seed() returns None, so keep the seed value itself on the instance.
    random.seed(rand_seed)
    self.seed = rand_seed
    self.device = device

  def __len__(self):
    """Magic method to return size of memory"""
    return len(self.memory)

  def add_experience(self, state, action, next_state, reward, done):
    """Add experience to memory"""
    self.memory.append(Experience(state, action, next_state, reward, done))

  def sample_exp(self, n):
    """
    Get n experiences from memory

    Returns the tuple (states, actions, next_states, rewards, dones); each
    element is a tensor with one row per sampled experience. actions are
    torch.long, every other tensor is torch.float.
    Raises ValueError (from random.sample) when n exceeds the number stored.
    """
    # Inside this method `device` is the notebook-level global, not the __init__ argument.
    target_device = self.device if self.device is not None else device

    experiences = random.sample(self.memory, k=n)

    def stack(rows, dtype):
      # One row per experience, converted to `dtype` and moved to the target device.
      return torch.from_numpy(np.vstack(rows)).to(dtype).to(target_device)

    states = stack([e.state for e in experiences], torch.float)
    actions = stack([e.action for e in experiences], torch.long)
    next_states = stack([e.next_state for e in experiences], torch.float)
    rewards = stack([e.reward for e in experiences], torch.float)
    # Booleans go through uint8 first, then to float.
    done_flags = np.vstack([e.done for e in experiences]).astype(np.uint8)
    dones = torch.from_numpy(done_flags).to(torch.float).to(target_device)

    return (states, actions, next_states, rewards, dones)


In [None]:
class Agent():
  """
  DQN / Double-DQN agent.

  Holds a policy network (trained by gradient descent), a target network
  (soft-updated towards the policy network after every learning step) and a
  replay memory from which training batches are sampled.
  """

  def __init__(self, num_states, num_actions, seed, learning_rate, memory_capacity, batch_size, update_frequency, gamma, tau, ddqn = True):
    """
    num_states / num_actions: input and output sizes of the Q-networks
    seed:             seed forwarded to both networks and the replay memory
    learning_rate:    Adam learning rate for the policy network
    memory_capacity:  maximum number of stored experiences
    batch_size:       number of experiences sampled per learning step
    update_frequency: learn once every this many calls to execute_action
    gamma:            discount factor
    tau:              interpolation factor of the soft target update
    ddqn:             True -> Double DQN targets, False -> vanilla DQN targets
    """
    self.num_states = num_states
    self.num_actions = num_actions
    self.seed = seed
    self.update_frequency = update_frequency
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.ddqn = ddqn
    self.time_step = 0
    # Capture the notebook-level `device` once so the methods do not reach for the global.
    self.device = device

    # policy_net is trained; target_net supplies the bootstrap targets.
    self.policy_net = QNetwork(num_states, num_actions, [64, 64], self.seed).to(self.device)
    self.target_net = QNetwork(num_states, num_actions, [64, 64], self.seed).to(self.device)
    # initialize target with policy net params
    self.sync_networks()

    self.optimizer = optim.Adam(self.policy_net.parameters(), lr = learning_rate)

    self.memory = ReplayMemory(num_actions, memory_capacity, self.seed)

  def sync_networks(self):
    """Copy the policy network's parameters into the target network."""
    _ = self.target_net.load_state_dict(self.policy_net.state_dict())

  def execute_action(self, state, action, reward, next_state, done):
    """
    Store one transition and, every `update_frequency` calls, learn from a
    sampled batch provided the memory holds more than `batch_size` experiences.
    """
    # Keyword arguments: this method takes (reward, next_state) while
    # ReplayMemory.add_experience takes (next_state, reward); passing them
    # positionally stored each value under the other's field name.
    self.memory.add_experience(state=state, action=action, next_state=next_state,
                               reward=reward, done=done)

    self.time_step += 1
    due_for_update = self.time_step % self.update_frequency == 0
    if due_for_update and len(self.memory) > self.batch_size:
      exp = self.memory.sample_exp(self.batch_size)
      self.ddqn_learn(exp, self.gamma)

  def select_action(self, state, eps = 0 ):
    """
    Epsilon-greedy action selection.

    state: numpy array holding a single observation
    eps:   probability threshold for taking a uniformly random action
    Returns the index of the chosen action.
    """
    if random.random() > eps:
      # Greedy branch: the forward pass is only needed here.
      state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
      self.policy_net.eval()
      with torch.no_grad():
        actions = self.policy_net(state)
      self.policy_net.train()
      return np.argmax(actions.cpu().data.numpy())
    return random.choice(np.arange(self.num_actions))

  def ddqn_learn(self, experiences, gamma):
    """
    Run one gradient step on the policy network and soft-update the target network.

    experiences: (states, actions, next_states, rewards, dones) as returned by
                 ReplayMemory.sample_exp
    gamma:       discount factor

    With self.ddqn the next action is chosen by the policy network and
    evaluated by the target network (Double DQN); otherwise the target
    network does both (vanilla DQN).
    """
    states, actions, next_states, rewards, dones = experiences

    # Targets are constants for the loss: computing them without autograd
    # avoids building a graph and accumulating unused grads on target_net.
    with torch.no_grad():
      selector_net = self.policy_net if self.ddqn else self.target_net
      _, next_actions = selector_net(next_states).max(dim = 1, keepdim = True)
      q_values_next = self.target_net(next_states).gather(dim = 1, index = next_actions)
      # (1 - dones) removes the bootstrap term for terminal transitions.
      target_q_values = rewards + gamma * q_values_next * (1 - dones)

    policy_q_values = self.policy_net(states).gather(1, actions)

    loss = F.mse_loss(policy_q_values, target_q_values)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # Soft update: target <- tau * policy + (1 - tau) * target
    with torch.no_grad():
      for target_param, policy_param in zip(self.target_net.parameters(), self.policy_net.parameters()):
        target_param.copy_(self.tau*policy_param + (1.0-self.tau)*target_param)
    

In [None]:
def train_ddqn(env, agent, num_episodes = 2000, max_steps = 1000, eps_start = 1.0, eps_end = 0.01, eps_decay = 0.995,
               target_score = 200.0, checkpoint_path = 'checkpoint.pth'):
  """
  Train `agent` on `env` with an epsilon-greedy policy.

  env:             environment whose reset() returns a state and whose step(action)
                   returns (next_state, reward, done, info)
  agent:           object providing select_action(state, eps),
                   execute_action(state, action, reward, next_state, done) and policy_net
  num_episodes:    maximum number of episodes to run
  max_steps:       maximum number of steps per episode
  eps_start:       initial epsilon
  eps_end:         lower bound for epsilon
  eps_decay:       factor applied to epsilon after every episode
  target_score:    average score over the last 100 episodes that ends training
  checkpoint_path: file the policy network's state_dict is saved to on success

  Returns the list of per-episode scores.
  """
  scores = []
  scores_window = deque(maxlen = 100)
  eps = eps_start

  for i in range(1, num_episodes+1):
    state = env.reset()
    current_score = 0
    for _ in range(max_steps):
      action = agent.select_action(state, eps)
      next_state, reward, done, _info = env.step(action)
      # store experience in memory & learn
      agent.execute_action(state, action, reward, next_state, done)

      state = next_state
      current_score += reward

      if done:
        break

    scores_window.append(current_score)
    scores.append(current_score)

    eps = max(eps_end, eps_decay*eps)

    average_score = np.mean(scores_window)
    # '\r' rewrites the progress line in place; every 100th episode is kept on its own line.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i, average_score), end="")
    if i % 100 == 0:
      print('\rEpisode {}\tAverage Score: {:.2f}'.format(i, average_score))

    # Only a full window counts: otherwise one lucky early episode would end
    # training and the episode count printed below would be negative.
    window_full = len(scores_window) == scores_window.maxlen
    if window_full and average_score >= target_score:
      print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i-scores_window.maxlen, average_score))
      torch.save(agent.policy_net.state_dict(), checkpoint_path)
      break

  return scores





# Train Agent on DQN

In [None]:
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("device: ", device)

# Build the environment used for training.
game = 'LunarLander-v2'
env = gym.make(game)
print("game: ", game)

# Network input size = length of an observation; output size = number of discrete actions.
num_actions = env.action_space.n
num_states = env.observation_space.shape[0]
print("States size:", num_states, "Actions size: ",  num_actions)


device:  cuda:0
game:  LunarLander-v2
States size: 8 Actions size:  4


In [None]:
# Agent hyperparameters
seed = 0
learning_rate = 0.0005
memory_capacity = 100_000
batch_size = 64
update_every = 4      # learn once every 4 environment steps
gamma = 0.99
tau = 0.001
ddqn = False #using DQN now

# Training-loop parameters
num_episodes = 2000
max_steps = 1000
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.995

agent = Agent(
    num_states=num_states,
    num_actions=num_actions,
    seed=seed,
    learning_rate=learning_rate,
    memory_capacity=memory_capacity,
    batch_size=batch_size,
    update_frequency=update_every,
    gamma=gamma,
    tau=tau,
    ddqn=ddqn,
)
scores = train_ddqn(
    env,
    agent,
    num_episodes=num_episodes,
    max_steps=max_steps,
    eps_start=eps_start,
    eps_end=eps_end,
    eps_decay=eps_decay,
)




Episode 100	Average Score: -168.92
Episode 200	Average Score: -112.55
Episode 300	Average Score: -32.44
Episode 400	Average Score: 70.39
Episode 500	Average Score: 164.18
Episode 600	Average Score: 142.66
Episode 700	Average Score: 156.91
Episode 800	Average Score: 162.22
Episode 900	Average Score: 176.59
Episode 963	Average Score: 201.44
Environment solved in 863 episodes!	Average Score: 201.44


In [None]:
def play_game(agent, render):
    """
    Play one greedy episode with agent.policy_net on the notebook-level `env`
    and print the total reward.

    agent:  object exposing a trained `policy_net`
    render: when truthy, render every step, then close `env` and call show_video()
    """
    done = False
    state = env.reset()
    reward_total = 0

    # Pure inference: no_grad avoids building an autograd graph on every step.
    with torch.no_grad():
        while not done:
            state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
            # Index of the largest Q-value for the single state in the batch.
            action = agent.policy_net(state_t).max(1)[1].item()

            state, reward, done, _ = env.step(action)
            reward_total += reward

            if render:
                env.render()

    if render:
        env.close()
        show_video()

    print(f'reward = {reward_total}')


def show_video():
  """
  Embed a recorded episode in the notebook output.

  Looks for .mp4 files under ./video, takes the first one in sorted order and
  displays it as a base64-encoded HTML <video> element. Prints a message when
  no recording exists.
  """
  # glob returns paths in arbitrary order; sorting makes the choice deterministic.
  mp4list = sorted(glob.glob('video/*.mp4'))
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode, closed as soon as the bytes are read.
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return `env` wrapped in gym's Monitor, targeting the ./video directory with force=True."""
  return Monitor(env, './video', force=True)

In [None]:

# Start a virtual display so the environment can render without a physical screen.
# Named virtual_display so it does not overwrite the `display` module imported
# from IPython earlier in the notebook.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
# Record the episode through the Monitor wrapper, then play it back inline.
env = wrap_env(gym.make('LunarLander-v2'))
play_game(agent, render=True)

reward = 285.9686929200702


# Train Agent on DDQN

In [None]:
#set up parameters for agent
seed = 0
learning_rate = 5e-4
memory_capacity = 100000
batch_size = 64
update_every = 4
gamma = 0.99 
tau = 0.001
ddqn = True #using DDQN now

#training params 
num_episodes = 2000
max_steps = 1000
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.995

# Train on a fresh, unwrapped environment: at this point `env` refers to the
# Monitor-wrapped instance that play_game() closed after recording the DQN episode.
env = gym.make(game)

agent = Agent(num_states, num_actions, seed, learning_rate, memory_capacity, batch_size, update_every, gamma, tau, 
              ddqn )
scores = train_ddqn(env, agent, num_episodes, max_steps , eps_start, eps_end , eps_decay)




Episode 100	Average Score: -191.20
Episode 200	Average Score: -117.45
Episode 300	Average Score: -60.10
Episode 400	Average Score: -12.22
Episode 459	Average Score: 45.14

KeyboardInterrupt: ignored

In [None]:

# Start a virtual display so the environment can render without a physical screen.
# Named virtual_display so it does not overwrite the `display` module imported
# from IPython earlier in the notebook.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
# Record one episode of the DDQN agent through the Monitor wrapper and play it back inline.
env = wrap_env(gym.make('LunarLander-v2'))
play_game(agent, render=True)