In [1]:
# Requirements
!pip install Box2D
!python -m pip install pyvirtualdisplay
!pip install gym 
!apt-get install -y xvfb python-opengl ffmpeg 

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 59 not upgraded.


In [2]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import count
from collections import namedtuple
from pyvirtualdisplay import Display

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Model hyperparameters
LR = 0.000415  
TAU = 0.001             # Controls the interpolation weight used in the soft update of parameters       
GAMMA = 0.99            # Discount factor which is used as a weight for the next_state_values
UPDATE_RATE = 4         # Number of steps before updating the policy network

# Buffer Hyperparameters
BUFFER_SIZE = 100000    
BATCH_SIZE = 64         # Size of every random sample extracted from the buffer

# Parameters for epsilon greedy action selection 
EPS_START = 1
EPS_END = 0.01
EPS_DECAY = 0.996

# Use GPU if possible, else use CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Set parameter to False in case you want to store the model in your PC
# If it is True it will store the results in Google Drive

colab = True
if colab == True:
    from google.colab import drive

    # Mount Google Drive
    drive.mount('/content/drive')
    data_path = '/content/drive/My Drive/ReinforcementLearningOnVideoGames-master/data/models/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Defines Deep Q Network (3 fully connected layers)
class DQN(nn.Module):

    def __init__(self, state_size, action_size, fc1_size = 64, fc2_size = 64):       
        super(DQN, self).__init__()
        self.num_actions = action_size
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [4]:
# Class defining the agent.
class Agent():

    def __init__(self, state_size, n_actions):
        self.state_size = state_size
        self.n_actions = n_actions

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

        # Deep Q Networks
        self.policy_net = DQN(state_size, n_actions).to(device)
        self.target_net = DQN(state_size, n_actions).to(device)

        # Adam optimizer 
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LR)

        # Initialize counter for updating every UPDATE_RATE steps
        self.update_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.push(state, action, reward, next_state, done)
        
        # Learn every UPDATE_RATE time steps.
        self.update_step =  self.update_step + 1 
        if (self.update_step % UPDATE_RATE) == 0:
            # If enough samples are available in memory, get random sample and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn(self.memory.sample(), GAMMA)

    def learn(self, transitions, gamma):
        states ,actions, rewards, next_states, dones = transitions

        # Compute V(s_{t+1}) for all next states.  
        non_final_mask = ~dones
        next_state_values = self.target_net(next_states).max(1)[0].detach().reshape(BATCH_SIZE,1)
        next_state_values = next_state_values * non_final_mask
        
        # Compute Q(s_t, a)
        state_action_values  = self.policy_net(states).gather(1, actions).double()

        #Compute the expected Q values
        expected_state_action_values  = (rewards + (gamma * next_state_values)).double() 
        
        # Compute loss (MSE loss performs better than Huber loss in this case)
        loss =  F.mse_loss(state_action_values, expected_state_action_values).double()

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # Update target_model softly
        self.soft_update(self.policy_net, self.target_net, TAU) 
        
    def select_action(self, state, eps=0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Epsilon-greedy selection
        if random.random() > eps: 
            # Exploitation
            with torch.no_grad():
                return  self.policy_net(state).max(1)[1].view(1, 1)
        else: 
            # Exploration
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    # Soft updating (Weighted Sum) on the target network parameters
    # This function was extracted from https://gist.github.com/abhinavsagar/a7cd1d67ad5a71589d576cd84f15b648 as
    # we read that it improved the performance but we didn't know how to copy the parameters to the network 
    def soft_update(self, local_model, target_model, tau):
      for target_param,local_param in zip(target_model.parameters(),local_model.parameters()):
        target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [5]:
Transition = namedtuple('Transition',('state', 'action', 'reward', 'next_state','done'))

class ReplayBuffer():

    def __init__(self, buffer_size, batch_size):
        self.capacity = buffer_size
        self.memory = []
        self.position = 0
    
    # Inserts a Transition the buffer
    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity
    
    # Picks randomly BATCH_SIZE elements
    def sample(self):
        batch = random.sample(self.memory, BATCH_SIZE)

        # Separate transitions so that computations using the retuned output are simpler to code
        states = torch.tensor([s.state for s in batch]).float().to(device)
        actions = torch.tensor([s.action for s in batch]).unsqueeze(1).to(device)
        rewards = torch.tensor([s.reward for s in batch]).unsqueeze(1).to(device)
        next_states = torch.tensor([s.next_state for s in batch]).float().to(device)
        dones =  torch.tensor([s.done for s in batch]).unsqueeze(1).to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [6]:
# Saves the model to the corresponding folder and show how it plays
def save_model(network,colab=True):
    print('\nUse the test function of the cell below so that you can see the model that has been saved')
    if colab == True:
        torch.save(network.state_dict(), data_path +'Lunar_Lander_model.ckpt')
    else:
        torch.save(network.state_dict(),'../data/models/Lunar_Lander_model.ckpt')


def train(n_episodes=2000,eps_start=EPS_START, eps_end=EPS_END, eps_decay=EPS_DECAY,colab=False):
    
    # Array containing the scores of each episode
    scores = []                        

    # The model will save the network with highest avg (if it is higher than 150, considered a reasonably nice model)
    max_avg_score = 150
    output_net = DQN(state_size, n_actions)

    # Initialize epsilon
    eps = EPS_START                    
    for episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in count():
            action = agent.select_action(state, eps)
            next_state, reward, done, _ = env.step(action.item())
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break 

        scores.append(score)   

        # Decrease epsilon          
        eps = max(EPS_END, EPS_DECAY*eps) 

        avg_score = np.mean(scores[-100:])

        # Update output network with best result parameters
        if max_avg_score < avg_score:
            max_avg_score = avg_score
            output_net.load_state_dict(agent.policy_net.state_dict())

        # Print mean of episodes every 50 episodes
        if episode % 50 == 0:
            print('Episode',episode,'\tAverage Score: ', avg_score)

        # In case the game is won (230 avg score)
        if avg_score>=230:
            print('\nCongratulations!! The model has solved the environment in ', episode-100, 'episodes! :D\nThe average score in the last 100 episodes was ',avg_score,'\n')
            save_model(agent.policy_net,colab)
            return

    print('\nThe model has not been able to win in less than',n_episodes,' try changing the parameters or increasing the maximum number of episodes\nThe model of best episode with score: ',max_avg_score ,'will be stored')  
    save_model(output_net,colab)

In [7]:
import glob
import io
import base64
from gym.wrappers import Monitor
from IPython.display import HTML
import imageio

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display as ipythondisplay

plt.ion()

# Class used to display videos (Not done by us)
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    
# Class used to wrap the enviroment with a Monitor wrapper (Not done by us)
def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [None]:
# Creating display object to properly show videos
display = Display(visible=0, size=(1400, 900))
display.start()
plt.ion()

# We use wrap_env so that we can store the videos
env = wrap_env(gym.make('LunarLander-v2'))

# Seed can be altered to modify randomness
seed = 6
env.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

# Obtain state size and action space size for the enviroment
state_size = env.observation_space.shape[0]
n_actions = env.action_space.n

agent = Agent(state_size, n_actions)
train(colab=colab)

Episode 50 	Average Score:  -169.49880810816504
Episode 100 	Average Score:  -157.57282134855453
Episode 150 	Average Score:  -125.78270984797791
Episode 200 	Average Score:  -104.79739984836311
Episode 250 	Average Score:  -86.53533128098672
Episode 300 	Average Score:  -30.451326907493097
Episode 350 	Average Score:  37.86775371858639
Episode 400 	Average Score:  63.03655557214421
Episode 450 	Average Score:  84.9859542276604
Episode 500 	Average Score:  117.90410407243061
Episode 550 	Average Score:  136.74965408398543
Episode 600 	Average Score:  156.12769752790314
Episode 650 	Average Score:  173.94133047850062
Episode 700 	Average Score:  178.47525431777404
Episode 750 	Average Score:  176.94211810216547
Episode 800 	Average Score:  191.2990308260622
Episode 850 	Average Score:  213.84271309016356
Episode 900 	Average Score:  227.15293047086408

Congratulations!! The model has solved the environment in  805 episodes! :D
The average score in the last 100 episodes was  230.03455108

In [8]:
# Function used to test the model. It will play max_episodes times and show the game in video.
def test(file_path,max_episodes = 1):
    test_agent = Agent(state_size=8, n_actions=4)
    test_agent.policy_net.load_state_dict(torch.load(file_path))
    test_agent.policy_net.eval()
    for episode in range(max_episodes): 
        env = wrap_env(gym.make('LunarLander-v2'))
        
        state = env.reset()
        for t in count():
            action = test_agent.select_action(state)
            next_state, reward, done, _ = env.step(action.item())
            state = next_state     
            if done:     
                break
        env.close() 
        show_video() 

# Creating display object to properly show videos
display = Display(visible=0, size=(1400, 900))
display.start()
plt.ion()

# Contains the path with the location of the stored model file
file_path = '../data/models/Lunar_Lander_model.ckpt'
if colab == True:
    from google.colab import drive

    # Mount Google Drive
    drive.mount('/content/drive')
    # Folder of drive where file will be stored
    data_path = '/content/drive/My Drive/ReinforcementLearningOnVideoGames-master/data/models/' 
    file_path = data_path + 'Lunar_Lander_model.ckpt'

test(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
