In [1]:
# Requirements
!python -m pip install pyvirtualdisplay
!apt-get install -y xvfb python-opengl ffmpeg 
!pip install gym 

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 59 not upgraded.


In [2]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import count
from collections import namedtuple
from pyvirtualdisplay import Display

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Model hyperparameters
LR = 0.0004   
TAU = 0.001             # Controls the interpolation weight used in the soft update of parameters       
GAMMA = 0.99            # Discount factor which is used as a weight for the next_state_values
UPDATE_RATE = 4         # Number of steps before updating the policy network

# Buffer Hyperparameters
BUFFER_SIZE = 100000    
BATCH_SIZE = 64         # Size of every random sample extracted from the buffer

# Parameters for epsilon greedy action selection 
EPS_START = 1
EPS_END = 0.01
EPS_DECAY = 0.996

# Use GPU if possible, else use CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Set parameter to False in case you want to store the model in your PC
# If it is True it will store the results in Google Drive
colab = True
if colab == True:
    from google.colab import drive

    # Mount Google Drive
    drive.mount('/content/drive')
    data_path = '/content/drive/My Drive/ReinforcementLearningOnVideoGames-master/data/models/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Defines Deep Q Network (3 fully connected layers)
class DQN(nn.Module):
  
    def __init__(self, state_size, action_size, fc1_size = 64, fc2_size = 64):       
        super(DQN, self).__init__()
        self.num_actions = action_size
        fc3_1_size = fc3_2_size = 32
        self.seed = torch.manual_seed(seed)
      
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3_1 = nn.Linear(fc2_size, fc3_1_size)
        self.fc4_1 = nn.Linear(fc3_1_size, 1)
        self.fc3_2 = nn.Linear(fc2_size, fc3_2_size)
        self.fc4_2 = nn.Linear(fc3_2_size, action_size)

    # Dueling DQN
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        val = F.relu(self.fc3_1(x))
        val = self.fc4_1(val)
        adv = F.relu(self.fc3_2(x))
        adv = self.fc4_2(adv)
        # Q(s,a) = V(s) + (A(s,a) - 1/|A| * sum A(s,a'))
        action = val + adv - adv.mean(1).unsqueeze(1).expand(state.size(0), self.num_actions)
        return action

In [4]:
# Class defining the agent.
class Agent():

    def __init__(self, state_size, n_actions):
        self.state_size = state_size
        self.n_actions = n_actions

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

        # Deep Q Networks
        self.policy_net = DQN(state_size, n_actions).to(device)
        self.target_net = DQN(state_size, n_actions).to(device)

        # Adam optimizer 
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LR)

        # Initialize counter for updating every UPDATE_RATE steps
        self.update_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.push(state, action, reward, next_state, done)
        
        # Learn every UPDATE_RATE time steps.
        self.update_step =  self.update_step + 1 
        if (self.update_step % UPDATE_RATE) == 0:
            # If enough samples are available in memory, get random sample and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn(self.memory.sample(), GAMMA)

    def learn(self, transitions, gamma):
        states ,actions, rewards, next_states, dones = transitions

        # Compute V(s_{t+1}) for all next states.  
        non_final_mask = ~dones
        next_state_values = self.target_net(next_states).max(1)[0].detach().reshape(BATCH_SIZE,1)
        next_state_values = next_state_values * non_final_mask
        
        # Compute Q(s_t, a)
        state_action_values  = self.policy_net(states).gather(1, actions).double()

        #Compute the expected Q values
        expected_state_action_values  = (rewards + (gamma * next_state_values)).double() 
        
        # Compute loss (MSE loss performs better than Huber loss in this case)
        #loss =  F.smooth_l1_loss(state_action_values, expected_state_action_values).double()
        loss = F.mse_loss(state_action_values, expected_state_action_values).double()

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target_model softly
        self.soft_update(self.policy_net, self.target_net, TAU) 
        
    def select_action(self, state, eps=0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Epsilon-greedy selection
        if random.random() > eps: 
            self.policy_net.eval()
            # Exploitation
            with torch.no_grad():
                return  self.policy_net(state).max(1)[1].view(1, 1)
            self.policy_net.train()
        else: 
            # Exploration
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    # Soft updating (Weighted Sum) on the target network parameters
    # This function was extracted from https://gist.github.com/abhinavsagar/a7cd1d67ad5a71589d576cd84f15b648 as
    # we read that it improved the performance but we didn't know how to copy the parameters to the network 
    def soft_update(self, local_model, target_model, tau):
      for target_param,local_param in zip(target_model.parameters(),local_model.parameters()):
        target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [5]:
Transition = namedtuple('Transition',('state', 'action', 'reward', 'next_state','done'))

class ReplayBuffer():

    def __init__(self, buffer_size, batch_size):
        self.capacity = buffer_size
        self.memory = []
        self.position = 0
    
    # Inserts a Transition the buffer
    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity
    
    # Picks randomly BATCH_SIZE elements
    def sample(self):
        batch = random.sample(self.memory, BATCH_SIZE)

        # Separate transitions so that computations using the retuned output are simpler to code
        states = torch.tensor([s.state for s in batch]).float().to(device)
        actions = torch.tensor([s.action for s in batch]).unsqueeze(1).to(device)
        rewards = torch.tensor([s.reward for s in batch]).unsqueeze(1).to(device)
        next_states = torch.tensor([s.next_state for s in batch]).float().to(device)
        dones =  torch.tensor([s.done for s in batch]).unsqueeze(1).to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [6]:
# Functions used to obtain the x-coordinates of the paddle and the ball
# To check how they work check Breakout_SL.ipynb
#Color of the ball and the paddle
color=[200,72,72]

# Checks if a position has the ball and paddle color.
def check_color(i,j,observation):
    for k in range(0,3):
        if observation[i][j][k]!=color[k]:
            return 0
    return 1

# Calculates the x-coordinate of the paddle (center).
def calculate_paddle(observation):
    exit=0
    i=192
    for j in range(10, 152):
        exit=check_color(i,j,observation)
        if(exit==1):
            exit2=0
            for w in range(j, j+10):
                exit2=check_color(i,w,observation)
                if(exit2!=1):
                    break
            if(exit2==1):
                break
    if exit2==0:
        return -1
    return j+12

# Calculates the x-coordinate of the ball in case the ball exists and it's near the paddle.
def calculate_ball(observation):
    exit=0
    for i in range(90,192):
        for j in range(10, 152):
            exit=check_color(i,j,observation)
            if(exit==1):
                break
        if(exit==1):
            break
    if exit==0:
        return -1
    return j+2

In [7]:
max_paddle_ball_distance = 140

def save_model(network,colab=True):
    print('\nUse the test function of the cell below so that you can see the model that has been saved')
    if colab == True:
        torch.save(network.state_dict(), data_path +'breakout_rl_model_final.ckpt')
    else:
        torch.save(network.state_dict(),'../data/models/breakout_rl_model_final.ckpt')

def train(n_episodes=3000,eps_start=EPS_START, eps_end=EPS_END, eps_decay=EPS_DECAY,colab=False):
    
    # Array containing the scores of each episode
    scores = []                        

    max_avg_score = 2
    output_net = DQN(2, 4)
    # Initialize epsilon
    eps = EPS_START                    
    for episode in range(1, n_episodes+1):
        score = 0
        action=1
        lives=6
        info=dict()
        info['ale.lives']=5
        state = env.reset()
        # New state computation
        state = np.array([calculate_paddle(state),calculate_ball(state)])
        for t in count():
            if(lives!=info['ale.lives']):
                action=torch.tensor([[1]], device=device, dtype=torch.long)
                lives=info['ale.lives']
            else:
                action = agent.select_action(state, eps)
            action1=agent.select_action(state, eps)
            next_state, reward, done, info = env.step(action.item())

            # New state
            next_state = np.array([calculate_paddle(next_state),calculate_ball(next_state)])

            # Despite using a new reward for updating correctly the parameters. We will use the old reward
            # (1 if a block is hit, 0 otherwise) to keep track of the real score.
            score += reward

            # New reward taking into account distance. Multiplied by scalar to differentiate more the rewards in different cases.
            # Checking if the coordinates found are correct and if they are compute new reward. It will return incorrect coordinates
            # when the balls go through already broken blocks.
            if next_state[0]!=-1 and next_state[1]!=-1:
                reward = 5*(1-(np.abs(next_state[0]-next_state[1])/max_paddle_ball_distance))

            agent.step(state, action1, reward, next_state, done)
            state = next_state
            
            if done:
                break 
            
        scores.append(score)   

        # Decrease epsilon          
        eps = max(EPS_END, EPS_DECAY*eps) 

        avg_score = np.mean(scores[-100:])

        # Update output network with best result parameters
        if max_avg_score < avg_score:
            max_avg_score = avg_score
            output_net.load_state_dict(agent.policy_net.state_dict())

        # Print mean of episodes every 25 episodes
        if episode % 5 == 0:
            print('Episode ',episode,'\tAverage Score: ',avg_score)

        #In case we don't finish the execution we have some models
        if episode % 50==0:
            torch.save(agent.policy_net.state_dict(), data_path +'breakout_'+str(episode)+'_rl_model.ckpt')

        # If we arrive to a consistent score of 7 we stop training
        if avg_score>=15:
            print('\nCongratulations!! The model has solved the environment in ', episode, 'episodes! :D\nThe average score in the last 100 episodes was ',avg_score,'\n')
            save_model(agent.policy_net,colab)
            return
    print('\nThe model has not been able to win in less than',n_episodes,' try changing the parameters or increasing the maximum number of episodes\nThe model of best episode with score: ',max_avg_score ,'will be stored')  
    save_model(output_net,colab)

In [8]:
import glob
import io
import base64
from gym.wrappers import Monitor
from IPython.display import HTML
import imageio

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display as ipythondisplay

plt.ion()

# Class used to display videos (Not done by us)
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [None]:
# Creating display object to properly show videos
display = Display(visible=0, size=(1400, 900))
display.start()

# We use wrap_env so that we can store the videos
env = wrap_env(gym.make('Breakout-v0'))

# Seed can be altered to modify randomness
# seed = 123 was the one used for testing
# by changing the seed you can obtain different results
seed = 123
env.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

agent = Agent(state_size=2, n_actions=4)
train(colab=colab)

Episode  5 	Average Score:  1.2
Episode  10 	Average Score:  1.1
Episode  15 	Average Score:  1.1333333333333333
Episode  20 	Average Score:  1.05
Episode  25 	Average Score:  1.12
Episode  30 	Average Score:  1.1333333333333333
Episode  35 	Average Score:  1.2
Episode  40 	Average Score:  1.2
Episode  45 	Average Score:  1.2444444444444445
Episode  50 	Average Score:  1.26
Episode  55 	Average Score:  1.2727272727272727
Episode  60 	Average Score:  1.3333333333333333
Episode  65 	Average Score:  1.353846153846154
Episode  70 	Average Score:  1.4
Episode  75 	Average Score:  1.44
Episode  80 	Average Score:  1.4375
Episode  85 	Average Score:  1.4588235294117646
Episode  90 	Average Score:  1.4333333333333333
Episode  95 	Average Score:  1.431578947368421
Episode  100 	Average Score:  1.44
Episode  105 	Average Score:  1.44
Episode  110 	Average Score:  1.43
Episode  115 	Average Score:  1.44
Episode  120 	Average Score:  1.46
Episode  125 	Average Score:  1.54
Episode  130 	Average Sc

KeyboardInterrupt: ignored

In [9]:
# Function used to test the model. It will play max_episodes times and show the game in a video 
def test(file_path,max_episodes = 1):
    test_agent = Agent(2,4)
    test_agent.policy_net.load_state_dict(torch.load(file_path))
    test_agent.policy_net.eval()
    
    for episode in range(max_episodes): 
        env = wrap_env(gym.make('Breakout-v0'))
        info=dict()
        info['ale.lives']=5
        lives=6
        state = env.reset()
        action=1
        state=np.array([calculate_paddle(state),calculate_ball(state)])
        for t in count():
            action = test_agent.select_action(state)
            if lives!=info['ale.lives']:
              lives=info['ale.lives']
              action=1
            elif action==0:
              action=1
            state, reward, done, info = env.step(action)  
            state=np.array([calculate_paddle(state),calculate_ball(state)])
            if done:     
                break
        env.close() 
        show_video() 

# Contains the path with the location of the stored model file
file_path = 'breakout_700_rl_model.ckpt'
if colab == True:
    # Folder of drive where file will be stored
    data_path = '/content/drive/My Drive/ReinforcementLearningOnVideoGames-master/data/models/'
    file_path = data_path + file_path

# Creating display object to properly show videos
display = Display(visible=0, size=(400, 300))
display.start()
plt.ion()

# Seed can be altered to modify randomness
seed = 123
torch.manual_seed(seed)
random.seed(seed)

test(file_path)