In [1]:
import numpy as np
import pandas as pd
from ple import PLE
from ple.games.catcher import Catcher
import matplotlib.pyplot as plt
import cv2

import torch
import torch.nn as nn

couldn't import doomish
Couldn't import doom


In [12]:
# Layer sizes of the two-layer network: w1 maps dim_in -> dim_hidden and
# w2 maps dim_hidden -> dim_out.
dim_in, dim_hidden, dim_out = 4, 4, 3

# Draw the initial parameters from a standard normal distribution.
# The draw order (w1, w2, b1, b2) is kept so a seeded run yields the same values.
w1_value = np.random.randn(dim_in, dim_hidden)
w2_value = np.random.randn(dim_hidden, dim_out)
b1_value = np.random.randn(dim_hidden)
b2_value = np.random.randn(dim_out)

# Wrap each numpy array as a float32 PyTorch tensor that tracks gradients.
w1, w2, b1, b2 = (
    torch.tensor(values, dtype=torch.float32, requires_grad=True)
    for values in (w1_value, w2_value, b1_value, b2_value)
)

In [14]:
def update_weights(logs, learning_rate=0.25):
    """Run one gradient-descent step on the global network parameters.

    The network is ``tanh(x @ w1 + b1) @ w2 + b2``. The regression target is
    the network's own prediction with, for every logged row, the entry of the
    action that was actually taken replaced by the observed reward. Only the
    taken action therefore contributes to the MSE loss.

    Parameters
    ----------
    logs : pandas.DataFrame
        Must provide the columns 'player_x', 'player_vel', 'fruit_x',
        'fruit_y', 'action' and 'reward'. Every value in 'action' must be
        present in ``p.getActionSet()``.
    learning_rate : float, optional
        Step size of the plain gradient-descent update (default 0.25).

    Returns
    -------
    None
        The globals ``w1``, ``w2``, ``b1`` and ``b2`` are updated in place
        and their gradients are reset to zero.

    Raises
    ------
    ValueError
        If an action in ``logs`` is not part of the action set.
    """
    # An empty batch would produce a NaN loss and poison the weights.
    if len(logs) == 0:
        return

    action_set = p.getActionSet()
    criterion = nn.MSELoss()

    # reshape data
    x_value = np.array(logs[['player_x', 'player_vel', 'fruit_x', 'fruit_y']])
    x = torch.from_numpy(x_value).type(torch.FloatTensor)

    # forward pass
    x_hidden_act = torch.tanh(x.mm(w1) + b1)
    y_pred = x_hidden_act.mm(w2) + b2

    # compute y: a detached copy of the prediction, so untaken actions add
    # zero error, with the taken action's entry replaced by its reward.
    # (torch.tensor(y_pred) copy-constructs from a tensor and triggers a
    # UserWarning; detach().clone() is the supported spelling.)
    actions = np.array(logs['action'])
    y = y_pred.detach().clone()
    row_index = torch.arange(len(actions))
    col_index = torch.tensor(
        [action_set.index(action) for action in actions], dtype=torch.long
    )
    rewards = torch.from_numpy(logs['reward'].to_numpy(dtype=np.float32))
    y[row_index, col_index] = rewards

    # compute loss
    loss = criterion(y_pred, y)

    # backprop
    loss.backward()

    # Update the leaf tensors in place without recording the update in the
    # autograd graph, then clear the accumulated gradients for the next call.
    with torch.no_grad():
        for param in (w1, w2, b1, b2):
            param -= learning_rate * param.grad
            param.grad.zero_()

    return

In [4]:
# Create the Catcher game on a 100x100 playfield with a single life and wrap
# it in a PLE environment (the settings are passed through to PLE unchanged).
width, height = 100, 100
game = Catcher(width, height, init_lives=1)
p = PLE(
    game,
    fps=30,
    frame_skip=3,
    num_steps=1,
    force_fps=True,
    display_screen=False,
)
p.init()

In [9]:
class Agent():
    """Epsilon-greedy agent driven by a two-layer tanh network.

    Parameters
    ----------
    actions : sequence
        Available actions; the network output at position ``i`` is treated as
        the q-value of ``actions[i]``.
    feature_keys : sequence of str, optional
        Keys read from the state dict, in the order the network expects them.
        The default matches the column order used when the weights are
        trained in ``update_weights``.
    """

    def __init__(self, actions,
                 feature_keys=('player_x', 'player_vel', 'fruit_x', 'fruit_y')):
        self.actions = actions
        self.feature_keys = tuple(feature_keys)
        # Not used by pick_action; kept so existing code that reads
        # `agent.softmax` keeps working.
        self.softmax = nn.Softmax(dim=1)

    def pick_action(self, state, w1, b1, w2, b2, epsilon):
        """Choose an action for ``state``.

        With probability ``epsilon`` a uniformly random action is returned
        (exploration); otherwise the action with the highest predicted
        q-value is returned.

        Parameters
        ----------
        state : dict
            Must contain every key in ``self.feature_keys``; additional keys
            are ignored.
        w1, b1, w2, b2 : torch.Tensor
            Network parameters: ``tanh(x @ w1 + b1) @ w2 + b2``.
        epsilon : float
            Exploration probability.

        Returns
        -------
        One element of ``self.actions``.

        Raises
        ------
        KeyError
            If ``state`` lacks one of the feature keys.
        """
        rand = np.random.rand()
        if rand < epsilon:
            # exploration
            action = self.actions[np.random.randint(0, len(self.actions))]
        else:
            # Select features by key rather than by dict order so the input
            # layout always matches the one the weights were trained on.
            features = [[float(state[key]) for key in self.feature_keys]]
            x = torch.tensor(features, dtype=torch.float32)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                x_hidden_act = torch.tanh(x.mm(w1) + b1)
                y_pred = x_hidden_act.mm(w2) + b2
            # choose action with highest q-value
            action_index = int(np.argmax(y_pred.detach().numpy().ravel()))
            action = self.actions[action_index]

        return action

In [7]:
def play_batch(epsilon=0.5, n_episodes=20, n_timestamps=200):
    """Play a batch of episodes and log every transition.

    Uses the global environment ``p`` / ``game`` and the global network
    parameters ``w1``, ``b1``, ``w2``, ``b2``. A new ``Agent`` is created for
    each call.

    Parameters
    ----------
    epsilon : float, optional
        Exploration probability handed to ``Agent.pick_action``.
    n_episodes : int, optional
        Number of episodes to play (default 20).
    n_timestamps : int, optional
        Maximum number of steps per episode (default 200); an episode ends
        earlier when ``p.game_over()`` becomes true.

    Returns
    -------
    pandas.DataFrame
        One row per step: the game-state entries followed by 'episode',
        'action' and 'reward'. Rows carry a unique 0..n-1 index (previously
        every row had index label 0).
    """
    agent = Agent(p.getActionSet())

    # Collect plain dicts and build the frame once; concatenating inside the
    # loop copies the whole log on every step.
    records = []

    for episode_i in range(n_episodes):
        p.reset_game()

        for _ in range(n_timestamps):
            state = game.getGameState()
            action = agent.pick_action(state, w1, b1, w2, b2, epsilon)
            reward = p.act(action)

            # Copy instead of mutating the dict returned by the game.
            records.append(
                dict(state, episode=episode_i, action=action, reward=reward)
            )

            if p.game_over():
                break

    return pd.DataFrame(records)
                

In [15]:
# Training loop: play a batch with the current exploration rate, report the
# mean per-episode reward, take one gradient step, and decay epsilon
# periodically.
n_epochs = 500
decay_every = 10
epsilon_decay = 0.95

epsilon = 0.5
for i in range(n_epochs):
    # Pass epsilon explicitly; calling play_batch() with no argument always
    # used its default of 0.5, so the decay below had no effect.
    logs = play_batch(epsilon)
    print('Epoch %3d: mean reward: %.3f' % (i+1, logs['reward'].sum()/logs['episode'].nunique()))
    update_weights(logs)
    if (i+1) % decay_every == 0:
        epsilon *= epsilon_decay
        print('Epsilon changed to %.4f' % epsilon)


Epoch   1: mean reward: -5.700
Epoch   2: mean reward: -5.850
Epoch   3: mean reward: -5.700
Epoch   4: mean reward: -5.550
Epoch   5: mean reward: -5.600
Epoch   6: mean reward: -5.550
Epoch   7: mean reward: -5.650
Epoch   8: mean reward: -5.700
Epoch   9: mean reward: -5.550
Epoch  10: mean reward: -5.550
Epsilon changed to 0.4750
Epoch  11: mean reward: -5.550
Epoch  12: mean reward: -5.550
Epoch  13: mean reward: -5.650
Epoch  14: mean reward: -5.200
Epoch  15: mean reward: -5.350
Epoch  16: mean reward: -5.400
Epoch  17: mean reward: -5.600
Epoch  18: mean reward: -5.550
Epoch  19: mean reward: -5.750
Epoch  20: mean reward: -5.600
Epsilon changed to 0.4512
Epoch  21: mean reward: -5.600
Epoch  22: mean reward: -5.850
Epoch  23: mean reward: -5.500
Epoch  24: mean reward: -5.400
Epoch  25: mean reward: -5.650
Epoch  26: mean reward: -5.700
Epoch  27: mean reward: -5.700
Epoch  28: mean reward: -5.500
Epoch  29: mean reward: -5.600
Epoch  30: mean reward: -5.800
Epsilon changed to

KeyboardInterrupt: 