In [1]:
from ast import literal_eval
import ray
import gym
import retro
import os
import numpy as np
import matplotlib.pyplot as plt
from markov import sampleMarkov, createMarkov, randMarkov
from support import getInitial, verifyTrajectory, install_games_from_rom_dir, frameToCell, action_set, trajectoryToGif
import time



In [2]:
# imageio's FreeImage plugin relies on a native library that has to be fetched
# separately; download() retrieves it so the plugin is usable later on.
# NOTE(review): presumably needed by trajectoryToGif for GIF export -- confirm in support.py.
import imageio
imageio.plugins.freeimage.download()

In [3]:
# Start the Ray runtime used by the master actor and the workers defined below.
# The dict shown in the cell output is the connection info returned by init().
ray.init()

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-01-23_00-10-20_2192/logs.
Waiting for redis server at 127.0.0.1:46177 to respond...
Waiting for redis server at 127.0.0.1:10050 to respond...
Starting Redis shard with 10.0 GB max memory.
Starting the Plasma object store with 12.785916313 GB memory using /tmp.

View the web UI at http://localhost:8888/notebooks/ray_ui.ipynb?token=c43331ca232bb127a9007cdf2aa763cc09a8e9dc596ccedd



{'node_ip_address': None,
 'redis_address': '172.17.0.6:46177',
 'object_store_address': '/tmp/ray/session_2019-01-23_00-10-20_2192/sockets/plasma_store',
 'webui_url': 'http://localhost:8888/notebooks/ray_ui.ipynb?token=c43331ca232bb127a9007cdf2aa763cc09a8e9dc596ccedd',
 'raylet_socket_name': '/tmp/ray/session_2019-01-23_00-10-20_2192/sockets/raylet'}

In [None]:
def winCondition(cell, info):
    """Return True once info['level_end_bonus'] is non-zero.

    The notebook treats a non-zero end-of-level bonus as "the level was won".
    `cell` is accepted so the signature lines up with stopCondition, but it
    is not consulted.
    """
    bonus = info['level_end_bonus']
    return bonus != 0

def stopCondition(cell, info, step, maxSteps=500, startLives=3):
    """Decide whether the current exploration rollout should end.

    A rollout stops as soon as any of the following holds (checked in this
    order, so `info` is not touched once the step budget is exhausted):

    * `step` exceeds `maxSteps`;
    * winCondition(cell, info) is true;
    * info['lives'] differs from `startLives`.

    Parameters
    ----------
    cell, info : passed through to winCondition; `info` must also carry 'lives'.
    step : number of steps taken so far in this rollout.
    maxSteps : step budget per rollout (default 500, the previous hard-coded value).
    startLives : life count that means "no life lost yet" (default 3, the
        previous hard-coded value).
    """
    if step > maxSteps:
        return True
    if winCondition(cell, info):
        return True
    return info['lives'] != startLives


In [4]:
@ray.remote
class MasterActor(object):
    """Ray actor that owns the shared exploration archive.

    For every known cell it keeps the shortest trajectory that reached it
    (fitness = trajectory length, lower is better), the emulator state saved
    at that point, and a sampling weight used by pullGo. It also tracks the
    shortest winning trajectory seen so far and the current policy dict.
    """

    def __init__(self,
                 initialPolicy,
                 initialCell,
                 initialFitness,
                 initialTrajectory,
                 initialState):
        """Seed the archive with a single starting cell and the initial policy."""
        self.best_trajectory = None
        self.best_fitness = None
        self.policy = initialPolicy
        # `cells` preserves discovery order (updatePolicy relies on cells[-1]);
        # the dicts below are all keyed by cell.
        self.cells = [initialCell]
        self.fitnesses = {initialCell:initialFitness}
        self.cell_prob = {initialCell:1}
        self.trajectories = {initialCell:initialTrajectory}
        self.states = {initialCell:initialState}

    def pushResult(self, cell, trajectory, state, info, step):
        """Merge one worker observation into the archive.

        `step` is accepted for interface compatibility and is not used.
        """
        fitness = len(trajectory)

        # Handle wins before anything else. Previously the win check lived in
        # the "unknown cell" branch only, so a winning frame whose cell was
        # already archived was silently discarded. Winning frames are never
        # stored as restart cells.
        if winCondition(cell, info):
            if self.best_trajectory is None or fitness < self.best_fitness:
                self.best_trajectory = trajectory
                self.best_fitness = fitness
            return

        # Dict lookup instead of scanning the `cells` list on every push.
        if cell in self.fitnesses:
            if fitness < self.fitnesses[cell]:
                # Improvement to existing cell
                self.fitnesses[cell] = fitness
                self.trajectories[cell] = trajectory
                self.states[cell] = state
                self.cell_prob[cell] += 1
        else:
            # First time to this new cell
            self.cells.append(cell)
            self.fitnesses[cell] = fitness
            self.trajectories[cell] = trajectory
            self.states[cell] = state
            self.cell_prob[cell] = 10

    def pullCache(self):
        """Return (policy, cells, fitnesses) for workers to cache locally."""
        return (self.policy, self.cells, self.fitnesses)

    def pullCell(self, cell):
        """Return (state, trajectory) stored for `cell`; KeyError if unknown."""
        return (self.states[cell], self.trajectories[cell])

    def pullBestTrajectory(self):
        """Return the shortest winning trajectory so far, or None."""
        return self.best_trajectory

    def renormalizeCellProbs(self):
        """Add a constant pad to every cell weight and rescale them to sum to 1.

        NOTE(review): the rescaled values are written back into cell_prob,
        while pushResult keeps adding raw bumps (+1 / 10) on top of them, and
        the pad is re-applied on every call -- confirm this mix is intended.
        """
        padd = .1
        probsSum = np.array([self.cell_prob[c] for c in self.cells]).sum() + len(self.cells)*padd
        for cell in self.cells:
            self.cell_prob[cell] = (self.cell_prob[cell]+padd)/probsSum

    def updatePolicy(self):
        """Refit the policy weights from the best win, or else from the newest cell.

        NOTE(review): createMarkov is called with a second argument (12) in
        one branch and without it in the other -- confirm the default matches.
        """
        if self.best_trajectory is None:
            self.policy['weights'] = createMarkov(self.trajectories[self.cells[-1]],12)
        else:
            self.policy['weights'] = createMarkov(self.best_trajectory)

    def pullGo(self):
        """Sample a cell according to cell_prob and return its (state, trajectory)."""
        weights = np.array([self.cell_prob[c] for c in self.cells])
        # Normalise here as well: weights may have been bumped since the last
        # renormalizeCellProbs call and np.random.choice needs them to sum to 1.
        weights = weights/weights.sum()

        goCell = self.cells[np.random.choice(np.arange(len(self.cells)), p=weights)]

        return (self.states[goCell], self.trajectories[goCell])

In [5]:

@ray.remote
def GoExploreWorker(game, master):
    """Endless exploration loop executed as a Ray task.

    Repeatedly: pull the master's (policy, cells, fitnesses) snapshot, then
    run 10 rollouts. Each rollout restores a state chosen by master.pullGo,
    steps the emulator with actions drawn from the policy, and reports every
    new cell, every shorter route to a known cell, and every win back to the
    master via pushResult. A rollout ends when stopCondition is true.

    Parameters
    ----------
    game : game name passed to retro.make.
    master : MasterActor handle.

    Raises
    ------
    ValueError : if policy['type'] is neither 'random' nor 'markov'.
    """
    numActions = 12  # size of the action index space used throughout the notebook
    env = retro.make(game)
    try:
        env.reset()
        while True:
            # `fitnesses` is keyed by cell, so it doubles as the membership
            # index; this avoids an O(n) scan of the cells list on every step.
            policy, _cells, fitnesses = ray.get(master.pullCache.remote())
            policyType = policy['type']
            if policyType not in ('random', 'markov'):
                # Fail with a clear message instead of indexing action_set with None.
                raise ValueError('Unsupported policy type: %r' % (policyType,))

            for _ in range(10):
                state, trajectory = ray.get(master.pullGo.remote())

                recurrent_state = None
                if policyType == 'markov':
                    recurrent_state = np.random.randint(numActions)

                env.em.set_state(state)
                step = 0
                while True:
                    if policyType == 'random':
                        action = np.random.randint(numActions)
                    else:
                        action = sampleMarkov(recurrent_state, policy['weights'])
                        recurrent_state = action

                    observation, reward, done, info = env.step(action_set[action])
                    trajectory.append(action)
                    cell = frameToCell(observation, info)
                    fitness = len(trajectory)
                    state = env.em.get_state()

                    newOrBetter = cell not in fitnesses or fitness < fitnesses[cell]
                    # Always forward wins: a winning frame can land in a cell
                    # that is already cached, and would otherwise never be
                    # reported to the master.
                    if newOrBetter or winCondition(cell, info):
                        master.pushResult.remote(cell, trajectory.copy(), state, info, step)
                    if newOrBetter:
                        # Keep the local snapshot current so the same
                        # improvement is not pushed again on later steps.
                        fitnesses[cell] = fitness

                    if stopCondition(cell, info, step):
                        break
                    step += 1
    finally:
        # Release the emulator if the loop dies, so a reused worker process
        # is not left holding a live instance.
        env.close()

In [6]:
# Register the ROMs found under roms/ before any environment is created.
install_games_from_rom_dir('roms/')

# Game and start state used for every rollout and GIF in this notebook.
game = 'SonicTheHedgehog-Genesis'
stateStr = 'GreenHillZone.Act1.state'

# Policy dict consumed by the workers: 'type' selects the sampler, 'weights'
# is what sampleMarkov receives.
# NOTE(review): meaning of randMarkov's arguments (10, 12) is not visible here -- confirm in markov.py.
initialPolicy = {'type':'markov', 'weights':randMarkov(10,12)}

# Seed values for the archive (order of the returned tuple differs from the
# MasterActor constructor order below).
initialCell, initialState, initialTrajectory, initialFitness = getInitial(game, stateStr)

# Number of parallel GoExploreWorker tasks to launch.
NWorkers = 8

# The actor must exist before the workers start, since each worker receives its handle.
master = MasterActor.remote(initialPolicy, initialCell, initialFitness, initialTrajectory, initialState)
workers = [ GoExploreWorker.remote(game, master) for _ in range(NWorkers)]    

Importing SonicTheHedgehog-Genesis
Imported 1 games


In [7]:
#time.sleep(10)
#policy, cells, fitnesses = ray.get(master.pullCache.remote())
#test_cell = cells[-1]
#state, trajectory = ray.get(master.pullCell.remote(test_cell))
#verifyTrajectory(game, stateStr, trajectory, state)

In [None]:
# Driver loop: once per second ask the master to renormalise its sampling
# weights and refit the policy; periodically print progress and export a GIF.
# Runs until the cell is interrupted.
REPORT_EVERY = 10   # iterations between status lines
GIF_EVERY = 200     # iterations between GIF exports; only checked on report iterations

start_time = time.time()
i = 0
while True:
    time.sleep(1)
    # Fire-and-forget: the actor processes these calls in order.
    master.renormalizeCellProbs.remote()
    master.updatePolicy.remote()

    if i % REPORT_EVERY == 0:
        policy, cells, fitnesses = ray.get(master.pullCache.remote())
        best_trajectory = ray.get(master.pullBestTrajectory.remote())
        exportGif = i % GIF_EVERY == 0

        message = 'Time elapsed: ' + str(time.time()-start_time)
        message += ', Cells: ' + str(len(cells))

        if best_trajectory is not None:
            message += ', Best trajectory length: ' + str(len(best_trajectory))
            if exportGif:
                trajectoryToGif(game, stateStr, best_trajectory, True,
                                'Gameplay_FIN_'+str(len(best_trajectory))+'.gif')
        elif exportGif:
            # No win yet: render the cell whose first component is largest.
            # Cells are strings that literal_eval turns into indexable tuples.
            # NOTE(review): component 0 is presumably the x position -- confirm in frameToCell.
            cell = max(cells, key=lambda name: literal_eval(name)[0])
            state, trajectory = ray.get(master.pullCell.remote(cell))
            c = literal_eval(cell)
            suffix = '-'.join(str(c[k]) for k in range(4))
            trajectoryToGif(game, stateStr, trajectory, True,
                            'Gameplay_FR_'+str(len(trajectory))+'-'+suffix+'.gif')
        print(message)

    i += 1

Time elapsed: 3.003373384475708, Cells: 1
Time elapsed: 13.310152769088745, Cells: 148
Time elapsed: 24.958292722702026, Cells: 149
Time elapsed: 34.97505259513855, Cells: 152
Time elapsed: 44.99101161956787, Cells: 152
Time elapsed: 55.00776815414429, Cells: 152
Time elapsed: 65.02482318878174, Cells: 152
Time elapsed: 75.04122018814087, Cells: 152
Time elapsed: 85.40085768699646, Cells: 156
Time elapsed: 95.41773915290833, Cells: 156
Time elapsed: 105.43607258796692, Cells: 156
Time elapsed: 115.45266962051392, Cells: 156
Time elapsed: 125.46985077857971, Cells: 156
Time elapsed: 135.48710083961487, Cells: 165
Time elapsed: 145.50497698783875, Cells: 166
Time elapsed: 155.78705859184265, Cells: 179
Time elapsed: 165.80695581436157, Cells: 186
Time elapsed: 175.97297406196594, Cells: 196
Time elapsed: 185.99217057228088, Cells: 264
Time elapsed: 196.01325035095215, Cells: 502
Time elapsed: 206.03539156913757, Cells: 741
Time elapsed: 238.7682502269745, Cells: 757
Time elapsed: 248.962