In [1]:
import malmo.minecraftbootstrap; malmo.minecraftbootstrap.launch_minecraft()
malmo.minecraftbootstrap.set_malmo_xsd_path()

Nothing is listening on port 10000 - will attempt to launch Minecraft from a new terminal.
Giving Minecraft some time to launch... 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ok
C:\Users\User\Desktop\Malmo\MalmoPlatform\Schemas


In [16]:
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import range
from builtins import object
import os
import sys
import time
import json
import random
import logging

# Microsoft Malmo API
import malmo.MalmoPython as MalmoPython

# Check Tkinter version on your pc.
if sys.version_info[0] == 2:  
    import Tkinter as tk
else:
    import tkinter as tk

In [17]:
# Define agent's action
agent_actions = ["move 1", "move -1", "turn 1", "turn -1"]

# Appoint value to alpha, epsilon, and gamma.
epsilon = 0.6
alpha = 0.1
gamma = 0.1

In [18]:
class QL_AEG(object):

    def __init__(self):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.training = True
        
        # Initialize the action count
        self.count = 0

        self.logger = logging.getLogger(__name__)
        if False:
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
        self.logger.handlers = []
        self.logger.addHandler(logging.StreamHandler(sys.stdout))

        self.actions = agent_actions
        self.q_table = {}
        self.canvas = None
        self.root = None
        
    def count_action(self):
        return self.count
    
    def resetcount_action(self):
        self.count = 0
        return self.count
    
    def loadModel(self, model_file):
        with open(model_file) as f:
            self.q_table = json.load(f)
            
    def training(self):
        self.training = True
        
    def evaluate(self):
        self.training = False
        
    def act(self, world_state, agent_host, current_r ):
        
        obs_text = world_state.observations[-1].text
        obs = json.loads(obs_text) # most recent observation
        self.logger.debug(obs)
        if not u'XPos' in obs or not u'ZPos' in obs:
            self.logger.error("Incomplete observation received: %s" % obs_text)
            return 0
        current_s = "%d:%d" % (int(obs[u'XPos']), int(obs[u'ZPos']))
        self.logger.debug("State: %s (x = %.2f, z = %.2f)" % (current_s, float(obs[u'XPos']), float(obs[u'ZPos'])))
        if current_s not in self.q_table:
            self.q_table[current_s] = ([0] * len(self.actions))

        # update Q values
        if self.training and self.prev_s is not None and self.prev_a is not None:
            old_q = self.q_table[self.prev_s][self.prev_a]
            self.q_table[self.prev_s][self.prev_a] = old_q + self.alpha * (current_r
                + self.gamma * max(self.q_table[current_s]) - old_q)

        self.drawQ( curr_x = int(obs[u'XPos']), curr_y = int(obs[u'ZPos']) )

        # select the next action
        rnd = random.random()
        if rnd < self.epsilon:
            a = random.randint(0, len(self.actions) - 1)
            self.actions[a]
        else:
            m = max(self.q_table[current_s])
            self.logger.debug("Current values: %s" % ",".join(str(x) for x in self.q_table[current_s]))
            l = list()
            for x in range(0, len(self.actions)):
                if self.q_table[current_s][x] == m:
                    l.append(x)
            y = random.randint(0, len(l)-1)
            a = l[y]
            self.actions[a]
        
        # try to send the selected action, only update prev_s if this succeeds
        try:
            agent_host.sendCommand(self.actions[a])
            self.prev_s = current_s
            self.prev_a = a

        except RuntimeError as e:
            self.logger.error("Failed to send command: %s" % e)
        
        self.count += 1
        return current_r

    def run(self, agent_host):
        total_reward = 0   
        self.prev_s = None
        self.prev_a = None  
        is_first_action = True
        
        # main loop:
        world_state = agent_host.getWorldState()
        while world_state.is_mission_running:

            current_r = 0       
            if is_first_action:
                # wait until have received a valid observation
                while True:
                    time.sleep(0.1)
                    world_state = agent_host.getWorldState()
                    for error in world_state.errors:
                        self.logger.error("Error: %s" % error.text)
                    for reward in world_state.rewards:
                        current_r += reward.getValue()
                    if world_state.is_mission_running and len(world_state.observations)>0 and not world_state.observations[-1].text=="{}":
                        total_reward += self.act(world_state, agent_host, current_r)
                        break
                    if not world_state.is_mission_running:
                        break
                is_first_action = False
            else:
                # wait for non-zero reward
                while world_state.is_mission_running and current_r == 0:
                    time.sleep(0.1)
                    world_state = agent_host.getWorldState()
                    for error in world_state.errors:
                        self.logger.error("Error: %s" % error.text)
                    for reward in world_state.rewards:
                        current_r += reward.getValue()
                # allow time to stabilise after action
                while True:
                    time.sleep(0.1)
                    world_state = agent_host.getWorldState()
                    for error in world_state.errors:
                        self.logger.error("Error: %s" % error.text)
                    for reward in world_state.rewards:
                        current_r += reward.getValue()
                    if world_state.is_mission_running and len(world_state.observations)>0 and not world_state.observations[-1].text=="{}":
                        total_reward += self.act(world_state, agent_host, current_r)
                        break
                    if not world_state.is_mission_running:
                        break

        # process final reward
        self.logger.debug("Final reward: %d" % current_r)
        total_reward += current_r

        # update Q values
        if self.training and self.prev_s is not None and self.prev_a is not None:
            old_q = self.q_table[self.prev_s][self.prev_a]
            self.q_table[self.prev_s][self.prev_a] = old_q + self.alpha * (current_r - old_q)
            
        self.drawQ()
    
        return total_reward
        
    def drawQ( self, curr_x=None, curr_y=None ):
        scale = 22
        world_x = 35
        world_y = 35
        if self.canvas is None or self.root is None:
            self.root = tk.Tk()
            self.root.wm_title("Q-table")
            self.canvas = tk.Canvas(self.root, width=world_x*scale, height=world_y*scale, borderwidth=0, highlightthickness=0, bg="black")
            self.canvas.grid()
            self.root.update()
        self.canvas.delete("all")
        action_inset = 0.1
        action_radius = 0.1
        curr_radius = 0.2
        action_positions = [ ( 0.5, action_inset ), ( 0.5, 1-action_inset ), ( action_inset, 0.5 ), ( 1-action_inset, 0.5 ) ]
        min_value = -20
        max_value = 20
        for x in range(world_x):
            for y in range(world_y):
                s = "%d:%d" % (x,y)
                self.canvas.create_rectangle( x*scale, y*scale, (x+1)*scale, (y+1)*scale, outline="#fff", fill="#000")
                for action in range(4):
                    if not s in self.q_table:
                        continue
                    value = self.q_table[s][action]
                    color = int( 255 * ( value - min_value ) / ( max_value - min_value ))
                    color = max( min( color, 255 ), 0 )
                    color_string = '#%02x%02x%02x' % (255-color, color, 0)
                    self.canvas.create_oval( (x + action_positions[action][0] - action_radius ) *scale,
                                             (y + action_positions[action][1] - action_radius ) *scale,
                                             (x + action_positions[action][0] + action_radius ) *scale,
                                             (y + action_positions[action][1] + action_radius ) *scale, 
                                             outline=color_string, fill=color_string )
        if curr_x is not None and curr_y is not None:
            self.canvas.create_oval( (curr_x + 0.5 - curr_radius ) * scale, 
                                     (curr_y + 0.5 - curr_radius ) * scale, 
                                     (curr_x + 0.5 + curr_radius ) * scale, 
                                     (curr_y + 0.5 + curr_radius ) * scale, 
                                     outline="#fff", fill="#fff" )
        self.root.update()


In [19]:
start_time_all_trials = time.time()
results_df = []
cumulative_rewards = []

agent = QL_AEG()
agent_host = MalmoPython.AgentHost()

try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print('ERROR:',e)
    print(agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print(agent_host.getUsage())
    exit(0)

mission_file = './maze2.xml'
with open(mission_file, 'r') as f:
    print("Loading mission from %s" % mission_file)
    mission_xml = f.read()
    my_mission = MalmoPython.MissionSpec(mission_xml, True)
    
agent_host.setObservationsPolicy(MalmoPython.ObservationsPolicy.LATEST_OBSERVATION_ONLY)
agent_host.setVideoPolicy(MalmoPython.VideoPolicy.LATEST_FRAME_ONLY)
my_mission_record = MalmoPython.MissionRecordSpec()
my_mission.requestVideo(800, 500)
my_mission.setViewpoint(0)

my_clients = MalmoPython.ClientPool()
my_clients.add(MalmoPython.ClientInfo('127.0.0.1', 10000)) # add Minecraft machines here as available
agentID = 0
expID = 'Q-Learning with Alpha, Epsilon, and Gamma.'

max_retries = 3

if agent_host.receivedArgument("test"):
    num_repeats = 1
else:
    num_repeats = 30

for i in range(num_repeats):
    
    print()
    print('Repeat %d of %d' % (i+1, num_repeats ))
    
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_clients, my_mission_record, agentID, "%s-%d" % (expID, i))
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:",e)
                exit(1)
            else:
                time.sleep(2.5)

    print("Waiting for the mission to start", end=' ')
    world_state = agent_host.getWorldState()
    while not world_state.has_mission_begun:
        
        print(".", end="")
        time.sleep(0.1)
        world_state = agent_host.getWorldState()
        for error in world_state.errors:
            print()   
    print()
    
    print("Mission started ")
    print("--------------------------------")
    # Run the program.
    start_time = time.time()
    cumulative_reward = agent.run(agent_host)
    print('Reward obtained:%d' % cumulative_reward)
    cumulative_rewards += [cumulative_reward]
    timeTaken = (time.time() - start_time)
    print("Total time trained:", "%s seconds" % timeTaken )
    print("Number of actions performed:", agent.count_action())
    result = [i, cumulative_reward, timeTaken, agent.count_action()]
    results_df.append(result)
    agent.resetcount_action()

print()
print("Done.")
print("Cumulative rewards for all %d runs:" % num_repeats)
print(cumulative_rewards)
print("Average reward:", sum(cumulative_rewards)/num_repeats)
print("Total time used for 30 trials:", "%s seconds" % (time.time() - start_time_all_trials))

ERROR: Caught std::exception: unrecognised option '-f'

Malmo version: 0.36.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Loading mission from ./maze2.xml

Repeat 1 of 30
Waiting for the mission to start .....
Mission started 
--------------------------------
Error: AgentHost::sendCommand : commands connection is not open. Is the mission running?
Reward obtained:-742
Total time trained: 300.6891858577728 seconds
Number of actions performed: 743

Repeat 2 of 30
Waiting for the mission to start .....
Mission started 
--------------------------------
Error: AgentHost::sendCommand : commands connection is not open. Is the mission running?
Reward obtained:-615
Total time trained: 300.71309542655945 seconds
Number of actions performed: 616

Repeat 3 of 30
Waiting for the mission to start .....
Mission started 
--------------------------------
Error: AgentHost::sendCommand : commands connection is not 

In [20]:
import pandas as pd
column = ['Number of attempt', 'Reward', 'Time taken', 'Number of actions']
results_df = pd.DataFrame(results_df, columns = column)
results_df

Unnamed: 0,Number of attempt,Reward,Time taken,Number of actions
0,0,-742.0,300.689186,743
1,1,-615.0,300.713095,616
2,2,-578.0,300.577612,579
3,3,-553.0,300.854693,554
4,4,-540.0,300.712221,541
5,5,11092.0,238.096387,409
6,6,-444.0,300.786708,444
7,7,9717.0,195.74472,284
8,8,583.0,300.679986,418
9,9,-417.0,300.983394,418


In [21]:
print("Current epsilon value:", epsilon)
print("Current alpha value:", alpha)
print("Current gamma value:", gamma)

Current epsilon value: 0.6
Current alpha value: 0.1
Current gamma value: 0.1


In [22]:
results_df.to_csv('E06A01G01_M2.csv', index = False)