In [14]:
import malmo.minecraftbootstrap; malmo.minecraftbootstrap.launch_minecraft()
malmo.minecraftbootstrap.set_malmo_xsd_path()

Nothing is listening on port 10000 - will attempt to launch Minecraft from a new terminal.
Giving Minecraft some time to launch... 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ok
C:\Users\User\Desktop\Malmo\MalmoPlatform\Schemas


In [20]:
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import range
import sys
import time
import json
import math
import random
import numpy as np
import pandas as pd

# Microsoft Malmo API
import malmo.MalmoPython as MalmoPython

# Check Tkinter version on your pc.
if sys.version_info[0] == 2:  
    import Tkinter as tk
else:
    import tkinter as tk

from PIL import Image
from itertools import count
from collections import namedtuple

# Neural network API
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [21]:
canvas = None
root = None
q_table = {}
Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class DQN(nn.Module):

    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 16, kernel_size=3, stride=1)
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = torch.nn.Conv2d(16, 8, kernel_size=3, stride=1)
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        
        def conv2d_size_out(size, kernel_size, stride):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(conv2d_size_out(w,3,1),2,2),3,1),2,2)
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(conv2d_size_out(h,3,1),2,2),3,1),2,2)
        linear_input_size = convw * convh * 8
        self.fc1 = torch.nn.Linear(linear_input_size, 64)
        self.head = nn.Linear(64, outputs)
        
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = x.view(-1, 512)
        x = F.relu(self.fc1(x))
        x = self.head(x)
        return x

def get_screen(world_state):
    frame = world_state.video_frames[0] 
    image = Image.frombytes('RGB', (frame.width, frame.height), bytes(frame.pixels) )
    image = np.asarray(image)
    screen = image.transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) /255
    screen = torch.from_numpy(screen)
    resize = T.Compose([T.ToPILImage(),
                    T.Resize((40, 40)),
                    T.ToTensor()])
    return resize(screen).unsqueeze(0)

def optimize_model():
    #For Batch Calculations
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), dtype=torch.uint8)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    
def select_action(state):
    global steps_done
    sample = random.random()
    if steps_done > 764:
        eps_threshold = 0
    else:
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        action = torch.tensor([[random.randrange(n_actions)]], dtype=torch.long)
        return action

def move(A):
    #moves given Action A
    if A == 0: #forward
        agent_host.sendCommand("move 1")
    elif A == 1: #backward
        agent_host.sendCommand("move -1")
    elif A == 2: #left
        agent_host.sendCommand("turn 1")
    elif A == 3: #right
        agent_host.sendCommand("turn -1")

def drawQ(curr_x=None, curr_y=None):
    global canvas
    global root
    
    scale = 22
    world_x = 35
    world_y = 35
    if canvas is None or root is None:
        root = tk.Tk()
        root.wm_title("Q-table")
        canvas = tk.Canvas(root, width=world_x*scale, height=world_y*scale, borderwidth=0, highlightthickness=0, bg="black")
        canvas.grid()
        root.update()
    canvas.delete("all")
    action_inset = 0.1
    action_radius = 0.1
    curr_radius = 0.2
    action_positions = [ ( 0.5, action_inset ), ( 0.5, 1-action_inset ), ( action_inset, 0.5 ), ( 1-action_inset, 0.5 ) ]
    min_value = -20
    max_value = 20
    for x in range(world_x):
        for y in range(world_y):
            s = "%d:%d" % (x,y)
            canvas.create_rectangle( x*scale, y*scale, (x+1)*scale, (y+1)*scale, outline="#fff", fill="#000")
            for action in range(4):
                if not s in q_table:
                    continue
                value = q_table[s][action]
                color = int( 255 * ( value - min_value ) / ( max_value - min_value ))
                color = max( min( color, 255 ), 0 )
                color_string = '#ff6425'
                canvas.create_oval( (x + action_positions[action][0] - action_radius ) *scale,
                                         (y + action_positions[action][1] - action_radius ) *scale,
                                         (x + action_positions[action][0] + action_radius ) *scale,
                                         (y + action_positions[action][1] + action_radius ) *scale, 
                                         outline=color_string, fill=color_string )
    if curr_x is not None and curr_y is not None:
        canvas.create_oval( (curr_x + 0.5 - curr_radius ) * scale, 
                                 (curr_y + 0.5 - curr_radius ) * scale, 
                                 (curr_x + 0.5 + curr_radius ) * scale, 
                                 (curr_y + 0.5 + curr_radius ) * scale, 
                                 outline="#fff", fill="#fff" )
    root.update()

In [22]:
start_time_all_trials = time.time()
results_df = []
cumulative_rewards = []

# DQN vairables
BATCH_SIZE = 64
GAMMA = 0.1
EPS_START = 0.8
EPS_END = 0.1
EPS_DECAY = 45
TARGET_UPDATE = 10
n_actions = 4
lr = 0.001

screen_height = 40 
screen_width = 40
policy_net = DQN(screen_height, screen_width, n_actions)
target_net = DQN(screen_height, screen_width, n_actions)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
memory = ReplayMemory(1000)

agent_host = MalmoPython.AgentHost()

try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print('ERROR:',e)
    print(agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print(agent_host.getUsage())
    exit(0)

mission_file = './maze.xml'
with open(mission_file, 'r') as f:
    print("Loading mission from %s" % mission_file)
    mission_xml = f.read()
    my_mission = MalmoPython.MissionSpec(mission_xml, True)
    
print()
agent_host.setObservationsPolicy(MalmoPython.ObservationsPolicy.LATEST_OBSERVATION_ONLY)
agent_host.setVideoPolicy(MalmoPython.VideoPolicy.LATEST_FRAME_ONLY)
my_mission_record = MalmoPython.MissionRecordSpec()
my_mission.requestVideo(800, 500)
my_mission.setViewpoint(0)

my_clients = MalmoPython.ClientPool()
my_clients.add(MalmoPython.ClientInfo('127.0.0.1', 10000)) # add Minecraft machines here as available
agentID = 0
expID = 'Deep Q Network'

max_retries = 3

if agent_host.receivedArgument("test"):
    num_repeats = 1
else:
    num_repeats = 100

for i in range(num_repeats): #loop for training runs 
    
    action_count = 0
    steps_done = 0
    
    for retry in range(max_retries):
        try:
            agent_host.startMission( my_mission, my_clients, my_mission_record, agentID, "%s-%d" % (expID, i)) #replace to start on a different client
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:",e)
                exit(1)
            else:
                time.sleep(2)
        
    # Loop until mission starts:
    print('Repeat %d of %d' % (i+1, num_repeats ))
    print("Waiting for the mission to start ")
    world_state = agent_host.getWorldState()
    while not world_state.has_mission_begun:
        time.sleep(0.1)
        world_state = agent_host.getWorldState()
        for error in world_state.errors:
            print()

    # wait for a valid observation
    world_state = agent_host.peekWorldState()
    while world_state.is_mission_running and all(e.text=='{}' for e in world_state.observations):
        world_state = agent_host.peekWorldState()
    # wait for a frame to arrive after that
    num_frames_seen = world_state.number_of_video_frames_since_last_state
    while world_state.is_mission_running and world_state.number_of_video_frames_since_last_state == num_frames_seen:
        world_state = agent_host.peekWorldState()
    world_state = agent_host.getWorldState()
    for error in world_state.errors:
        print(error)

    state = get_screen(world_state)
    R_total = 0 #Sum of Rewards
    framecount=0
    
    start_time = time.time()
    print("Mission started ")
    print("--------------------------------")
    while world_state.is_mission_running:

        action = select_action(state)
        move(action.item())
        
        # Count action performed
        if(action.item()):
            action_count += 1

        world_state = agent_host.peekWorldState()
        while world_state.number_of_video_frames_since_last_state < 1 and world_state.is_mission_running:
            world_state = agent_host.peekWorldState()                
        world_state = agent_host.getWorldState()

        if world_state.is_mission_running:
            next_state = get_screen(world_state)

        elif not world_state.is_mission_running:
            next_state = None

        reward = sum(r.getValue() for r in world_state.rewards)
        R_total += reward
        reward = torch.tensor([reward], dtype=torch.float)

        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state
        
        if world_state.is_mission_running and len(world_state.observations)>0 and not world_state.observations[-1].text=="{}":
            obs_text = world_state.observations[-1].text
            obs = json.loads(obs_text) # most recent observation
            #print(float(obs[u'XPos']),":", float(obs[u'ZPos']))
            #drawQ(curr_x = int(obs[u'XPos']), curr_y = int(obs[u'ZPos']))
            
    optimize_model()

    # Update the target network, copying all weights and biases in DQN
    if i % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    steps_done += 1  
    
    print("Reward obtained:", R_total)  
    cumulative_rewards += [R_total]
    timeTaken = (time.time() - start_time)
    print("Total time trained:", "%s seconds" % timeTaken )
    print("Number of actions performed:", action_count)
    result = [i, R_total, timeTaken, action_count]
    results_df.append(result)
    print()
    # Mission has ended.

print('Done')
print("Cumulative rewards for all %d runs:" % num_repeats)
print(cumulative_rewards)
print("Average reward:", sum(cumulative_rewards)/num_repeats)
print("Total time used for 30 trials:", "%s seconds" % (time.time() - start_time_all_trials))

ERROR: Caught std::exception: unrecognised option '-f'

Malmo version: 0.36.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Loading mission from ./maze.xml

Repeat 1 of 100
Waiting for the mission to start 
Mission started 
--------------------------------




Reward obtained: -5632.0
Total time trained: 360.20864510536194 seconds
Number of actions performed: 6108

Repeat 2 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 8589.0
Total time trained: 225.29939603805542 seconds
Number of actions performed: 3163

Repeat 3 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: -4121.0
Total time trained: 360.23427414894104 seconds
Number of actions performed: 4865

Repeat 4 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: -4143.0
Total time trained: 360.23378896713257 seconds
Number of actions performed: 4896

Repeat 5 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: -4198.0
Total time trained: 360.2813391685486 seconds
Number of actions performed: 4951

Repeat 6 of 100
Waiting for the mission to start 
Mission started 


Mission started 
--------------------------------
Reward obtained: -6616.0
Total time trained: 360.15013909339905 seconds
Number of actions performed: 6092

Repeat 42 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 4426.0
Total time trained: 279.7961513996124 seconds
Number of actions performed: 4458

Repeat 43 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 7565.0
Total time trained: 195.09945940971375 seconds
Number of actions performed: 3015

Repeat 44 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 7894.0
Total time trained: 180.4364948272705 seconds
Number of actions performed: 2830

Repeat 45 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: -6968.0
Total time trained: 360.6100687980652 seconds
Number of actions performed: 5375

Repeat 46 of 100


Repeat 81 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 8361.0
Total time trained: 155.81680560112 seconds
Number of actions performed: 2910

Repeat 82 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: -7556.0
Total time trained: 360.12913060188293 seconds
Number of actions performed: 6541

Repeat 83 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: -5605.0
Total time trained: 360.111492395401 seconds
Number of actions performed: 6655

Repeat 84 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 9171.0
Total time trained: 90.18959259986877 seconds
Number of actions performed: 1503

Repeat 85 of 100
Waiting for the mission to start 
Mission started 
--------------------------------
Reward obtained: 6204.0
Total time trained: 198.63978624343872 seconds
Numb

In [23]:
import pandas as pd
column = ['Number of attempt', 'Reward', 'Time taken', 'Number of actions']
results_df = pd.DataFrame(results_df, columns = column)
results_df

Unnamed: 0,Number of attempt,Reward,Time taken,Number of actions
0,0,-5632.0,360.208645,6108
1,1,8589.0,225.299396,3163
2,2,-4121.0,360.234274,4865
3,3,-4143.0,360.233789,4896
4,4,-4198.0,360.281339,4951
...,...,...,...,...
95,95,-4917.0,204.301507,3635
96,96,9556.0,93.860729,1777
97,97,-7293.0,360.175674,7253
98,98,-7460.0,360.069291,7463


In [24]:
results_df.to_csv('DQN_E7_M1.csv', index = False)