## Level 1: Easy Straight Line

#### **Important note:**

The Minecraft world is generated from a Microsoft Malmo mission XML template. Make sure to copy the mission template found in this directory (`navigateDense.xml`) into the missions folder of the installed MineRL Python package, e.g. `~/anaconda3/envs/rltorch/lib/python3.7/site-packages/minerl/herobraine/env_specs/missions/`

### Load Agent Environment Libraries

In [11]:
import gym
import minerl

from logging import getLogger
logger = getLogger(__name__)

In [12]:
# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# notebook kernel. Set JAVA_HOME on the kernel process itself instead; child
# processes started from this kernel inherit it.
import os
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/temurin-8.jdk/Contents/Home"

### Load MineRL environment wrappers
* The MineRL Gym Environment returns action and observation spaces as Dictionary spaces instead of Discrete spaces. We need a wrapper to map possible actions to discrete space.

In [13]:
# Add parent directory to sys path to access env_wrappers.py
import sys
sys.path.insert(0,'..')

In [14]:
import chainerrl
from chainerrl.wrappers import ContinuingTimeLimit
from chainerrl.wrappers.atari_wrappers import FrameStack, ScaledFloatFrame

# Environment wrapper borrowed from minerl sample code:
# https://github.com/minerllabs/baselines/tree/master/general/chainerrl
from env_wrappers import (
    SerialDiscreteActionWrapper, CombineActionWrapper, SerialDiscreteCombineActionWrapper,
    ContinuingTimeLimitMonitor,
    MoveAxisWrapper, FrameSkip, ObtainPoVWrapper, PoVWithCompassAngleWrapper, GrayScaleWrapper)


In [15]:
# Arguments for the environment wrappers
class Args:
    """Settings read by ``wrap_env`` when it builds the wrapper stack."""

    def __init__(self):
        # None disables the FrameSkip wrapper; otherwise passed as `skip`.
        self.frame_skip = None
        # True adds GrayScaleWrapper on the 'pov' observation key.
        self.gray_scale = False
        # Names starting with 'MineRLNavigate' get PoVWithCompassAngleWrapper,
        # anything else gets ObtainPoVWrapper.
        self.env = 'MineRLNavigateDense'
        # None (or 0) disables FrameStack; otherwise the number of frames.
        self.frame_stack = None
        # False -> SerialDiscreteActionWrapper (Discrete),
        # True  -> CombineActionWrapper + SerialDiscreteCombineActionWrapper.
        self.disable_action_prior = False
        # wrap_env reads the next two attributes when called with test=True;
        # without them that call raised AttributeError.
        # True wraps the env in ContinuingTimeLimitMonitor during evaluation.
        self.monitor = False
        # Monitor output is written to <outdir>/monitor.
        self.outdir = 'results'


args = Args()

In [16]:
# This entire function is borrowed from MineRL demo files:
# https://github.com/minerllabs/baselines/blob/master/general/chainerrl/baselines/ppo.py#L124
def wrap_env(env, test):
    """Apply the observation and action wrappers to a MineRL gym environment.

    The wrapper configuration is read from the module-level ``args`` object.

    Args:
        env: The environment to wrap (e.g. the result of ``gym.make``).
        test: Truthy for evaluation. Only then, and only if ``args.monitor``
            is set, is the environment wrapped in a recording monitor.

    Returns:
        The wrapped environment.
    """
    # Local import: `os` is only needed for the monitor path and is not
    # imported anywhere else in this notebook.
    import os

    if isinstance(env, gym.wrappers.TimeLimit):
        logger.info('Detected `gym.wrappers.TimeLimit`! Unwrap it and re-wrap our own time limit.')
        env = env.env
        max_episode_steps = env.spec.max_episode_steps
        env = ContinuingTimeLimit(env, max_episode_steps=max_episode_steps)

    # wrap env: observation...
    # NOTE: wrapping order matters!

    # `monitor` / `outdir` may be absent from the args object, so fall back to
    # "no monitor" / 'results' instead of raising AttributeError.
    if test and getattr(args, 'monitor', False):
        monitor_dir = os.path.join(getattr(args, 'outdir', 'results'), 'monitor')
        env = ContinuingTimeLimitMonitor(
            env, monitor_dir,
            # This branch only runs when `test` is truthy.
            mode='evaluation', video_callable=lambda episode_id: True)
    if args.frame_skip is not None:
        env = FrameSkip(env, skip=args.frame_skip)
    if args.gray_scale:
        env = GrayScaleWrapper(env, dict_space_key='pov')
    if args.env.startswith('MineRLNavigate'):
        env = PoVWithCompassAngleWrapper(env)
    else:
        env = ObtainPoVWrapper(env)
    env = MoveAxisWrapper(env, source=-1, destination=0)  # convert hwc -> chw as Chainer requires.
    env = ScaledFloatFrame(env)
    if args.frame_stack is not None and args.frame_stack > 0:
        env = FrameStack(env, args.frame_stack, channel_order='chw')

    # wrap env: action...
    if not args.disable_action_prior:
        env = SerialDiscreteActionWrapper(
            env,
            always_keys=[], reverse_keys=[], exclude_keys=['camera'], exclude_noop=False)
    else:
        env = CombineActionWrapper(env)
        env = SerialDiscreteCombineActionWrapper(env)

    return env

### Load the environment

In [17]:
# Create the unwrapped MineRL environment registered as "MineRLNavigateDense-v0".
core_env = gym.make("MineRLNavigateDense-v0")

In [18]:
# Apply the observation/action wrappers; test=False means the evaluation
# monitor wrapper is never added.
env = wrap_env(core_env, test=False)

In [None]:
# Reset the environment once as a smoke test that the mission XML loads;
# 'done' is printed only if reset() returns without raising.
env.reset()
print('done')



### Define custom policy network

In [None]:
import gym
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3 import DQN
from stable_baselines3.common.policies import register_policy


In [None]:
class ModifiedCNN(BaseFeaturesExtractor):
    """Three-layer CNN feature extractor for channels-first observations.

    Observations are expected as (channels, height, width), which is the
    layout ``wrap_env`` produces via ``MoveAxisWrapper`` (hwc -> chw).

    Args:
        observation_space: Observation space whose ``shape`` is
            (channels, height, width).
        features_dim: Size of the feature vector returned by ``forward``.
    """

    def __init__(self, observation_space, features_dim=512):
        super(ModifiedCNN, self).__init__(observation_space, features_dim)
        # Read the channel count from the space instead of assuming 3, so extra
        # channels added by the wrappers do not break the first convolution.
        n_input_channels = observation_space.shape[0]
        conv_stack = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Measure the flattened size with a dummy pass rather than hard-coding
        # it: each kernel-2/stride-1 conv shrinks a spatial dim by 1, so the
        # previous "- 4" arithmetic did not match the real conv output.
        with torch.no_grad():
            n_flatten = conv_stack(torch.zeros(1, *observation_space.shape)).shape[1]
        # Keep everything in a single `self.cnn` Sequential, same layer order
        # as before.
        self.cnn = nn.Sequential(
            *conv_stack.children(),
            nn.Linear(n_flatten, features_dim),
            nn.ReLU()
        )

    def forward(self, observations):
        """Return features for a (batch, channels, height, width) tensor."""
        # Observations are already channels-first, so no permute is applied.
        return self.cnn(observations)

# `DQN.policy_class` is only assigned on DQN *instances* inside __init__, so it
# cannot be used as a base class; subclass the DQN policy class directly.
from stable_baselines3.dqn.policies import DQNPolicy


class CustomPolicy(DQNPolicy):
    """DQN policy that uses ``ModifiedCNN`` as its features extractor.

    All positional and keyword arguments are forwarded unchanged to the
    parent policy; ``features_extractor_class`` is fixed to ``ModifiedCNN``.
    """

    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, features_extractor_class=ModifiedCNN, **kwargs)


# Register the policy under the string name later passed as `policy=` to DQN.
register_policy("CustomPolicy", CustomPolicy)

### Define model

DQN background (OpenAI Baselines blog post): https://openai.com/blog/openai-baselines-dqn/ — the implementation used below is the `DQN` class from Stable-Baselines3.

In [None]:
# Build the DQN agent on the wrapped environment, selecting the policy by the
# name it was registered under ("CustomPolicy"); TensorBoard logs are written
# to ./test_tensorboard/.
model = DQN(policy="CustomPolicy", env=env, verbose=1, tensorboard_log="./test_tensorboard/")

In [None]:
# Train for 100000 timesteps (log_interval=100), then save the trained model
# under the name "level1pt5_dqn".
model.learn(total_timesteps=100000, log_interval=100)
model.save("level1pt5_dqn")
