In [None]:
# Mount Google Drive inside the Colab VM so later cells can read/write files
# under /content/gdrive.
import os
from google.colab import drive
MOUNTPOINT = '/content/gdrive'
# force_remount=True is passed so re-running this cell remounts the drive.
drive.mount(MOUNTPOINT, force_remount=True)

In [None]:
# Shared-drive project folder. NOTE(review): not referenced by the cells
# visible here, which repeat this path as string literals instead.
path_to_folder = '/content/gdrive/Shareddrives/RL_MiniHack'

In [None]:
# Install the system build tools (compiler toolchain, flex, bison, bzip2 headers).
!sudo apt update
!sudo apt install -y build-essential autoconf libtool pkg-config python3-dev \
    python3-pip python3-numpy git flex bison libbz2-dev

# Add the Kitware apt repository and install cmake from it.
# NOTE(review): the repository line targets Ubuntu "bionic"; confirm it matches the runtime image.
!wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add -
!sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
!sudo apt-get update && apt-get --allow-unauthenticated install -y \
    cmake \
    kitware-archive-keyring

# feel free to use a more elegant solution to make /usr/bin/cmake the default one
# Delete whichever cmake is first on PATH, then print the version of the one that is found next.
!sudo rm $(which cmake)
!$(which cmake) --version

In [None]:
!pip3 install -Uv nle

In [None]:
!pip install minihack

In [None]:
!pip install imageio_ffmpeg

In [None]:
import nle, gym
from gym import spaces
from nle import nethack
import minihack
from minihack import RewardManager

import math
import os
import random
import operator
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import flatten
from torch.nn import BatchNorm2d, Conv2d, CrossEntropyLoss, Dropout, Linear, MaxPool2d, Module, ReLU, Sequential, Softmax

from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display 
import glob
import matplotlib.pyplot as plt

import cv2
import moviepy
import moviepy.video.io.ImageSequenceClip

In [None]:
# Output directory on the shared drive. NOTE(review): not referenced by the
# cells visible here.
savedir = '/content/gdrive/Shareddrives/RL_MiniHack/save_dir1'

In [None]:
# Module-level torch device: CUDA when available, otherwise CPU. Read as a
# global by the replay buffer and agent code further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def plot_results(env_name, scores, color, ylim):
    """Plot per-run episode rewards and their mean, then save and show the figure.

    Parameters
    ----------
    env_name: str
        Environment name; used in the title and in the output PDF file name.
    scores: sequence of sequences of float
        One sequence of episode rewards per training run. Runs may have
        different lengths.
    color: matplotlib color
        Colour used for every curve.
    ylim: (low, high)
        Range used to place the y ticks (step of 1.0).

    Side effects
    ------------
    Writes "DQN with PER-{env_name}.pdf" to the current working directory.
    """
    plt.figure(figsize=(12,6))

    # atleast_1d keeps a flat list of scalars plottable as single points.
    runs = [np.atleast_1d(np.asarray(run, dtype=float)) for run in scores]

    # Plot individual iterations at their full length.
    for run in runs:
        plt.plot(run, alpha=0.1, color=color)

    # Training is bounded by steps, not episodes, so runs usually contain a
    # different number of episodes. Average only over the episodes that every
    # run reached; stacking ragged runs directly would fail.
    common_length = min((len(run) for run in runs), default=0)
    if common_length > 0:
        mean = np.mean([run[:common_length] for run in runs], axis=0)
        plt.plot(mean, color=color, label="Mean Reward")

    plt.title(f"DQN with PER - {env_name}")
    plt.xlabel("Episode Number")
    plt.ylabel("Reward")
    plt.yticks(np.arange(ylim[0], ylim[1], 1.00))
    plt.legend(loc=4)
    plt.savefig(f"DQN with PER-{env_name}.pdf")
    plt.show()

In [None]:
def format_state(state):
    """Turn an observation dict into the tensor the Q-network consumes.

    Parameters
    ----------
    state: dict
        Observation containing a "glyphs" array with 21 * 79 elements.

    Returns
    -------
    torch.Tensor
        Tensor of shape (1, 21, 79) holding the glyph ids divided by the
        largest glyph id in this observation.
    """
    glyphs = state["glyphs"]
    # Normalize by the per-observation maximum. An all-zero map would divide
    # by zero and hand NaNs to the network, so fall back to a divisor of 1.
    peak = glyphs.max()
    glyphs = glyphs / (peak if peak != 0 else 1)
    glyphs = glyphs.reshape((1,1,21,79))
    # Drop the leading batch axis; the agent adds it back before the forward pass.
    return torch.from_numpy(glyphs).squeeze(0)

In [None]:
def moving_average(arr, win_size):
    """Return the trailing-window mean of `arr` using a cumulative sum.

    Entries before the first full window are the running total divided by
    `win_size`, not a true mean.
    """
    running = np.cumsum(arr, dtype=float)
    windowed = running.copy()
    # From index win_size onwards, subtract the total that fell out of the window.
    windowed[win_size:] = running[win_size:] - running[:-win_size]
    return windowed / win_size

### DQN model with prioritized experience replay

In [None]:
class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):
        """Build a Segment Tree data structure.
        https://en.wikipedia.org/wiki/Segment_tree
        Can be used as regular array, but with two
        important differences:
            a) setting item's value is slightly slower.
               It is O(lg capacity) instead of O(1).
            b) user has access to an efficient ( O(log segment size) )
               `reduce` operation which reduces `operation` over
               a contiguous subsequence of items in the array.
        Parameters
        ----------
        capacity: int
            Total size of the array - must be a power of two.
        operation: lambda obj, obj -> obj
            an associative operation for combining elements (eg. sum, max)
        neutral_element: obj
            neutral element for the operation above. eg. float('-inf')
            for max and 0 for sum.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        # Heap layout: node 1 is the root, node i has children 2i and 2i + 1,
        # and the leaves occupy indexes capacity .. 2 * capacity - 1.
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation

    def _reduce_helper(self, start, end, node, node_start, node_end):
        """Reduce the inclusive range [start, end], which must lie inside the
        inclusive range [node_start, node_end] covered by `node`."""
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            # Query lies entirely in the left child.
            return self._reduce_helper(start, end, 2 * node, node_start, mid)
        else:
            if mid + 1 <= start:
                # Query lies entirely in the right child.
                return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
            else:
                # Query straddles the midpoint: combine both halves.
                return self._operation(
                    self._reduce_helper(start, mid, 2 * node, node_start, mid),
                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
                )

    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
        to a contiguous subsequence of the array.
            self.operation(arr[start], operation(arr[start+1], operation(... arr[end - 1])))
        Parameters
        ----------
        start: int
            beginning of the subsequence (inclusive)
        end: int
            end of the subsequence (exclusive). `None` means the whole
            capacity; a negative value counts from the end of the array.
        Returns
        -------
        reduced: obj
            result of reducing self.operation over the specified range of array elements.
        Raises
        ------
        AssertionError
            if the range is empty or falls outside the array.
        """
        if end is None:
            end = self._capacity
        if end < 0:
            end += self._capacity
        end -= 1
        # An empty or out-of-range query would otherwise recurse until the
        # interpreter's recursion limit is hit.
        assert 0 <= start <= end < self._capacity, "reduce range must be non-empty and inside the array."
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # Without this check a negative idx lands on an internal node and
        # silently corrupts the tree.
        assert 0 <= idx < self._capacity
        # index of the leaf
        idx += self._capacity
        self._value[idx] = val
        # Recompute every ancestor on the path back to the root.
        idx //= 2
        while idx >= 1:
            self._value[idx] = self._operation(
                self._value[2 * idx],
                self._value[2 * idx + 1]
            )
            idx //= 2

    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]


class SumSegmentTree(SegmentTree):
    """Segment tree whose reduction is addition (neutral element 0.0)."""

    def __init__(self, capacity):
        super(SumSegmentTree, self).__init__(
            capacity=capacity,
            operation=operator.add,
            neutral_element=0.0
        )

    def sum(self, start=0, end=None):
        """Returns arr[start] + ... + arr[end - 1] (`end` is exclusive)."""
        return super(SumSegmentTree, self).reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum
        if array values are probabilities, this function
        allows to sample indexes according to the discrete
        probability efficiently.
        Parameters
        ----------
        prefixsum: float
            upperbound on the sum of array prefix
        Returns
        -------
        idx: int
            highest index satisfying the prefixsum constraint
        Raises
        ------
        ValueError
            if `prefixsum` is negative or larger than the total (plus a
            1e-5 tolerance).
        """
        # Raise instead of print + exit(): exit() takes the whole notebook
        # kernel down, and an assert disappears under `python -O`.
        if not 0 <= prefixsum <= self.sum() + 1e-5:
            raise ValueError("Prefix sum error: {}".format(prefixsum))
        idx = 1
        while idx < self._capacity:  # while non-leaf
            if self._value[2 * idx] > prefixsum:
                # Target mass lies inside the left subtree.
                idx = 2 * idx
            else:
                # Skip the left subtree and look for the remainder on the right.
                prefixsum -= self._value[2 * idx]
                idx = 2 * idx + 1
        return idx - self._capacity


class MinSegmentTree(SegmentTree):
    """Segment tree whose reduction is `min` (neutral element +inf)."""

    def __init__(self, capacity):
        SegmentTree.__init__(self, capacity, min, float('inf'))

    def min(self, start=0, end=None):
        """Return the smallest value in the half-open range [start, end)."""
        return self.reduce(start, end)

In [None]:
#Modified for Prioritized Experience Replay
class ReplayBuffer:
    """
    Prioritized experience replay buffer.

    Transitions live in a ring buffer. Their priorities (raised to `alpha`)
    are mirrored in a sum tree, used for proportional sampling, and a min
    tree, used to normalise the importance-sampling weights.
    """

    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        """
        Initialise a buffer of a given size for storing transitions
        :param size: the maximum number of transitions that can be stored
        :param alpha: exponent applied to priorities (0 gives uniform sampling)
        :param beta_start: importance-sampling exponent used for the first sample
        :param beta_frames: number of `sample` calls over which beta is annealed to 1.0
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.beta_start = beta_start
        self.beta_frames = beta_frames
        # Counts calls to `sample`; drives the beta annealing schedule.
        self.frame=1

        # The segment trees need a power-of-two capacity.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def __len__(self):
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        """
        Add a transition to the buffer. Old transitions will be overwritten if the buffer is full.
        :param state: the agent's initial state
        :param action: the action taken by the agent
        :param reward: the reward the agent received
        :param next_state: the subsequent state
        :param done: whether the episode terminated
        """
        data = (state, action, reward, next_state, done)
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

        # New transitions get the largest priority seen so far, so each one
        # is likely to be sampled at least once.
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def beta_by_frame(self, frame_idx):
        """Linearly anneal beta from `beta_start` to 1.0 over `beta_frames` frames."""
        return min(1.0, self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames)

    def _sample_proportional(self, batch_size):
        """Draw `batch_size` storage indexes with probability proportional to priority."""
        stored = len(self._storage)
        if stored == 0:
            raise ValueError("Cannot sample from an empty replay buffer")
        # `sum` takes an exclusive end, so pass `stored` itself. Passing
        # `stored - 1` would leave out the newest transition and recurse
        # forever when only one transition is stored.
        total = self._it_sum.sum(0, stored)
        res = []
        for _ in range(batch_size):
            mass = random.random() * total
            idx = self._it_sum.find_prefixsum_idx(mass)
            # Guard against float rounding pushing the walk past the last filled leaf.
            res.append(min(idx, stored - 1))
        return res

    def _encode_sample(self, indices):
        """Gather the transitions at `indices` into five stacked numpy arrays."""
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in indices:
            data = self._storage[i]
            state, action, reward, next_state, done = data
            # np.asarray avoids a copy when it can; np.array(..., copy=False)
            # raises under NumPy 2 whenever a copy is unavoidable.
            states.append(np.asarray(state))
            actions.append(action)
            rewards.append(reward)
            next_states.append(np.asarray(next_state))
            dones.append(done)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

    def sample(self, batch_size):
        """
        Sample a batch of transitions with probability proportional to priority.
        :param batch_size: the number of transitions to sample
        :return: (encoded_sample, idxes, weights) where encoded_sample is the
            (states, actions, rewards, next_states, dones) tuple of arrays,
            idxes are the sampled storage indexes and weights is a float
            tensor of importance-sampling weights scaled by the largest
            possible weight
        """
        idxes = self._sample_proportional(batch_size)

        weights = []

        #find smallest sampling prob: p_min = smallest priority^alpha / sum of priorities^alpha
        p_min = self._it_min.min() / self._it_sum.sum()

        beta = self.beta_by_frame(self.frame)
        self.frame+=1

        #max_weight given to smallest prob
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = torch.tensor(weights, device=device, dtype=torch.float)
        encoded_sample = self._encode_sample(idxes)
        return encoded_sample, idxes, weights

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.
        sets priority of transition at index idxes[i] in buffer
        to priorities[i].
        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert 0 <= idx < len(self._storage)
            # The 1e-5 offset keeps a zero TD error from making a transition unsampleable.
            self._it_sum[idx] = (priority+1e-5) ** self._alpha
            self._it_min[idx] = (priority+1e-5) ** self._alpha

            self._max_priority = max(self._max_priority, (priority+1e-5))

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers then two linear layers.

    The output is one unbounded Q-value per action. There is no softmax on
    the head: Q-values regress onto returns, which are not probabilities, and
    a softmax would confine them to (0, 1). Greedy action selection is the
    same either way, because softmax preserves the argmax.
    """

    # Flattened conv output for a 21 x 79 glyph map; used when no shape is given.
    _DEFAULT_FLAT_FEATURES = 1728

    def __init__(self, input_shape, num_actions):
        """
        :param input_shape: spatial shape of one observation; the last two
            entries are read as (height, width). Falls back to the 21 x 79
            layout when it is None or has fewer than two entries.
        :param num_actions: number of discrete actions (size of the output)
        """
        super(DQN, self).__init__()

        self.tanh = nn.Tanh()

        self.conv1 = Conv2d(in_channels=1, out_channels=16, kernel_size=4, stride=1)
        self.conv2 = Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2)
        self.conv3 = Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2)

        self.fc1 = Linear(in_features=self._flat_features(input_shape), out_features=100)
        self.fc2 = Linear(in_features=100, out_features=num_actions)

    def _flat_features(self, input_shape):
        """Return the number of values left after the conv stack is flattened."""
        if input_shape is None or len(input_shape) < 2:
            return self._DEFAULT_FLAT_FEATURES
        height, width = int(input_shape[-2]), int(input_shape[-1])
        # Output size of an unpadded, undilated convolution, applied per layer.
        for conv in (self.conv1, self.conv2, self.conv3):
            height = (height - conv.kernel_size[0]) // conv.stride[0] + 1
            width = (width - conv.kernel_size[1]) // conv.stride[1] + 1
        return self.conv3.out_channels * height * width

    def forward(self, x):
        """
        Returns the values of a forward pass of the network
        :param x: The input to feed into the network, shaped
            (batch, 1, height, width)
        :return: tensor of shape (batch, num_actions) with one Q-value per action
        """
        x = self.tanh(self.conv1(x))
        x = self.tanh(self.conv2(x))
        x = self.tanh(self.conv3(x))

        # Flatten everything except the batch axis for the linear layers.
        x = x.reshape(x.shape[0], -1)
        x = self.tanh(self.fc1(x))
        x = self.fc2(x)

        return x


In [None]:
class DQNAgent():
    """DQN agent with an online network, a target network and a prioritized replay buffer.

    Keyword arguments read from **kwargs: `use_double_dqn`, `gamma`
    (default 0.99), `lr`, `replay_buffer_size` and `batch_size`. The last
    three default to None and must be supplied for training to work.
    """

    def __init__(self, observation_space, action_space, **kwargs):

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_space = action_space
        self.use_double_dqn = kwargs.get("use_double_dqn", None)
        self.gamma = kwargs.get("gamma", 0.99)
        self.lr = kwargs.get("lr", None)
        self.experience_replay_size = kwargs.get("replay_buffer_size", None)
        self.batch_size = kwargs.get("batch_size", None)
        self.num_feats = observation_space['glyphs'].shape
        self.num_actions = action_space.n

        self.declare_networks()
        # Start both networks from the same weights; otherwise the first
        # targets come from an unrelated random network.
        self.update_target_model()

        #self.model.load_state_dict(torch.load("/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/model maze-ez 8nov.pth"))
        #self.target_model.load_state_dict(torch.load("/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/target_model maze-ez 8nov.pth"))

        # Move to the correct device before the optimizer captures the parameters.
        self.model = self.model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        self.update_count = 0

        self.declare_memory()

    def declare_networks(self):
        """Create the online and target Q-networks."""
        self.model = DQN(self.num_feats, self.num_actions)
        self.target_model = DQN(self.num_feats, self.num_actions)

    def declare_memory(self):
        """Create the prioritized replay buffer."""
        self.memory = ReplayBuffer(self.experience_replay_size)

    def append_to_replay(self, s, a, r, s_, done=False):
        """Store one transition. `done` defaults to False for callers using the four-argument form."""
        self.memory.add(s, a, r, s_, float(done))

    def prep_minibatch(self):
        """Sample a prioritized batch and convert it to tensors on the agent's device.

        :return: (states, actions, rewards, non_final_next_states,
            non_final_mask, empty_next_state_values, indices, weights).
            `non_final_mask` is a bool tensor marking transitions that did
            not end the episode. `non_final_next_states` holds only their
            next states, or None when every sampled transition is terminal.
        """
        encoded_sample, indices, weights = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = encoded_sample

        states = torch.as_tensor(np.asarray(states), dtype=torch.float, device=self.device)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.long, device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float, device=self.device)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float, device=self.device)
        dones = torch.as_tensor(np.asarray(dones), dtype=torch.float, device=self.device)
        weights = torch.as_tensor(weights, dtype=torch.float, device=self.device)

        # Terminal transitions are identified by the stored `done` flag. The
        # next state is never None here, so testing for None cannot find them.
        non_final_mask = dones == 0
        empty_next_state_values = not bool(non_final_mask.any())
        non_final_next_states = None if empty_next_state_values else next_states[non_final_mask]

        return states, actions, rewards, non_final_next_states, non_final_mask, empty_next_state_values, indices, weights

    def compute_loss(self, batch_vars):
        """Compute the importance-weighted TD loss and refresh the sampled priorities.

        :param batch_vars: the tuple returned by `prep_minibatch`
        :return: scalar loss tensor
        """
        batch_state, batch_action, batch_reward, non_final_next_states, non_final_mask, empty_next_state_values, indices, weights = batch_vars

        #estimate
        # squeeze(1) rather than squeeze() so a batch of one stays one-dimensional.
        current_q_values = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)

        #target
        with torch.no_grad():
            # Terminal transitions keep a bootstrap value of zero.
            max_next_q_values = torch.zeros(batch_state.shape[0], device=self.device, dtype=torch.float)
            if not empty_next_state_values:
                max_next_action = self.get_max_next_state_action(non_final_next_states)
                max_next_q_values[non_final_mask] = self.target_model(non_final_next_states).gather(1, max_next_action).squeeze(1)
            expected_q_values = batch_reward + self.gamma * max_next_q_values

        diff = (expected_q_values - current_q_values)
        # |TD error| becomes the new priority of each sampled transition.
        self.memory.update_priorities(indices, diff.detach().abs().view(-1).cpu().tolist())
        loss = self.MSE(diff) * weights
        loss = loss.mean()

        return loss

    def update(self, s, a, r, s_):
        """Run one optimisation step on a sampled minibatch.

        The arguments are unused; they are kept so existing callers keep working.
        :return: the loss tensor of this step
        """
        batch_vars = self.prep_minibatch()
        loss = self.compute_loss(batch_vars)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # Parameters that took no part in the loss have no gradient.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        return loss

    def act(self, observation):
        """Return the greedy action index for a single (unbatched) observation tensor."""
        state = observation.to(device=self.device, dtype=torch.float).unsqueeze(0)
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            result = self.model(state)
        return torch.argmax(result).item()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def get_max_next_state_action(self, next_states):
        """Return the greedy next action per row as a (batch, 1) index tensor.

        With `use_double_dqn` the online network picks the action and the
        target network evaluates it; otherwise the target network does both.
        """
        selector = self.model if self.use_double_dqn else self.target_model
        # max(dim=1) returns (values, indices); gather needs only the indices.
        return selector(next_states).max(dim=1)[1].view(-1, 1)

    def MSE(self, x):
        """Element-wise half squared error."""
        return 0.5 * x.pow(2)

In [None]:
def dqn(env, seed, learning_rate, max_episodes, max_episode_length, gamma, verbose=True):
    """
    Method to train DQN model.

    Input:
    env: The environment to be used during training
    seed: The random seed for any random operations performed
    learning_rate: Accepted for interface compatibility
    max_episodes: Accepted for interface compatibility
    max_episode_length: Accepted for interface compatibility
    gamma: The discount factor used when calculating the discounted rewards of an episode
    verbose: Print the per-episode reward and the periodic summary

    NOTE(review): learning_rate, max_episodes and max_episode_length are not
    used. The optimizer runs with hyper_params['learning-rate'] and training
    always runs for hyper_params['num-steps'] environment steps. Confirm the
    intended values before wiring them in.

    Returns:
    agent: The trained DQNAgent
    scores: The cumulative reward achieved by the agent for each episode during training
    """

    hyper_params = {
        'replay-buffer-size': int(1e6),
        'learning-rate': 0.001,
        'gamma': gamma,  # discount factor
        'num-steps': int(7e5),  # Steps to run for, max episodes should be hit before this
        'batch-size': 32,
        'learning-starts': 1000,  # set learning to start after 1000 steps of exploration
        'learning-freq': 1,  # Optimize after each step
        'use-double-dqn': False,
        'target-update-freq': 1000, # number of iterations between every target network update
        'eps-start': 1.0,  # e-greedy start threshold
        'eps-end': 0.1,  # e-greedy end threshold
        'eps-fraction': 0.6,  # Percentage of the time that epsilon is annealed
        'print-freq': 10,

    }

    # Seed every RNG the loop draws from. `random` drives the epsilon test and
    # the replay sampling, numpy the random actions, torch the network
    # initialisation. int() turns numpy integer seeds into plain Python ints.
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    # Create DQN agent

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        train=True,
        use_double_dqn=hyper_params['use-double-dqn'],
        lr=hyper_params['learning-rate'],
        batch_size=hyper_params['batch-size'],
        gamma=hyper_params['gamma'],
        replay_buffer_size=hyper_params['replay-buffer-size']
    )

    # define variables to track agent metrics
    total_reward = 0
    scores = []

    # Reset gym env before training
    state = format_state(env.reset())
    eps_timesteps = hyper_params['eps-fraction'] * float(hyper_params['num-steps'])

    # Greedy (model-chosen) actions of the current episode, for logging only.
    actions_taken = []
    #video = []

    # Train for set number of steps
    for t in range(hyper_params['num-steps']):

        # determine exploration probability
        fract = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fract * (hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()

        # Decide to explore and choose random action or use model to act
        if sample < eps_threshold:
            action = np.random.choice(agent.action_space.n)
        else:
            action = agent.act(state)
            actions_taken.append(action)

        # Take step in environment
        next_state, reward, done, _ = env.step(action)
        #video.append(next_state['pixel'])
        next_state = format_state(next_state)
        agent.memory.add(state, action, reward, next_state, float(done))
        total_reward += reward
        state = next_state

        if done:
            scores.append(total_reward)
            if verbose:
                print(f"episode reward: {total_reward} ", f"{len(actions_taken)} " 'actions taken: {}'.format(actions_taken))
            # Re-seed only the environment before each reset. Re-seeding numpy
            # here as well would replay the same random action stream every
            # episode and collapse exploration.
            env.seed(seed)
            state = format_state(env.reset())
            '''if t % 10000 == 0:
              clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(video, fps=4)
              clip.write_videofile("/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/videos/jared/{}.mp4".format("qeust-h "+str(num_episodes)))
            video.clear()'''
            total_reward = 0
            actions_taken.clear()


        if t > hyper_params['learning-starts'] and t % hyper_params['learning-freq'] == 0:
            agent.update(state, action, reward, next_state)

        if t > hyper_params['learning-starts'] and t % hyper_params['target-update-freq'] == 0:
            agent.update_target_model()

        num_episodes = len(scores)
        if verbose and done and hyper_params['print-freq'] is not None and len(scores) % hyper_params['print-freq'] == 0:
            # Average over the latest 100 episodes, the newest one included.
            mean_100ep_reward = round(np.mean(scores[-100:]), 1)
            print('********************************************************')
            print('steps: {}'.format(t))
            print('episodes: {}'.format(num_episodes))
            print('mean 100 episode reward: {}'.format(mean_100ep_reward))
            print('% time spent exploring: {}'.format(eps_threshold))
            print('********************************************************')



        '''if num_episodes >= max_episodes:
            return agent, scores'''

    return agent, scores

In [None]:
def run_dqn(env, number_episodes, max_episode_length, iterations, model_file_name):
    """Train a DQN agent on `env` once per random seed and checkpoint it.

    Parameters
    ----------
    env: the environment to train on
    number_episodes: forwarded to `dqn` as `max_episodes`
    max_episode_length: forwarded to `dqn`
    iterations: number of independent training runs (one random seed each)
    model_file_name: suffix used in the checkpoint file names

    Returns
    -------
    (agent, scores_arr): the agent from the last run (None when `iterations`
    is 0) and one list of episode rewards per run.

    Every run writes to the same two checkpoint files, so only the last
    run's weights remain on disk.
    """
    seeds = np.random.randint(1000, size=iterations)
    scores_arr = []
    # Defined up front so the return below works when there are no iterations.
    agent = None

    for seed in seeds:
        print("\nseed: ", seed)

        # Train the DQN Model
        agent, scores = dqn(env=env,
                            seed=seed,
                            learning_rate=0.01,
                            max_episodes=number_episodes,
                            max_episode_length=max_episode_length,
                            gamma=0.99 ,
                            verbose=True)

        # Checkpoint both networks, then store rewards for this iteration
        torch.save(agent.model.state_dict(), "/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/model {}.pth".format(model_file_name))
        torch.save(agent.target_model.state_dict(), "/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/target_model {}.pth".format(model_file_name))
        scores_arr.append(scores)

    return agent, scores_arr

In [None]:
def save_video_of_model(agent, env, model_file_name):
    """Play one greedy episode and write its pixel frames to an mp4 file.

    `env` must expose a 'pixel' observation. Each chosen action is printed,
    and the output file name is `model_file_name` followed by a
    "date hour_minute" timestamp.
    """
    observation = env.reset()
    frames = []
    finished = False
    now = datetime.now()
    curr_datetime = "{} {}_{}".format(now.date(), now.hour, now.minute)

    # Step until the environment reports the episode is over.
    while not finished:
        frames.append(observation['pixel'])
        chosen_action = agent.act(format_state(observation))
        print(chosen_action)
        observation, _, finished, _ = env.step(chosen_action)

    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(frames, fps=4)
    clip.write_videofile("/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/videos{}.mp4".format(model_file_name+curr_datetime))

In [None]:
def maze_explore_reward(env, prev_obs, action, next_obs):
    """Return 0.1 when the step reduced the number of cells holding glyph id 2359, else 0.

    Only the first element of each observation is inspected.
    """
    tracked_glyph = 2359
    remaining_before = (prev_obs[0] == tracked_glyph).sum()
    remaining_after = (next_obs[0] == tracked_glyph).sum()
    if remaining_after < remaining_before:
        return 0.1
    return 0

In [None]:
# Custom reward shaping assembled with MiniHack's RewardManager.
reward_gen = RewardManager()

# Random reward included to prevent reward glitch
reward_gen.add_eat_event("apple", reward=0.5)

# Penalise stepping into the long corridors at the top and bottom
for corridor_cell in ((3,27), (3,28), (3,29), (19,27), (19,28), (19,29)):
    reward_gen.add_coordinate_event(corridor_cell, reward=-5, terminal_required=False)

# Small bonus from the custom exploration reward function
reward_gen.add_custom_reward_fn(maze_explore_reward)

# Large reward for the first door at the end of the maze
reward_gen.add_coordinate_event((11,27), reward=100, terminal_required=False)


In [None]:
# Reduced action set: the four cardinal moves plus EAT and PICKUP.
# NOTE(review): not referenced by the cells visible here.
NAVIGATE_ACTIONS = (nethack.CompassDirection.N,
    nethack.CompassDirection.S,
    nethack.CompassDirection.W,
    nethack.CompassDirection.E,
    nethack.Command.EAT,
    nethack.Command.PICKUP)
#nethack.MiscDirection.DOWN)

In [None]:
# Action set passed to gym.make(..., actions=QUEST_ACTIONS) by the run cells
# below: the eight compass moves followed by nine commands. The tuple order
# fixes which action each index of the network's output refers to.
QUEST_ACTIONS = (
    nethack.CompassDirection.N,
    nethack.CompassDirection.E,
    nethack.CompassDirection.S,
    nethack.CompassDirection.W,
    nethack.CompassDirection.NW,
    nethack.CompassDirection.NE,
    nethack.CompassDirection.SW,
    nethack.CompassDirection.SE,
    nethack.Command.PICKUP,
    nethack.Command.EAT,
    nethack.Command.APPLY,
    nethack.Command.ZAP, 
    nethack.Command.PUTON,
    nethack.Command.QUAFF,
    nethack.Command.WIELD,
    nethack.Command.RUSH,
    nethack.Command.OPEN)

### RUNS

In [None]:
#,"pixel","message"

In [None]:
# Disabled run (kept as a string literal, so nothing executes): Quest-Easy training.
'''env_name = "MiniHack-Quest-Easy-v0"
env = gym.make(env_name, observation_keys=["glyphs"], actions=QUEST_ACTIONS)
agent_easy, quest_easy_scores = run_dqn(env, number_episodes=1000, max_episode_length=1000, iterations=1, model_file_name='quest-ez 7nov')
'''

In [None]:
# Train on Quest-Hard with glyph observations and the QUEST_ACTIONS action set.
# iterations=2 runs two independent seeds; quest_hard_scores holds one reward
# list per seed and agent_hard is the agent from the last seed.
env_name = "MiniHack-Quest-Hard-v0"
env = gym.make(env_name, observation_keys=["glyphs"], actions=QUEST_ACTIONS)
# Sanity check: glyph grid shape and number of discrete actions.
print(env.observation_space['glyphs'].shape, env.action_space.n)
agent_hard, quest_hard_scores = run_dqn(env, number_episodes=1000, max_episode_length=1000, iterations=2, model_file_name='quest-h ')

In [None]:
# Disabled run (kept as a string literal, so nothing executes): ExploreMaze-Easy training.
'''env_name = "MiniHack-ExploreMaze-Easy-v0"
env = gym.make(env_name, observation_keys=["glyphs"], actions=QUEST_ACTIONS)
print(env.observation_space['glyphs'].shape, env.action_space.n)
agentMaze, maze_scores = run_dqn(env, number_episodes=1000, max_episode_length=1000, iterations=1, model_file_name="maze-ez 8nov")
'''

In [None]:
# Disabled manual save (string literal, nothing executes).
# NOTE(review): it refers to `agent.online_network` / `agent.target_network`,
# while the DQNAgent defined above names its networks `model` / `target_model`.
'''np.savetxt("/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/quest_hard_j.txt", quest_hard_scores)
torch.save(agent.online_network.state_dict(), "/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/online_model_j.pth")
torch.save(agent.target_network.state_dict(), "/content/gdrive/Shareddrives/RL_MiniHack/DQN_Agents/target_model_j.pth")'''

In [None]:
# Plot the Quest-Hard training curves; y ticks span -12 to 8 in steps of 1.
plot_results(env_name=env_name,scores=quest_hard_scores, ylim=(-12,8), color="red" )

In [None]:
#plot_results(env_name=env_name,scores=maze_scores,ylim=(-12,8), color="red" )

In [None]:
# Rebuild the environment with pixel observations for recording. It must use
# the same action set as training (QUEST_ACTIONS); otherwise the indexes the
# agent outputs would be mapped to different actions.
env = gym.make(env_name, observation_keys=["glyphs", "pixel"], actions=QUEST_ACTIONS)
save_video_of_model(agent_hard, env, "quest-h ")