<a href="https://colab.research.google.com/github/fezilemahlangu/Reinforcement-Learning-Project/blob/master/A2C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt update
!apt install -y cmake
!apt-get install -y build-essential autoconf libtool pkg-config
!apt-get install flex bison libbz2-dev
!pip install nle
!pip install minihack
# !python -m minihack.scripts.env_list
!pip install gym[atari,accept-rom-license]

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease [15.9 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:13 http://securit



---



---


# ACTOR CRITIC

In [1]:
import gym
import minihack
from minihack import reward_manager
import numpy as np
from minihack import RewardManager
from gym import spaces
from nle import nethack
from numpy.lib.function_base import select
import torch  
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
import warnings
warnings.filterwarnings("ignore")



In [None]:
"""
Modified the A2C method from https://github.com/raillab/a2c
This A2C has LSTM
"""


class Flatten(nn.Module):
    """
    Flatten a multi dimensional output from the Conv2D to a single dimension
    """

    def forward(self, x):
        return x.view(x.shape[0], -1)

class ActorCritic(nn.Module):
    def __init__(self, glyph_shape, num_actions, enable_lstm, crop_dims=10):
        super(ActorCritic, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        self.flatten = Flatten()
        if enable_lstm:
            self.lstm = nn.LSTMCell(256, 256)
        self.linear = nn.Linear(1152, 256)
        self.actor = nn.Linear(256, num_actions)
        self.critic = nn.Linear(256, 1)

        for m in self.features:
            if isinstance(m, nn.Conv2d):
                nn.init.orthogonal_(m.weight, nn.init.calculate_gain('relu'))
                nn.init.constant_(m.bias, 0.0)

        nn.init.orthogonal_(self.linear.weight)
        nn.init.constant_(self.linear.bias, 0.0)

        nn.init.orthogonal_(self.critic.weight)
        nn.init.constant_(self.critic.bias, 0.0)

        nn.init.orthogonal_(self.actor.weight, 0.01)
        nn.init.constant_(self.actor.bias, 0.0)

    def forward(self, x_glyphs, hx, cx, enable_lstm):
        x_glyphs = x_glyphs.unsqueeze(0)
        x_glyphs = self.features(x_glyphs)
        x_glyphs = self.flatten(x_glyphs)
        x_glyphs = self.linear(x_glyphs)
        x_glyphs = F.relu(x_glyphs)
        if enable_lstm:
            hx, cx = self.lstm(x_glyphs, (hx, cx))
            x = hx
        else:
            x = x_glyphs
        return Categorical(logits=self.actor(x)), self.critic(x), hx, cx



---



---


# REINFORCE

In [None]:
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
import gym
import minihack
import torch
import torch.nn as nn
import torch.nn.functional as Func
import random
from torch.autograd import Variable
from collections import deque
from nle import nethack
from minihack import RewardManager

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SimplePolicy(nn.Module):
    def __init__(self, s_size, h_size, a_size, learning_rate=0.001):
        super(SimplePolicy, self).__init__()
        self.linear1 = nn.Linear(s_size, h_size)
        self.linear2 = nn.Linear(h_size, a_size)
        self.loss_fn=nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        #print("before: ",x.shape)
        x = torch.flatten(x)
        x = torch.reshape(x, (1,x.shape[0]))
        #print("after: ",x.shape)
        #normalize tensors
        x = torch.nn.functional.normalize(x, p=2.0, dim=1, eps=1e-12, out=None)
        func = Func.relu(self.linear1(x))
        #print("func: ",func)
        func = Func.softmax(self.linear2(func), dim=1)
        return func


class StateValueNetwork(nn.Module):
    # Takes in state
    def __init__(self, s_size=4, h_size=16, learning_rate=0.001):
        super(StateValueNetwork, self).__init__()
        self.linear1 = nn.Linear(s_size, h_size)
        self.linear2 = nn.Linear(h_size, 1)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        #input layer
        x = torch.flatten(x)
        x = torch.reshape(x, (1,x.shape[0]))
        f = self.linear1(x)
        #activiation relu
        f = Func.relu(f)
        #get state value
        state_value = self.linear2(f)
        return state_value

def compute_returns_naive_baseline(rewards, gamma):
    returns = []
    #calculates the return values
    for t in range(len(rewards)):
        Gt = 0
        for r in rewards[t:]:
            Gt = Gt * gamma + r
        returns.append(Gt)
    returns = torch.tensor(returns).to(device)
    returns = (returns - returns.mean()) / (
        returns.std())
    return returns

def learning(states ,scores, state_model, policy_model, lProbs, env, gamma):
    returns = compute_returns_naive_baseline(scores, gamma)
    #env.render()
    #this section calculates the state values
    #calculate MSE loss
    stateValues = []
    for i in states:
        stateValues.append(state_model.forward(Variable(i)))
    stateValues = torch.stack(stateValues).squeeze()
    valLoss = Func.mse_loss(stateValues, returns)
    #backpropagate
    state_model.optimizer.zero_grad()
    valLoss.backward()
    state_model.optimizer.step()
    deltas = []
    for gt, val in zip(returns, stateValues):
        deltas.append(gt-val)
    deltas = torch.tensor(deltas).to(device)
    #this section is where we calculate the policy gradient
    #this section is from https://gist.github.com/cyoon1729/3920da556f992909ace8516e2f321a7c#file-reinforce_update-py
    policyGrad = []
    #training policy
    for logProb, Dt in zip(lProbs, deltas):
        policyGrad.append(-logProb * Dt)
    policy_model.optimizer.zero_grad()
    policyGrad = torch.stack(policyGrad).sum()
    #backpropagate
    policyGrad.backward()
    policy_model.optimizer.step()

def reinforce_naive_baseline(env, policy_model, state_model, seed,
                             number_episodes,
                             max_episode_length,
                             gamma, verbose=True):
    global hyper_params
    # set random seeds (for reproducibility)
    torch.manual_seed(hyper_params['seed'])
    torch.cuda.manual_seed_all(hyper_params['seed'])
    np.random.seed(hyper_params['seed'])
    random.seed(hyper_params['seed'])
    env.seed(hyper_params['seed'])
    policy = []
    numsteps = []
    avgNumsteps = []
    allRewards = []
    for episode in range(number_episodes):
        state = env.reset()['glyphs_crop']
        lProbs = []
        scores = []
        states = []
        for steps in range(max_episode_length):
            #env.render()
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            #this section is from https://gist.github.com/cyoon1729/bc41d466b868ea10e794a7c04321ff3b#file-reinforce_model-py
            probs = policy_model.forward(Variable(state))
            action = np.random.choice(env.action_space.n)
            lprob = torch.log(probs.squeeze(0)[action])
            nextState, score, done, _ = env.step(action)
            lProbs.append(lprob)
            scores.append(score)
            states.append(state)
            if steps%100==0:
                p = 0
                #learning(states, scores, state_model, policy_model, lProbs, env, gamma)
            if done:
                learning(states, scores, state_model, policy_model, lProbs, env, gamma)
                numsteps.append(steps)
                avgNumsteps.append(np.mean(numsteps[-10:]))
                allRewards.append(np.sum(scores))
                if episode % 1 == 0:
                    print("Reinforce with baseline -> episode: {}, total reward: {}, average_reward: {}, length: {}".format(episode,np.round(
                                                                                                                  np.sum(
                                                                                                                      scores),
                                                                                                                  decimals=3),
                                                                                                              np.round(
                                                                                                                  np.mean(
                                                                                                                      allRewards[
                                                                                                                      -10:]),
                                                                                                                  decimals=3),
                                                                                                      steps))
                break
            state = nextState['glyphs_crop']
    env.close()
    return policy, allRewards
import cv2
cv2.ocl.setUseOpenCL(False)

class ExploreEvent(minihack.reward_manager.Event):
    def __init__(self, reward: float, repeatable: bool, terminal_required: bool, terminal_sufficient: bool):
        super().__init__(reward, repeatable, terminal_required, terminal_sufficient)

    def check(self, env, previous_observation, action, observation) -> float:
        # blank spots are 32
        # agent is 64
        # agent spawn point is 60
        # pathways are 35
        # obs[1] is the char observation 
        # print("+++++++++++++++++++\nobs = \n++++++++++++++++++++++\n", observation[1])
        current = sum(np.count_nonzero(i == 35) for i in observation[1])
        current += sum(np.count_nonzero(i == 60) for i in observation[1])
        prev = sum(np.count_nonzero(i == 35) for i in previous_observation[1])
        prev += sum(np.count_nonzero(i == 60) for i in previous_observation[1])
        if current > prev:
            return self.reward
        else:
            return 0

class RenderRGB(gym.Wrapper):
    def __init__(self, env, key_name="pixel"):
        super().__init__(env)
        self.last_pixels = None
        self.viewer = None
        self.key_name = key_name

        render_modes = env.metadata['render.modes']
        render_modes.append("rgb_array")
        env.metadata['render.modes'] = render_modes

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.last_pixels = obs[self.key_name]
        return obs, reward, done, info

    def render(self, mode="human", **kwargs):
        img = self.last_pixels

        # Hacky but works
        if mode != "human":
            return img
        else:
            from gym.envs.classic_control import rendering

            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def reset(self):
        obs = self.env.reset()
        self.last_pixels = obs[self.key_name]
        return obs

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None

def create_env():
    global hyper_params
    # ACTIONS define the actions allowed by the agent
    MOVE_ACTIONS = tuple(nethack.CompassDirection)
    NAVIGATE_ACTIONS = MOVE_ACTIONS + (
        nethack.Command.OPEN,   # Not sure if needed
        nethack.Command.PICKUP, 
        nethack.Command.WEAR,   
        nethack.Command.WIELD,  
        nethack.Command.QUAFF,
        nethack.Command.INVOKE,
        nethack.Command.ZAP,
        nethack.Command.SWAP,   # Not sure if needed

        # Might need more? All actions and descriptions found here
        # https://minihack.readthedocs.io/en/latest/getting-started/action_spaces.html
    )
    pixel_obs = "pixel_crop"

    reward_manager = RewardManager()
    reward_manager.add_kill_event("minotaur", reward=1, terminal_required=False)
    strings = list()
    strings.append("The door opens.")
    reward_manager.add_message_event(strings, reward=1, terminal_required=True)

    strings = list()
    strings.append("It's solid stone.")
    reward_manager.add_message_event("It's solid stone.", reward=-0.75, terminal_required=False, repeatable=True)

    reward_manager.add_event(ExploreEvent(0.5, True, True, False))
    
    # Create env with modified actions
    # Probably can limit the observations as well
    env = gym.make(
        hyper_params["env-name"],
        observation_keys=("glyphs_crop", "chars", "colors", "pixel", "message", "blstats", pixel_obs),
        actions=NAVIGATE_ACTIONS,
        reward_lose=-1,
        reward_win=1,
        savedir="./games",
        reward_manager=reward_manager
    )

    env.seed(hyper_params["seed"])
    env = RenderRGB(env, pixel_obs)
    # env = gym.wrappers.Monitor(env, "recordings", force=True)

    return env

def run_reinforce():
    global hyper_params
    env = create_env()
    print("number of actions: ",env.action_space)
    #print(env.observation_space['glyphs'])
    #deimension of game space
    size = 9 * 9
    hSize = round(size/2)
    num_epi = 50
    policy_model = SimplePolicy(s_size=size, h_size=size, a_size=env.action_space.n,learning_rate=hyper_params['learning-rate']).to(device)
    stateval_model = StateValueNetwork(s_size=size, h_size=size,learning_rate=hyper_params['learning-rate']).to(device)
    policy, scores = reinforce_naive_baseline(env=env, policy_model=policy_model, state_model=stateval_model, seed=42,
                               number_episodes=num_epi,
                               max_episode_length=hyper_params['num-steps'],
                               gamma=hyper_params['discount-factor'],
                               verbose=True)
    # Plot learning curve
    plt.plot(scores,'o')
    plt.xlabel('Episodes')
    plt.ylabel('Average reward')
    plt.title('Average reward per episode')
    plt.show()


if __name__ == '__main__':
    hyper_params = {
        "seed": 42,  # which seed to use
        "env-name": "MiniHack-Quest-Hard-v0",  # name of the game
        "learning-rate": 1e-2,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(10000),  # total number of steps to run the environment for
        "print-freq": 25, # number of iterations between each print out
        "save-freq": 500, # number of iterations between each model save
    }
    run_reinforce()

number of actions:  Discrete(16)
Reinforce with baseline -> episode: 0, total reward: -700.48, average_reward: -700.48, length: 999
Reinforce with baseline -> episode: 1, total reward: -703.48, average_reward: -701.98, length: 999
Reinforce with baseline -> episode: 2, total reward: -727.19, average_reward: -710.383, length: 999
Reinforce with baseline -> episode: 3, total reward: -715.83, average_reward: -711.745, length: 999
Reinforce with baseline -> episode: 4, total reward: -725.91, average_reward: -714.578, length: 999
Reinforce with baseline -> episode: 5, total reward: -715.48, average_reward: -714.728, length: 999
Reinforce with baseline -> episode: 6, total reward: -707.11, average_reward: -713.64, length: 999
Reinforce with baseline -> episode: 7, total reward: -698.48, average_reward: -711.745, length: 999
