In [3]:
import numpy as np
from itertools import count
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from nle import nethack

In [None]:
# Training hyperparameters and run configuration.
device = torch.device('cpu')  # not referenced elsewhere in the visible notebook
seed = 1  # seeds both the environment and torch
max_episode_steps = 2000  # cap on steps per episode in main()
window = 25  # main() prints progress every `window` episodes
gamma = 0.99  # discount factor applied in finish_episode()
alpha = 0.1  # weight of the newest episode in an exponentially smoothed running reward
render = False  # when True, main() calls env.render() after every step
max_msg = 256  # divisor used by transform_observation(); note np.iinfo(np.uint8).max is 255, not 256

In [None]:
# The four cardinal movement actions (north, east, south, west).
# NOTE(review): this list is not referenced elsewhere in the visible notebook;
# the policy's output width is taken from env.action_space.n instead.
ACTIONS = [
    nethack.CompassCardinalDirection.N,
    nethack.CompassCardinalDirection.E,
    nethack.CompassCardinalDirection.S,
    nethack.CompassCardinalDirection.W,
]

In [None]:
# Positions of selected fields inside observation['blstats'].
# Only referenced from commented-out code in transform_observation().
# NOTE(review): in the NLE blstats layout index 18 appears to be the experience
# level and the hunger state appears to live at index 21 -- confirm against the
# installed NLE version before relying on 'hunger_level'.
STATS_INDICES = {
    'x_coordinate': 0,
    'y_coordinate': 1,
    'score': 9,
    'health_points': 10,
    'health_points_max': 11,
    'hunger_level': 18,
}

In [None]:
# Create the NetHack score task, then seed the environment and torch with the
# same configured seed.
env = gym.make("NetHackScore-v0")
env.seed(seed)
torch.manual_seed(seed)

In [None]:
#env.render() # How to see the environment
#env.step(action) # How to step through the environment 0 = UP, 1 = Right, 2 = Down, 3 = Left

In [None]:
class Policy(nn.Module):
    """Fully connected policy network used by the REINFORCE training loop.

    Maps an observation vector of width ``obs_size`` through three hidden
    ReLU layers (512, 128, 64 units) to a probability distribution over
    ``act_size`` actions.

    Attributes:
        saved_log_probs: per-episode list of log-probabilities of the actions
            that were sampled; filled by the caller and cleared after each
            policy update.
        rewards: per-episode list of rewards; filled by the caller and cleared
            after each policy update.
    """

    def __init__(self, obs_size, act_size):
        super(Policy, self).__init__()
        # Layer attribute names are kept stable so saved weights stay loadable.
        self.affine1 = nn.Linear(obs_size, 512)
        self.affine2 = nn.Linear(512, 128)
        self.affine3 = nn.Linear(128, 64)
        self.affine4 = nn.Linear(64, act_size)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: float tensor whose last dimension has width ``obs_size``;
                leading batch dimensions are optional.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of width ``act_size`` that sums to 1.
        """
        x = F.relu(self.affine1(x))
        x = F.relu(self.affine2(x))
        x = F.relu(self.affine3(x))
        action_scores = self.affine4(x)
        # Normalise over the action axis. With dim=0 a (1, act_size) batch was
        # normalised over the batch axis, giving all-ones output (a uniform
        # policy once Categorical renormalises) and zero gradients.
        return F.softmax(action_scores, dim=-1)

In [None]:
def crop_glyphs(glyphs, x, y, size=7):
    """Extract a flattened window of ``glyphs`` around position ``(x, y)``.

    The window covers columns ``[x - size, x + size)`` and rows
    ``[y - size, y + size)``. When it would cross the left/top edge (index 0)
    or the right/bottom limits (79 columns, 21 rows) it is shifted back inside
    so that its extent is preserved.

    Args:
        glyphs: 2-D grid indexed as ``glyphs[row][col]``.
        x: column index of the window anchor.
        y: row index of the window anchor.
        size: half-width of the window along each axis.

    Returns:
        1-D numpy array holding the window's cells in row-major order.
    """
    def _shifted_span(anchor, limit):
        # Half-open [lo, hi) span around the anchor, pushed back into range.
        lo, hi = anchor - size, anchor + size
        if lo < 0:
            hi -= lo
            lo = 0
        if hi > limit:
            lo -= hi - limit
            hi = limit
        return lo, hi

    col_lo, col_hi = _shifted_span(x, 79)
    row_lo, row_hi = _shifted_span(y, 21)

    cells = [
        glyphs[r][c]
        for r in np.arange(row_lo, row_hi, 1)
        for c in np.arange(col_lo, col_hi, 1)
    ]
    return np.asarray(cells)

In [None]:
def transform_observation(observation):
    """Turn a raw environment observation into the policy's input vector.

    Only the ``'message'`` entry of ``observation`` is used: it is divided by
    the module-level ``max_msg`` constant and returned. Glyph, character and
    stats features are not part of the result.

    Args:
        observation: mapping that provides a ``'message'`` entry supporting
            division by a number (presumably a numpy array -- TODO confirm).

    Returns:
        ``observation['message'] / max_msg``.
    """
    return observation['message'] / max_msg

In [None]:
# Reset once to obtain a sample transformed observation; its length is used
# below to size the policy's input layer.
# TODO: change the observation to the characters just around the agent.
state = transform_observation(env.reset())

In [None]:
# Input width comes from the sample state; output width is the environment's
# action count.
policy = Policy(state.shape[0], env.action_space.n)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
# float32 machine epsilon; finish_episode() adds it to the returns' standard
# deviation before dividing.
eps = np.finfo(np.float32).eps.item()

In [None]:
def select_action(state):
    """Sample an action for ``state`` from the current policy.

    The log-probability of the sampled action is appended to
    ``policy.saved_log_probs`` as a side effect.

    Args:
        state: numpy array holding one observation vector.

    Returns:
        The sampled action index as a Python int.
    """
    # Add a leading batch axis of size 1 before the forward pass.
    obs = torch.from_numpy(state).float().unsqueeze(0)
    dist = Categorical(policy(obs))
    choice = dist.sample()
    policy.saved_log_probs.append(dist.log_prob(choice))
    return choice.item()

In [None]:
# Sanity-check the transformed observation: values, type and shape.
print(state)
print(type(state))
print(state.shape)

In [None]:
# Smoke-test action sampling on the sample state.
# NOTE: select_action() also appends a log-prob to policy.saved_log_probs, and
# no matching reward is recorded here.
action = select_action(state)

In [None]:
# Display the action index sampled above.
action

In [None]:
# Training history: main() appends each episode's total reward and
# finish_episode() appends each episode's policy loss.
episode_rewards = []
episode_loss = []

In [None]:
def finish_episode():
    """Run one REINFORCE update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes
    discounted returns with ``gamma``, standardises them, takes one optimizer
    step on the summed ``-log_prob * return`` loss, records the loss in
    ``episode_loss`` and clears both per-episode buffers.

    Does nothing (apart from clearing the buffers) when no reward was
    recorded.
    """
    if not policy.rewards:
        # Nothing to learn from; torch.cat would fail on an empty list.
        del policy.saved_log_probs[:]
        return

    # Accumulate returns back-to-front, then flip once: appending and
    # reversing is linear, whereas inserting at index 0 is quadratic.
    discounted = []
    running_return = 0
    for reward in reversed(policy.rewards):
        running_return = reward + gamma * running_return
        discounted.append(running_return)
    discounted.reverse()

    returns = torch.tensor(discounted, dtype=torch.float32)
    if len(discounted) > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)
    # else: the sample std of a single value is NaN, which would turn the loss
    # and then every network weight into NaN, so a one-step episode keeps its
    # raw return.

    policy_loss = [
        -log_prob * ret
        for log_prob, ret in zip(policy.saved_log_probs, returns)
    ]
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    # Store a detached copy so the history does not keep the autograd graph.
    episode_loss.append(policy_loss.detach())
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [None]:
def main(num_episodes=500):
    """Train the policy with REINFORCE for up to ``num_episodes`` episodes.

    Each episode runs for at most ``max_episode_steps`` steps, records its
    total reward in ``episode_rewards`` and ends with ``finish_episode()``.
    Progress is printed every ``window`` episodes. Training stops early once
    the average episode reward of this run rises above 500 or falls below -25.

    Args:
        num_episodes: maximum number of episodes to run.
    """
    # Discard transitions left over from earlier manual select_action() calls;
    # a stale log-prob would shift every log-prob/return pair of the first
    # episode by one.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

    # Running total for this call only, so the average stays correct even if
    # episode_rewards already holds entries from a previous run.
    total_reward = 0
    for i_episode in range(1, num_episodes + 1):
        state, ep_reward = transform_observation(env.reset()), 0
        for _step in range(max_episode_steps):  # Don't infinite loop while learning
            action = select_action(state)
            observation, reward, done, _ = env.step(action)
            if render:
                env.render()
            state = transform_observation(observation)
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        episode_rewards.append(ep_reward)
        finish_episode()

        total_reward += ep_reward
        average_reward = total_reward / i_episode

        if i_episode % window == 0:
            print('Episode {}\tLast reward: {:.2f}\tLast action: {}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, action, average_reward))
        if average_reward > 500 or average_reward < -25:
            print("End! Average reward is now {}".format(average_reward))
            break

In [None]:
# Run the training loop.
main()

In [None]:
# Persist the trained policy; passing the module object serialises the whole
# module rather than just its weights.
# NOTE(review): the destination is an absolute, machine-specific path --
# consider a configurable relative path and saving policy.state_dict().
torch.save(policy, '/home/clarise/Desktop/COMS7053A - RL/mod_msg4.pt')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot each episode's reward together with the running average reward.
x = np.arange(1, len(episode_rewards)+1, 1)
y = episode_rewards
# Mean of episodes 1..k at position k. The previous loop summed y[0:i] but
# divided by i + 1, so the current episode was left out of every average and
# the first point was always 0.
avg = np.cumsum(y) / x
plt.plot(x,y, color = 'Purple', label = 'Actual Reward')
plt.plot(x, avg, color = 'Blue', label = 'Average Reward')
plt.xlabel('Episodes')
plt.ylabel('Reward')
plt.title('Rewards recieved for REINFORCE trained on messages')
plt.legend()
plt.show()