## Random Agent


In [3]:
from typing import Mapping, Any

import numpy as np

import textworld.gym


class RandomAgent(textworld.gym.Agent):
    """ Agent that randomly selects a command from the admissible ones. """
    def __init__(self, seed=1234):
        self.seed = seed
        self.rng = np.random.RandomState(self.seed)

    @property
    def infos_to_request(self) -> textworld.EnvInfos:
        return textworld.EnvInfos(admissible_commands=True)
    
    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> str:
        return self.rng.choice(infos["admissible_commands"])


## Play Function

In [4]:
import os
from glob import glob

import gym
import textworld.gym


def play(agent, path, max_step=100, nb_episodes=10, verbose=True):
    infos_to_request = agent.infos_to_request
    infos_to_request.max_score = True  # Needed to normalize the scores.
    
    gamefiles = [path]
    if os.path.isdir(path):
        gamefiles = glob(os.path.join(path, "*.ulx"))
        
    env_id = textworld.gym.register_games(gamefiles,
                                          request_infos=infos_to_request,
                                          max_episode_steps=max_step)
    env = gym.make(env_id)  # Create a Gym environment to play the text game.
    if verbose:
        if os.path.isdir(path):
            print(os.path.dirname(path), end="")
        else:
            print(os.path.basename(path), end="")
        
    # Collect some statistics: nb_steps, final reward.
    avg_moves, avg_scores, avg_norm_scores = [], [], []
    for no_episode in range(nb_episodes):
        obs, infos = env.reset()  # Start new episode.

        score = 0
        done = False
        nb_moves = 0
        while not done:
            command = agent.act(obs, score, done, infos)
            obs, score, done, infos = env.step(command)
            nb_moves += 1
        
        agent.act(obs, score, done, infos)  # Let the agent know the game is done.
                
        if verbose:
            print(".", end="")
        avg_moves.append(nb_moves)
        avg_scores.append(score)
        avg_norm_scores.append(score / infos["max_score"])

    env.close()
    msg = "  \tavg. steps: {:5.1f}; avg. score: {:4.1f} / {}."
    if verbose:
        if os.path.isdir(path):
            print(msg.format(np.mean(avg_moves), np.mean(avg_norm_scores), 1))
        else:
            print(msg.format(np.mean(avg_moves), np.mean(avg_scores), infos["max_score"]))
    

## Testing the Random Agent

In [5]:
play(RandomAgent(),"./games/rewardsDense_goalDetailed.ulx")
play(RandomAgent(), "./games/rewardsBalanced_goalDetailed.ulx")
play(RandomAgent(), "./games/rewardsSparse_goalDetailed.ulx")

rewardsDense_goalDetailed.ulx..........  	avg. steps: 100.0; avg. score:  4.2 / 10.
rewardsBalanced_goalDetailed.ulx..........  	avg. steps: 100.0; avg. score:  0.7 / 4.
rewardsSparse_goalDetailed.ulx..........  	avg. steps: 100.0; avg. score:  0.0 / 1.


## Neural Agent (with Pytorch)

In [6]:
import re
from typing import List, Mapping, Any, Optional
from collections import defaultdict

import numpy as np

import textworld
import textworld.gym
from textworld import EnvInfos

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CommandScorer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CommandScorer, self).__init__()
        torch.manual_seed(42)  # For reproducibility
        self.embedding    = nn.Embedding(input_size, hidden_size)
        self.encoder_gru  = nn.GRU(hidden_size, hidden_size)
        self.cmd_encoder_gru  = nn.GRU(hidden_size, hidden_size)
        self.state_gru    = nn.GRU(hidden_size, hidden_size)
        self.hidden_size  = hidden_size
        self.state_hidden = torch.zeros(1, 1, hidden_size, device=device)
        self.critic       = nn.Linear(hidden_size, 1)
        self.att_cmd      = nn.Linear(hidden_size * 2, 1)

    def forward(self, obs, commands, **kwargs):
        input_length = obs.size(0)
        batch_size = obs.size(1)
        nb_cmds = commands.size(1)

        embedded = self.embedding(obs)
        encoder_output, encoder_hidden = self.encoder_gru(embedded)
        state_output, state_hidden = self.state_gru(encoder_hidden, self.state_hidden)
        self.state_hidden = state_hidden
        value = self.critic(state_output)

        # Attention network over the commands.
        cmds_embedding = self.embedding.forward(commands)
        _, cmds_encoding_last_states = self.cmd_encoder_gru.forward(cmds_embedding)  # 1 x cmds x hidden

        # Same observed state for all commands.
        cmd_selector_input = torch.stack([state_hidden] * nb_cmds, 2)  # 1 x batch x cmds x hidden

        # Same command choices for the whole batch.
        cmds_encoding_last_states = torch.stack([cmds_encoding_last_states] * batch_size, 1)  # 1 x batch x cmds x hidden

        # Concatenate the observed state and command encodings.
        cmd_selector_input = torch.cat([cmd_selector_input, cmds_encoding_last_states], dim=-1)

        # Compute one score per command.
        scores = F.relu(self.att_cmd(cmd_selector_input)).squeeze(-1)  # 1 x Batch x cmds

        probs = F.softmax(scores, dim=2)  # 1 x Batch x cmds
        index = probs[0].multinomial(num_samples=1).unsqueeze(0) # 1 x batch x indx
        return scores, index, value

    def reset_hidden(self, batch_size):
        self.state_hidden = torch.zeros(1, batch_size, self.hidden_size, device=device)


class NeuralAgent:
    """ Simple Neural Agent for playing TextWorld games. """
    MAX_VOCAB_SIZE = 1000
    UPDATE_FREQUENCY = 10
    LOG_FREQUENCY = 1000
    GAMMA = 0.9
    
    def __init__(self) -> None:
        self._initialized = False
        self._epsiode_has_started = False
        self.id2word = ["<PAD>", "<UNK>"]
        self.word2id = {w: i for i, w in enumerate(self.id2word)}
        
        self.model = CommandScorer(input_size=self.MAX_VOCAB_SIZE, hidden_size=128)
        self.optimizer = optim.Adam(self.model.parameters(), 0.00003)
        
        self.mode = "test"
    
    def train(self):
        self.mode = "train"
        self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}
        self.transitions = []
        self.model.reset_hidden(1)
        self.last_score = 0
        self.no_train_step = 0
    
    def test(self):
        self.mode = "test"
        self.model.reset_hidden(1)
        
    @property
    def infos_to_request(self) -> EnvInfos:
        return EnvInfos(description=True, inventory=True, admissible_commands=True,
                        won=True, lost=True)
    
    def _get_word_id(self, word):
        if word not in self.word2id:
            if len(self.word2id) >= self.MAX_VOCAB_SIZE:
                return self.word2id["<UNK>"]
            
            self.id2word.append(word)
            self.word2id[word] = len(self.word2id)
            
        return self.word2id[word]
            
    def _tokenize(self, text):
        # Simple tokenizer: strip out all non-alphabetic characters.
        text = re.sub("[^a-zA-Z0-9\- ]", " ", text)
        word_ids = list(map(self._get_word_id, text.split()))
        return word_ids

    def _process(self, texts):
        texts = list(map(self._tokenize, texts))
        max_len = max(len(l) for l in texts)
        padded = np.ones((len(texts), max_len)) * self.word2id["<PAD>"]

        for i, text in enumerate(texts):
            padded[i, :len(text)] = text

        padded_tensor = torch.from_numpy(padded).type(torch.long).to(device)
        padded_tensor = padded_tensor.permute(1, 0) # Batch x Seq => Seq x Batch
        return padded_tensor
      
    def _discount_rewards(self, last_values):
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(self.transitions))):
            rewards, _, _, values = self.transitions[t]
            R = rewards + self.GAMMA * R
            adv = R - values
            returns.append(R)
            advantages.append(adv)
            
        return returns[::-1], advantages[::-1]

    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> Optional[str]:
        
        # Build agent's observation: feedback + look + inventory.
        input_ = "{}\n{}\n{}".format(obs, infos["description"], infos["inventory"])
        
        # Tokenize and pad the input and the commands to chose from.
        input_tensor = self._process([input_])
        commands_tensor = self._process(infos["admissible_commands"])
        
        # Get our next action and value prediction.
        outputs, indexes, values = self.model(input_tensor, commands_tensor)
        action = infos["admissible_commands"][indexes[0]]
        
        if self.mode == "test":
            if done:
                self.model.reset_hidden(1)
            return action
        
        self.no_train_step += 1
        
        if self.transitions:
            reward = score - self.last_score  # Reward is the gain/loss in score.
            self.last_score = score
            if infos["won"]:
                reward += 100
            if infos["lost"]:
                reward -= 100
                
            self.transitions[-1][0] = reward  # Update reward information.
        
        self.stats["max"]["score"].append(score)
        if self.no_train_step % self.UPDATE_FREQUENCY == 0:
            # Update model
            returns, advantages = self._discount_rewards(values)
            
            loss = 0
            for transition, ret, advantage in zip(self.transitions, returns, advantages):
                reward, indexes_, outputs_, values_ = transition
                
                advantage        = advantage.detach() # Block gradients flow here.
                probs            = F.softmax(outputs_, dim=2)
                log_probs        = torch.log(probs)
                log_action_probs = log_probs.gather(2, indexes_)
                policy_loss      = (-log_action_probs * advantage).sum()
                value_loss       = (.5 * (values_ - ret) ** 2.).sum()
                entropy     = (-probs * log_probs).sum()
                loss += policy_loss + 0.5 * value_loss - 0.1 * entropy
                
                self.stats["mean"]["reward"].append(reward)
                self.stats["mean"]["policy"].append(policy_loss.item())
                self.stats["mean"]["value"].append(value_loss.item())
                self.stats["mean"]["entropy"].append(entropy.item())
                self.stats["mean"]["confidence"].append(torch.exp(log_action_probs).item())
            
            if self.no_train_step % self.LOG_FREQUENCY == 0:
                msg = "{}. ".format(self.no_train_step)
                msg += "  ".join("{}: {:.3f}".format(k, np.mean(v)) for k, v in self.stats["mean"].items())
                msg += "  " + "  ".join("{}: {}".format(k, np.max(v)) for k, v in self.stats["max"].items())
                msg += "  vocab: {}".format(len(self.id2word))
                print(msg)
                self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}
            
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), 40)
            self.optimizer.step()
            self.optimizer.zero_grad()
        
            self.transitions = []
            self.model.reset_hidden(1)
        else:
            # Keep information about transitions for Truncated Backpropagation Through Time.
            self.transitions.append([None, indexes, outputs, values])  # Reward will be set on the next call
        
        if done:
            self.last_score = 0  # Will be starting a new episode. Reset the last score.
        
        return action

## Training the neural agent

In [7]:
agent = NeuralAgent()
play(agent, "./games/rewardsDense_goalDetailed.ulx")
play(RandomAgent(), "./games/rewardsBalanced_goalDetailed.ulx")
play(RandomAgent(), "./games/rewardsSparse_goalDetailed.ulx")

rewardsDense_goalDetailed.ulx..........  	avg. steps: 100.0; avg. score:  4.8 / 10.
rewardsBalanced_goalDetailed.ulx..........  	avg. steps: 100.0; avg. score:  0.7 / 4.
rewardsSparse_goalDetailed.ulx..........  	avg. steps: 100.0; avg. score:  0.0 / 1.


In [8]:
from time import time
agent = NeuralAgent()

print("Training")
agent.train()  # Tell the agent it should update its parameters.
starttime = time()
play(agent, "./games/rewardsDense_goalDetailed.ulx", nb_episodes=500, verbose=False)  # Dense rewards game.
print("Trained in {:.2f} secs".format(time() - starttime))

Training
1000. reward: 0.043  policy: 0.277  value: 0.070  entropy: 2.358  confidence: 0.096  score: 8  vocab: 263
2000. reward: -0.062  policy: -1.605  value: 21.004  entropy: 2.344  confidence: 0.098  score: 9  vocab: 307
3000. reward: -0.054  policy: -1.383  value: 23.397  entropy: 2.426  confidence: 0.093  score: 9  vocab: 309
4000. reward: 0.051  policy: 0.129  value: 0.118  entropy: 2.470  confidence: 0.089  score: 6  vocab: 310
5000. reward: 0.049  policy: 0.042  value: 0.093  entropy: 2.424  confidence: 0.097  score: 7  vocab: 310
6000. reward: -0.053  policy: -0.173  value: 5.477  entropy: 2.479  confidence: 0.089  score: 9  vocab: 312
7000. reward: 0.054  policy: 0.053  value: 0.079  entropy: 2.422  confidence: 0.097  score: 7  vocab: 312
8000. reward: 0.062  policy: 0.058  value: 0.110  entropy: 2.472  confidence: 0.092  score: 8  vocab: 312
9000. reward: 0.052  policy: -0.047  value: 0.095  entropy: 2.437  confidence: 0.098  score: 6  vocab: 312
10000. reward: 0.067  policy

In [9]:
# We report the score and steps averaged over 10 playthroughs.
agent.test()
play(agent, "./games/rewardsDense_goalDetailed.ulx")  # Dense rewards game.

rewardsDense_goalDetailed.ulx..........  	avg. steps:  85.4; avg. score:  9.0 / 10.


In [10]:
!tw-make tw-simple --rewards dense --goal detailed --seed 1 --output games/another_game.ulx -v -f

Global seed: 1
Game generated: /home/prasath/TextWorld-master/notebooks/games/another_game.ulx

Objective:
Hey, thanks for coming over to the TextWorld today, there is something I need you to do for me. First of all, you could, like, look and see that the antique trunk inside the bedroom is opened. Then, recover the old key from the antique trunk. Then, make absolutely sure that the wooden door inside the bedroom is unlocked. After unlocking the wooden door, open the wooden door in the bedroom. Then, try to head east. After that, try to travel south. Once you get through with that, take the milk from the couch within the living room. Having taken the milk, attempt to travel north. That done, rest the milk on the stove inside the kitchen. And if you do that, you're the winner!

Walkthrough:
open antique trunk > take old key from antique trunk > unlock wooden door with old key > open wooden door > go east > go south > take milk from couch > go north > put milk on stove

-= Stats =-
Nb. l

In [11]:
# We report the score and steps averaged over 10 playthroughs.
play(RandomAgent(), "./games/another_game.ulx")
play(agent, "./games/another_game.ulx")

another_game.ulx..........  	avg. steps: 100.0; avg. score:  3.9 / 8.
another_game.ulx..........  	avg. steps:  93.2; avg. score:  6.5 / 8.


In [12]:
! seq 1 100 | xargs -n1 -P4 tw-make tw-simple --rewards dense --goal detailed --format ulx --output training_games/ --seed

Global seed: 3
Global seed: 1
Global seed: 4
Global seed: 2
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-ekDZtbGXIbO5FKp8.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-E5eLHkaXFk6BSgR1.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-ek06H8B7uqoYFVEy.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-D8gMTlO8cPoEtgZx.ulx
Global seed: 5
Global seed: 6
Global seed: 7
Global seed: 8
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-7KpYUDDdckE0cBqZ.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-68kvf8x7TBd9Iq0P.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_game

Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-p1RLFX7MU7KjFy0d.ulx
Global seed: 57
Global seed: 58
Global seed: 59
Global seed: 60
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-WxnkhG07upKRhbNV.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-Kx32iybbtDGRFQl6.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-DWvMiBQvCR5sNkx.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-R7y2UJkBUeEqS8Bk.ulx
Global seed: 61
Global seed: 62
Global seed: 63
Global seed: 64
Game generated: /home/prasath/TextWorld-master/notebooks/training_games/tw-simple-rDense+gDetailed+train-house-GP-YnvLs6vYIWvWC51e.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/traini

In [13]:
from time import time
agent = NeuralAgent()

print("Training on 100 games")
agent.train()  # Tell the agent it should update its parameters.
starttime = time()
play(agent, "./training_games/", nb_episodes=100 * 5, verbose=False)  # Each game will be seen 5 times.
print("Trained in {:.2f} secs".format(time() - starttime))

Training on 100 games
1000. reward: 0.172  policy: 1.946  value: 19.621  entropy: 2.416  confidence: 0.092  score: 10  vocab: 486
2000. reward: 0.046  policy: 0.077  value: 0.070  entropy: 2.404  confidence: 0.094  score: 5  vocab: 554
3000. reward: 0.036  policy: 0.062  value: 0.060  entropy: 2.344  confidence: 0.099  score: 6  vocab: 597
4000. reward: 0.054  policy: 0.157  value: 0.108  entropy: 2.485  confidence: 0.088  score: 7  vocab: 615
5000. reward: 0.047  policy: 0.024  value: 0.086  entropy: 2.370  confidence: 0.099  score: 6  vocab: 631
6000. reward: -0.051  policy: -1.764  value: 25.028  entropy: 2.494  confidence: 0.088  score: 7  vocab: 658
7000. reward: 0.050  policy: -0.036  value: 0.085  entropy: 2.374  confidence: 0.098  score: 6  vocab: 659
8000. reward: 0.054  policy: 0.025  value: 0.079  entropy: 2.405  confidence: 0.099  score: 9  vocab: 663
9000. reward: 0.048  policy: -0.037  value: 0.076  entropy: 2.383  confidence: 0.099  score: 5  vocab: 670
10000. reward: 0.

### Evaluating agent on 20 test games

In [14]:
! seq 1 20 | xargs -n1 -P4 tw-make tw-simple --rewards dense --goal detailed --test --format ulx --output testing_games/ --seed

Global seed: 3
Global seed: 2
Global seed: 1
Global seed: 4
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-rDense+gDetailed+test-house-GP-E5eLHkaXFk6BSgR1.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-rDense+gDetailed+test-house-GP-D8gMTlO8cPoEtgZx.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-rDense+gDetailed+test-house-GP-ekDZtbGXIbO5FKp8.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-rDense+gDetailed+test-house-GP-ek06H8B7uqoYFVEy.ulx
Global seed: 5
Global seed: 6
Global seed: 7
Global seed: 8
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-rDense+gDetailed+test-house-GP-7KpYUDDdckE0cBqZ.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-rDense+gDetailed+test-house-GP-o2RVTmrEi6R5T3p0.ulx
Game generated: /home/prasath/TextWorld-master/notebooks/testing_games/tw-simple-r

In [15]:
agent.test()
play(agent, "./games/rewardsDense_goalDetailed.ulx")  # Averaged over 10 playthroughs.
play(agent, "./testing_games/", nb_episodes=20 * 10)  # Averaged over 10 playthroughs for each test game.
play(RandomAgent(), "./testing_games/", nb_episodes=20 * 10)

rewardsDense_goalDetailed.ulx..........  	avg. steps:  83.4; avg. score:  8.5 / 10.
./testing_games........................................................................................................................................................................................................  	avg. steps:  89.9; avg. score:  0.8 / 1.
./testing_games........................................................................................................................................................................................................  	avg. steps:  99.3; avg. score:  0.5 / 1.
