In [1]:
import random
import gym
import gym.spaces
from collections import namedtuple
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np

In [17]:
# Hyperparameters for the cross-entropy training run.
# Number of finished episodes iterate_batchs collects before yielding a batch.
batch_size = 100
# Width of the hidden layer of the policy network.
n_neurons = 128
# Percentile of discounted episode rewards used by filter_batch as the elite cutoff.
percentile = 30
# Per-step discount base; filter_batch scales an episode's reward by GAMMA ** episode_length.
GAMMA = 0.9

In [3]:
# One environment step: the observation the agent acted on and the action it chose.
Episode_Step = namedtuple('Episode_Step', ['observations', 'actions'])
# One finished episode: the sum of its rewards and the list of Episode_Step records it produced.
Episode = namedtuple('Episode', ['rewards', 'episode_step'])

In [4]:
def onehotencoder(x, highest_number):
    """Build a one-hot float vector.

    Args:
        x: Index of the entry to switch on (used directly as a numpy index).
        highest_number: Length of the returned vector.

    Returns:
        A 1-D float64 numpy array of zeros with the entry at ``x`` set to 1.
    """
    encoded = np.zeros(highest_number, dtype=np.float64)
    encoded[x] = 1.0
    return encoded
        

In [5]:
def OHE(tensor, h):
    """One-hot encode a sequence of class indices, one output row per element.

    Args:
        tensor: Sized iterable whose elements are convertible with ``int()``;
            each element is the class index of the corresponding row.
        h: Number of classes, i.e. the width of every one-hot row.

    Returns:
        A float numpy array of shape ``(len(tensor), h)`` where row ``i`` has a
        single 1 in column ``int(tensor[i])`` and zeros elsewhere.

    Raises:
        IndexError: If a class index does not fit in ``h`` columns.
    """
    indices = np.array([int(t) for t in tensor], dtype=np.int64)
    z0 = np.zeros([len(indices), h])
    # Row i belongs to the i-th element. Indexing rows by the class value instead
    # would overwrite repeated classes and fail whenever a class index is >= len(tensor).
    z0[np.arange(len(indices)), indices] = 1
    return z0

In [6]:
class model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        obs_size: Number of input features.
        n_neurons: Number of units in the hidden layer.
        n_actions: Number of output values.
    """

    def __init__(self, obs_size, n_neurons, n_actions):
        super().__init__()

        hidden_layer = nn.Linear(obs_size, n_neurons)
        output_layer = nn.Linear(n_neurons, n_actions)
        self.pipe = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return the output of the final linear layer; no activation is applied to it."""
        scores = self.pipe(x)
        return scores

In [16]:
def iterate_batchs(env, model, batch_size):
    """Play episodes with the current policy forever and yield them in batches.

    Actions are sampled from the softmax of the network output for the current
    observation. The generator never terminates on its own; the caller stops
    iterating when it is done.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, is_done, info)``.
        model: Callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores.
        batch_size: Number of finished episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples, each holding the episode's
        total reward and its list of ``Episode_Step`` records.
    """
    batch = []
    episode_reward = 0.0
    steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # A single conversion that accepts any array-like observation and forces
        # float32, instead of wrapping torch.tensor() in torch.FloatTensor().
        obs_v = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
        # Sampling needs no gradients. The no_grad scope covers only the forward
        # pass: it must not be open across the `yield` below, or gradient tracking
        # would stay disabled in the caller while this generator is suspended.
        with torch.no_grad():
            actions_prob = sm(model(obs_v)).numpy()[0]

        action = np.random.choice(len(actions_prob), p=actions_prob)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Record the observation the action was chosen for, not the resulting one.
        steps.append(Episode_Step(observations=obs, actions=action))

        if is_done:
            batch.append(Episode(rewards=episode_reward, episode_step=steps))
            episode_reward = 0.0
            next_obs = env.reset()
            steps = []

            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs

In [8]:
def filter_batch(batch, percentile, gamma=None):
    """Select the elite episodes of a batch by discounted reward.

    Each episode is scored as ``rewards * gamma ** number_of_steps``, so among
    episodes with the same reward the shorter one scores higher. Episodes whose
    score is strictly above the given percentile of all scores are kept.

    Args:
        batch: Iterable of episode records exposing ``rewards`` and
            ``episode_step`` (a list of steps with ``observations`` and
            ``actions``).
        percentile: Percentile (0-100) of the scores used as the cutoff.
        gamma: Discount base; defaults to the module-level ``GAMMA``.

    Returns:
        Tuple ``(elite_batch, train_obs, train_act, reward_bound)``: the kept
        episodes, the flattened observations and actions of their steps, and
        the cutoff score.
    """
    if gamma is None:
        gamma = GAMMA
    # Field names must match the Episode namedtuple: `rewards` / `episode_step`
    # (the former `reward` / `steps` attributes do not exist).
    disc_rewards = [ep.rewards * (gamma ** len(ep.episode_step)) for ep in batch]
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    elite_batch = []
    for example, discounted_reward in zip(batch, disc_rewards):
        # Strict comparison: when every episode has the same score nothing is kept.
        if discounted_reward > reward_bound:
            train_obs.extend(step.observations for step in example.episode_step)
            train_act.extend(step.actions for step in example.episode_step)
            elite_batch.append(example)

    return elite_batch, train_obs, train_act, reward_bound

In [9]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that turns a Discrete observation into a one-hot vector.

    The wrapped environment must have a ``gym.spaces.Discrete`` observation
    space with ``n`` values; observations become float32 vectors of length ``n``.
    """

    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)

        shape = (env.observation_space.n, )
        # The upper bound is 1.0 so that the one-hot vectors produced by
        # observation() actually lie inside the declared space (0.1 excluded them).
        self.observation_space = gym.spaces.Box(0.0, 1.0, shape, dtype=np.float32)

    def observation(self, observation):
        """Return a copy of the all-zero lower bound with index ``observation`` set to 1."""
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

In [11]:
from tensorboardX import SummaryWriter

In [15]:
# Mean episode reward at which training stops. The logged reward_mean for this
# environment stays within [0, 1], so the former threshold of 199 could never be
# reached. NOTE(review): 0.8 is a chosen target - tune as needed.
REWARD_GOAL = 0.8

env = DiscreteOneHotWrapper(gym.make("FrozenLake-v1"))
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = model(obs_size, n_neurons, n_actions)
# CrossEntropyLoss takes the raw network output and integer class targets.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
writer = SummaryWriter(comment="-frozenlake-naive")

try:
    for iter_no, batch in enumerate(iterate_batchs(env, net, batch_size)):
        # Mean undiscounted reward over every episode of the batch, elite or not.
        reward_m = float(np.mean([episode.rewards for episode in batch]))
        # filter_batch returns (elite_batch, train_obs, train_act, reward_bound).
        _elite, train_obs, train_act, reward_b = filter_batch(batch, percentile)

        if train_obs:
            # The network and the loss need tensors, not Python lists.
            obs_v = torch.as_tensor(np.asarray(train_obs), dtype=torch.float32)
            act_v = torch.as_tensor(np.asarray(train_act), dtype=torch.long)

            optimizer.zero_grad()
            loss = objective(net(obs_v), act_v)
            loss.backward()
            optimizer.step()

            print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
                iter_no, loss.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss.item(), iter_no)
        else:
            # No episode scored strictly above the bound (e.g. every episode
            # failed), so there is nothing to train on for this batch.
            print("%d: no elite episodes, update skipped, reward_mean=%.1f, reward_bound=%.1f" % (
                iter_no, reward_m, reward_b))

        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)
        if reward_m > REWARD_GOAL:
            print("Solved!")
            break
finally:
    # Runs on normal exit and on KeyboardInterrupt alike, so the event file is
    # flushed and the environment is released even when the run is aborted.
    writer.close()
    env.close()

0: loss=1.397, reward_mean=0.0, reward_bound=0.0
1: loss=1.381, reward_mean=0.1, reward_bound=0.0
2: loss=1.356, reward_mean=0.0, reward_bound=0.0
3: loss=1.338, reward_mean=0.0, reward_bound=0.0
4: loss=1.347, reward_mean=0.0, reward_bound=0.0
5: loss=1.287, reward_mean=0.1, reward_bound=0.0
6: loss=1.320, reward_mean=0.0, reward_bound=0.0
7: loss=1.328, reward_mean=0.1, reward_bound=0.0
8: loss=1.249, reward_mean=0.0, reward_bound=0.0
9: loss=1.185, reward_mean=0.0, reward_bound=0.0
10: loss=1.285, reward_mean=0.0, reward_bound=0.0
11: loss=1.199, reward_mean=0.0, reward_bound=0.0
12: loss=1.197, reward_mean=0.0, reward_bound=0.0
13: loss=1.172, reward_mean=0.0, reward_bound=0.0
14: loss=1.271, reward_mean=0.0, reward_bound=0.0
15: loss=1.214, reward_mean=0.0, reward_bound=0.0
16: loss=1.243, reward_mean=0.0, reward_bound=0.0
17: loss=1.196, reward_mean=0.1, reward_bound=0.0
18: loss=1.186, reward_mean=0.1, reward_bound=0.0
19: loss=1.082, reward_mean=0.0, reward_bound=0.0
20: loss=1

166: loss=0.921, reward_mean=0.1, reward_bound=0.0
167: loss=1.049, reward_mean=0.1, reward_bound=0.0
168: loss=1.024, reward_mean=0.1, reward_bound=0.0
169: loss=1.112, reward_mean=0.0, reward_bound=0.0
170: loss=0.927, reward_mean=0.1, reward_bound=0.0
171: loss=0.980, reward_mean=0.0, reward_bound=0.0
172: loss=1.129, reward_mean=0.0, reward_bound=0.0
173: loss=1.015, reward_mean=0.0, reward_bound=0.0
174: loss=1.010, reward_mean=0.0, reward_bound=0.0
175: loss=1.100, reward_mean=0.0, reward_bound=0.0
176: loss=1.091, reward_mean=0.0, reward_bound=0.0
177: loss=1.005, reward_mean=0.1, reward_bound=0.0
178: loss=1.096, reward_mean=0.0, reward_bound=0.0
179: loss=1.038, reward_mean=0.0, reward_bound=0.0
180: loss=0.955, reward_mean=0.0, reward_bound=0.0
181: loss=1.008, reward_mean=0.0, reward_bound=0.0
182: loss=1.144, reward_mean=0.0, reward_bound=0.0
183: loss=0.968, reward_mean=0.0, reward_bound=0.0
184: loss=1.053, reward_mean=0.1, reward_bound=0.0
185: loss=0.878, reward_mean=0.

327: loss=0.756, reward_mean=0.0, reward_bound=0.0
328: loss=0.863, reward_mean=0.1, reward_bound=0.0
329: loss=0.587, reward_mean=0.0, reward_bound=0.0
330: loss=0.739, reward_mean=0.0, reward_bound=0.0
331: loss=0.694, reward_mean=0.1, reward_bound=0.0
332: loss=0.654, reward_mean=0.1, reward_bound=0.0
333: loss=0.737, reward_mean=0.1, reward_bound=0.0
334: loss=0.807, reward_mean=0.1, reward_bound=0.0
335: loss=0.609, reward_mean=0.0, reward_bound=0.0
336: loss=0.723, reward_mean=0.0, reward_bound=0.0
337: loss=0.784, reward_mean=0.1, reward_bound=0.0
338: loss=0.738, reward_mean=0.0, reward_bound=0.0
339: loss=0.640, reward_mean=0.1, reward_bound=0.0
340: loss=0.681, reward_mean=0.0, reward_bound=0.0
341: loss=0.617, reward_mean=0.0, reward_bound=0.0
342: loss=0.640, reward_mean=0.1, reward_bound=0.0
343: loss=0.750, reward_mean=0.0, reward_bound=0.0
344: loss=0.787, reward_mean=0.1, reward_bound=0.0
345: loss=0.553, reward_mean=0.0, reward_bound=0.0
346: loss=0.684, reward_mean=0.

KeyboardInterrupt: 