In [1008]:
import torch 
import numpy as np
from torch import optim
from torch import nn
from torch import distributions
import Box2D
import gym

In [1009]:
#A toy environment to train in + verify 
class BanditEnv(gym.Env):
    '''
    Toy env to test your implementation
    The state is fixed (bandit setup)
    Action space: gym.spaces.Discrete(10)
    Note that the action takes integer values
    '''
    def __init__(self):
        self.action_space = gym.spaces.Discrete(10)
        self.observation_space = gym.spaces.Box(low=np.array([-1]), high=np.array([1]), dtype=np.float32)
        self.state = np.array([0])

    def reset(self):
        return np.array([0])

    def step(self, action):
        assert int(action) in self.action_space

        done = True
        s = np.array([0])
        r = float(-(action - 7) ** 2)
        info = {}
        return s, r, done, info

In [1010]:
#Reinforce class for a learner, environment and optimizer. 
class Reinforce: 
    def __init__(self, policy, env, optimizer):
        self.policy = policy
        self.env = env
        self.optimizer = optimizer

    #Recall static methods don't need a class object to be called.
    @staticmethod
    def compute_expected_cost(trajectory, gamma, baseline):
        """
        Compute the expected cost of this episode for gradient backprop
        DO NOT change its method signature
        :param trajectory: a list of 3-tuple of (reward: Float, policy_output_probs: torch.Tensor, action: Int)
        NOTE: policy_output_probs will have a grad_fn, i.e., it's able to backpropagate gradients from your computed cost
        :param gamma: gamma
        :param baseline: a simple running mean baseline to be subtracted from the total discounted returns
        :return: a 2-tuple of torch.tensor([cost]) of this episode that allows backprop and updated baseline
        """
        #Compute discounted cost of a trajectory
        #J = -\sum_tG_t lnP(a_t|state_t, policy) where
        #G_t is the future reward from time step t on, i.e, 
        #G_t = \sum_k=t^{H-1} gamma^{k-h}reward_k
        #Pretty sure this is the part I fucked up. 
        #Blah blah blah blah math
        cost = 0
        rewards, probs, actions = list(zip(*trajectory))
        T = len(rewards)
        #Computing the G_t
        discounted_reward = 0
        G = []
        for t in reversed(range(T)):
            discounted_reward = rewards[t] + gamma*discounted_reward
            G.insert(0, discounted_reward)
        G = torch.FloatTensor(G)
        #Baseline nonsense that I'll do later.
        p = 0.99
        G = (G - baseline)/torch.std(G, unbiased = False)
        baseline = p*baseline + (1-p)*torch.mean(G)
        #Final cost functions.
        cost = 0
        for t in range(T):
            cost = cost - G[t]*torch.log(probs[t][actions[t]])
        return cost, baseline
        
    def train(self, num_episodes, gamma):
        """
        train the policy using REINFORCE for specified number of episodes
        :param num_episodes: number of episodes to train for
        :param gamma: gamma
        :return: self
        """
        baseline = 0
        total_reward_per_episode = []
        trajectory_lengths = []
        running_average_reward = 0
        running_average_cost = 0
        for episode_i in range(num_episodes):
            self.optimizer.zero_grad()
            trajectory, trajectory_length, total_reward = self.generate_episode()
            loss, baseline = self.compute_expected_cost(trajectory, gamma, baseline)
            loss.backward()
            self.optimizer.step()
            total_reward_per_episode.append(total_reward)
            if episode_i%200 == 0 and episode_i!=0:
                running_average_reward = np.sum(total_reward_per_episode[episode_i - 200:episode_i])/episode_i
                print("Episode: %d Reward: %5d " % (episode_i, total_reward))
        #iterate over episodes to get costs and use gradient descent to minimize them. Also print out the progress. Need to figure out if you're calculating the right rewards/costs. 
            if episode_i%2000 == 0 and episode_i!=0:
                torch.save(self.policy.state_dict(), 'mypolicy.pth')
                print("Checkpoint created at Episode %d" % (episode_i))

        return self

    def generate_episode(self):
        """
        run the environment for 1 episode
        NOTE: do not limit the number
        :return: whatever you need for training
        """
        #Uses the environment and policy to determine a trajectory. 
        state = torch.FloatTensor(self.env.reset())
        total_reward = 0
        trajectory = []
        while True:
            probs = self.policy.forward(state)
            action = torch.distributions.Categorical(probs).sample()
            state, reward, finished, __ = env.step(action.item())
            total_reward = total_reward + reward
            state = torch.FloatTensor(state)
            trajectory.append([reward, probs, action.item()])
            if finished:
                break
            #if len(trajectory) == 5:
            #    break
        trajectory_length = len(trajectory)

        return trajectory, trajectory_length, total_reward



In [1011]:
#Policy class- policy is parametrized by a neural network that returns P(action|state, params) for a state input. 

class MyPolicy(nn.Module):
    #A kaiming uniform initialization for the layers. 
    def __init__(self):
        super(MyPolicy, self).__init__()
        self.net_stack = nn.Sequential(
            nn.Linear(8, 25), 
            nn.ReLU(), 
            nn.Linear(25, 16), 
            nn.ReLU(), 
            nn.Linear(16, 16), 
            nn.ReLU(), 
            nn.Linear(16, 4), 
            nn.Softmax(dim = 0),
        )
        def kaim_init(m):
            if type(m) == nn.Linear:
                torch.nn.init.kaiming_uniform_(m.weight)
        self.net_stack.apply(kaim_init)

    def forward(self, x):
        result = self.net_stack(x)
        return result

In [1012]:
#network_policy = MyPolicy()
#env = gym.make('LunarLander-v2')
#optimizer = optim.Adam(network_policy.parameters(), lr = 0.0001)
#reinforcement_learner = Reinforce(network_policy, env, optimizer)
#trajectory, trajectory_length, total_reward = reinforcement_learner.generate_episode()
#rewards, probs, actions = list(zip(*trajectory))
#print(len(rewards))
#for i in range(5):
#    print(i)
#for i in reversed(range(5)):
#    print(i)

In [1013]:
#print(actions)
#print(probs)
#probs_actions = [probs[i][0, actions[i]] for i in range(len(actions))]
#print(probs_actions)

In [1014]:
#cost, baseline = reinforcement_learner.compute_expected_cost(trajectory, 0.99, 0)
#print(cost)
#cost.backward()

In [1015]:
#a = torch.FloatTensor([1, 2, 0, 4, 12])
#print(torch.mean(a))
#print(torch.std(a, unbiased = False))
#print(torch.sqrt(torch.sum((a - torch.mean(a))**2)/len(a)))

In [1016]:
policy = MyPolicy()
optimizer = optim.Adam(policy.parameters(), lr = 0.001)
env = gym.make('LunarLander-v2')
learner = Reinforce(policy, env, optimizer)
learner.train(10000, 0.99)
torch.save(model.state_dict(), 'mypolicy.pth')

Episode: 200 Reward:  -134 
Episode: 400 Reward:  -325 
Episode: 600 Reward:  -368 
Episode: 800 Reward:   -12 
Episode: 1000 Reward:  -410 
Episode: 1200 Reward:  -111 
Episode: 1400 Reward:  -372 
Episode: 1600 Reward:  -154 
Episode: 1800 Reward:  -281 
Episode: 2000 Reward:   -42 
Checkpoint created at Episode 2000
Episode: 2200 Reward:  -394 
Episode: 2400 Reward:  -254 
Episode: 2600 Reward:   -34 
Episode: 2800 Reward:   -52 
Episode: 3000 Reward:   -76 
Episode: 3200 Reward:  -247 
Episode: 3400 Reward:   -20 
Episode: 3600 Reward:   -94 
Episode: 3800 Reward:   -19 
Episode: 4000 Reward:   -18 
Checkpoint created at Episode 4000
Episode: 4200 Reward:   -33 
Episode: 4400 Reward:  -119 
Episode: 4600 Reward:    39 
Episode: 4800 Reward:   -15 
Episode: 5000 Reward:    18 
Episode: 5200 Reward:    35 
Episode: 5400 Reward:    98 
Episode: 5600 Reward:   119 
Episode: 5800 Reward:   130 
Episode: 6000 Reward:   225 
Checkpoint created at Episode 6000
Episode: 6200 Reward:   265 


KeyboardInterrupt: 