In [3]:
import torch 
from torch import optim
from torch import nn
from torch import distributions
import gym
import Box2D
import numpy as np

In [4]:
#Bellman Backups
T_a_1 = np.array([[0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0.5, 0.5, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]])
print(T_a_1)
T_a_2 = np.array([[0, 0, 1, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0], [0.7, 0, 0.3, 0, 0], [0, 0, 0, 0, 1]])
print(T_a_2)
pi = np.array([0.6, 0.4])
Rewards = np.array([[0], [1], [-1], [1], [-1]])
print(Rewards)
V = np.zeros((5, 1))
print(V)
gamma = 0.9
V_1 = pi[0]*(T_a_1@(Rewards + gamma*V)) + pi[1]*(T_a_2@(Rewards + gamma*V))
print(V_1)
V_2 = pi[0]*(T_a_1@(Rewards + gamma*V_1)) + pi[1]*(T_a_2@(Rewards + gamma*V_1))
print(V_2)
ans = 0.6*(0.5*(-1 + (0.9*0.4))+0.5*(1-0.9*0.72)) + 0.4*(0.1)
print(ans)

[[0.  1.  0.  0.  0. ]
 [0.  0.  1.  0.  0. ]
 [0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  1. ]]
[[0.  0.  1.  0.  0. ]
 [0.  0.  0.  0.  1. ]
 [0.  1.  0.  0.  0. ]
 [0.7 0.  0.3 0.  0. ]
 [0.  0.  0.  0.  1. ]]
[[ 0]
 [ 1]
 [-1]
 [ 1]
 [-1]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[[ 0.2 ]
 [-1.  ]
 [ 0.4 ]
 [-0.72]
 [-1.  ]]
[[-0.196 ]
 [-1.144 ]
 [-0.0464]
 [-1.1664]
 [-1.9   ]]
-0.04639999999999997


In [657]:
class BanditEnv(gym.Env):

    def __init__(self):
        self.action_space = gym.spaces.Discrete(10)
        self.observation_space = gym.spaces.Box(low=np.array([-1]), high=np.array([1]), dtype=np.float32)
        self.state = np.array([0])

    def reset(self):
        return np.array([0])

    def step(self, action):
        assert int(action) in self.action_space

        done = True
        s = np.array([0])
        r = float(-(action - 7) ** 2)
        info = {}
        return s, r, done, info

In [658]:
class Reinforce:
    def __init__(self, policy, env, optimizer):
        self.policy = policy
        self.env = env
        self.optimizer = optimizer

    @staticmethod
    def compute_expected_cost(trajectory, gamma, baseline):
        """
        Compute the expected cost of this episode for gradient backprop
        DO NOT change its method signature
        :param trajectory: a list of 3-tuple of (reward: Float, policy_output_probs: torch.Tensor, action: Int)
        NOTE: policy_output_probs will have a grad_fn, i.e., it's able to backpropagate gradients from your computed cost
        :param gamma: gamma
        :param baseline: a simple running mean baseline to be subtracted from the total discounted returns
        :return: a 2-tuple of torch.tensor([cost]) of this episode that allows backprop and updated baseline
        """
        cost = 0
        rewards, probs, actions = list(zip(*trajectory))
        #print(rewards)
        #print(probs)
        #print(actions)
        #Calculating the G_t
        G = torch.FloatTensor([0])
        for t in reversed(range(len(rewards))):
            G = gamma*G + rewards[t]
            if t!=0: 
                G = torch.cat((G, torch.FloatTensor([0])), dim = 0)
        #G = torch.FloatTensor(G)
        #Baseline
        p = 0.99
        G = (G - baseline)/G.std()
        baseline = p*baseline + (1-p)*G.mean()
        #Final cost function
        probs_a = [probs[i][actions[i]] for i in range(len(rewards))]
        cost = -1*torch.sum((G*torch.log(torch.FloatTensor(probs_a))))
        cost.requires_grad = True
        cost.retain_grad()
        return cost, baseline

    def train(self, num_episodes, gamma):
        """
        train the policy using REINFORCE for specified number of episodes
        :param num_episodes: number of episodes to train for
        :param gamma: gamma
        :return: self
        """
        baseline = 0
        rewards_over_eps = []
        for episode_i in range(num_episodes):
            ### YOUR CODE HERE ###
            self.optimizer.zero_grad()
            trajectory, total_reward = self.generate_episode()
            loss, baseline = self.compute_expected_cost(trajectory, gamma, baseline)
            loss.backward()
            self.optimizer.step()
            rewards_over_eps.append(-1*loss)
            print('Episode: %d Total Reward %3d Cost %5d' % (episode_i, total_reward, loss))
            ### -------------- ###
        return self

    def generate_episode(self):
        """
        run the environment for 1 episode
        NOTE: do not limit the number
        :return: whatever you need for training
        """
        trajectory = []
        actions = torch.empty(size = (0,))
        rewards = torch.empty(size = (0,))
        state = torch.Tensor([self.env.reset()])
        is_done = False
        total_reward = 0
        while True: 
            probs = torch.squeeze(self.policy.forward(state))
            action = torch.distributions.categorical.Categorical(probs).sample().item()
            #actions = torch.cat((actions, action), dim = 0)
            state, reward, is_done, ___ = self.env.step(action)
            state = torch.Tensor([state])
            #rewards = torch.cat((rewards, reward), dim = 0)
            trajectory.append([reward, probs, action])
            total_reward += reward
            if is_done:
                break
        return trajectory, total_reward
        


In [659]:
class MyPolicy(nn.Module):
    def __init__(self):
        super(MyPolicy, self).__init__()
        ### YOUR CODE HERE AND REMOVE `pass` below ###
        self.net_stack = nn.Sequential(
            nn.Linear(4, 16),
            nn.ReLU(), 
            nn.Linear(16, 16), 
            nn.ReLU(), 
            nn.Linear(16, 16), 
            nn.ReLU(), 
            nn.Linear(16, 2), 
            nn.Softmax(dim = 0),
        )
        #nn.init.kaiming_uniform(self.net_stack)

    def forward(self, x):
        ### YOUR CODE HERE AND REMOVE `pass` below ###
        result = self.net_stack(x)
        return result

In [660]:
policy = MyPolicy()
#Kaiming initialize the parameters
optimizer = optim.Adam(policy.parameters(), lr = 0.0001)
env = gym.make('CartPole-v1')
reinforcement = Reinforce(policy, env, optimizer)

In [661]:
reinforcement.train(10000, 0.99)

 Reward  36 Cost     0
Episode: 28 Total Reward  24 Cost     0
Episode: 29 Total Reward  17 Cost     0
Episode: 30 Total Reward  15 Cost     0
Episode: 31 Total Reward  21 Cost     0
Episode: 32 Total Reward  20 Cost     0
Episode: 33 Total Reward  78 Cost     0
Episode: 34 Total Reward  31 Cost     0
Episode: 35 Total Reward  18 Cost     0
Episode: 36 Total Reward  20 Cost     0
Episode: 37 Total Reward  17 Cost     0
Episode: 38 Total Reward  22 Cost     0
Episode: 39 Total Reward  31 Cost     0
Episode: 40 Total Reward  32 Cost     0
Episode: 41 Total Reward  28 Cost     0
Episode: 42 Total Reward  14 Cost     0
Episode: 43 Total Reward  10 Cost     0
Episode: 44 Total Reward  30 Cost     0
Episode: 45 Total Reward  16 Cost     0
Episode: 46 Total Reward  12 Cost     0
Episode: 47 Total Reward  21 Cost     0
Episode: 48 Total Reward  10 Cost     0
Episode: 49 Total Reward  23 Cost     0
Episode: 50 Total Reward  11 Cost     0
Episode: 51 Total Reward  16 Cost     0
Episode: 52 Total

KeyboardInterrupt: 