In [1]:
import torch     
from torch import Tensor                 
import torch.nn as nn                   
import torch.nn.functional as F           
import torch.optim as optim      
from torch.distributions import Categorical         
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [2]:
class actorcritic(nn.Module):
  def __init__(self, inp_size, out_size):
    super(actorcritic, self).__init__()
    self.ac_fc1 = nn.Linear(inp_size, 128)
    self.ac_fc2 = nn.Linear(128, 128)
    self.ac_fc3 = nn.Linear(128, out_size)

    self.cr_fc1 = nn.Linear(inp_size, 128)
    self.cr_fc2 = nn.Linear(128, 128)
    self.cr_fc3 = nn.Linear(128, 1)
  def forward(self, obs):
    #actor
    x= F.relu(self.ac_fc1(obs))
    x= F.relu(self.ac_fc2(x))
    x = self.ac_fc3(x)

    #critic
    y= F.relu(self.cr_fc1(obs))
    y= F.relu(self.cr_fc2(y))
    y = self.cr_fc3(y)


    return y, F.softmax(x)   #x used as estimate for value function. and softmax is used to select action

In [3]:
def train():
  GAMMA= 0.99
  LR = 0.001

  env = gym.make('CartPole-v0')
  obs = env.reset()

  net = actorcritic(env.observation_space.shape[0],env.action_space.n)
  optimizer =  optim.Adam(net.parameters(), lr = LR)

  rewards_list= []
  value_list = []
  action_log_prob_list =[]

  count = 0
  max_ep = 500
  episode_r =0

  while(True):

    value, action_prob = net(torch.from_numpy(obs).float())
    act = Categorical(action_prob)
    action = act.sample()

    obs, reward, done, info = env.step(action.item())
    episode_r+=reward

    v = value.item() if not done else 0
    rewards_list.append(reward)
    value_list.append(value.item())              
    action_log_prob_list.append(act.log_prob(action))


    if(done):

      print(episode_r)
      episode_r = 0

      obs = env.reset()
      loss = 0
      loss_v = 0

      discounted_rewards_list = np.zeros_like(rewards_list, float)
      for i, rewd in zip((reversed(range(len(rewards_list)))), reversed(rewards_list)):
        v = rewd + GAMMA*v
        discounted_rewards_list[i] = v


      for a, v, r in zip(action_log_prob_list, value_list, discounted_rewards_list):
        loss += -(a *(r -v))
        loss_v += (pow((r-v), 2))

      loss_ac = loss+ loss_v
      optimizer.zero_grad()
      loss_ac.backward()
      optimizer.step()

      rewards_list= []
      value_list = []
      action_log_prob_list =[]

      count +=1
      if(count == max_ep):
        print('done!')
        env.close()
        break



In [4]:
train()



12.0
16.0
16.0
20.0
16.0
26.0
13.0
20.0
10.0
13.0
16.0
12.0
15.0
18.0
15.0
10.0
17.0
29.0
18.0
16.0
13.0
10.0
13.0
42.0
18.0
19.0
13.0
13.0
43.0
19.0
51.0
12.0
10.0
10.0
17.0
11.0
12.0
24.0
10.0
16.0
15.0
23.0
18.0
29.0
29.0
27.0
16.0
10.0
12.0
23.0
24.0
11.0
16.0
16.0
21.0
19.0
14.0
21.0
22.0
25.0
24.0
14.0
23.0
39.0
38.0
108.0
11.0
20.0
19.0
9.0
11.0
18.0
19.0
32.0
180.0
24.0
10.0
12.0
43.0
26.0
30.0
45.0
15.0
49.0
32.0
13.0
27.0
56.0
24.0
15.0
21.0
34.0
24.0
48.0
20.0
26.0
65.0
48.0
35.0
73.0
80.0
39.0
46.0
22.0
100.0
16.0
45.0
25.0
41.0
41.0
83.0
63.0
37.0
16.0
20.0
22.0
67.0
35.0
81.0
76.0
111.0
48.0
25.0
57.0
58.0
62.0
105.0
27.0
113.0
52.0
53.0
35.0
74.0
51.0
56.0
20.0
31.0
23.0
58.0
40.0
32.0
40.0
56.0
76.0
40.0
38.0
20.0
49.0
93.0
30.0
74.0
75.0
66.0
35.0
81.0
92.0
145.0
63.0
65.0
134.0
38.0
34.0
148.0
40.0
147.0
123.0
85.0
110.0
58.0
78.0
46.0
134.0
68.0
86.0
153.0
179.0
55.0
127.0
152.0
104.0
105.0
115.0
189.0
100.0
98.0
119.0
93.0
123.0
84.0
84.0
110.0
92.0
80.0
197.0
199.0