In [1]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym

In [2]:
class ActorCriticNet(nn.Module):
    def __init__(self,lr,input_dims,num_actions,fc1_dims=2048,fc2_dims= 1024):
        super(ActorCriticNet,self).__init__()
        self.fc1 = nn.Linear(*input_dims,fc1_dims)
        self.fc2 = nn.Linear(fc1_dims,fc2_dims)
        self.pi = nn.Linear(fc2_dims,num_actions)
        self.v = nn.Linear(fc2_dims,1)
        self.optimizer = optim.Adam(self.parameters(),lr = lr)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    def forward(self,state):
        S = F.relu(self.fc1(state))
        S = F.relu(self.fc2(S))
        pi = self.pi(S)
        v = self.v(S)
        return (pi,v)

In [3]:
class AC_Agent():
    def __init__(self,lr,input_dims,fc1_dims,fc2_dims,num_actions,gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        self.gamma = gamma
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.actor_critic = ActorCriticNet(lr,input_dims,num_actions,fc1_dims,fc2_dims)
        self.log_prob = None
        
    def select_action(self,obs):
        state = T.tensor([obs],dtype = T.float).to(self.actor_critic.device)
        probs,_ =self.actor_critic.forward(state)
        probs = F.softmax(probs,dim=1)
        action_probs = T.distributions.Categorical(probs)
        action = action_probs.sample()
        log_prob = action_probs.log_prob(action)
        self.log_prob = log_prob
        return action.item()
    def train(self,state,reward,next_state,done):
        state = T.tensor([state],dtype = T.float).to(self.actor_critic.device)
        next_state = T.tensor([next_state],dtype = T.float).to(self.actor_critic.device)
        reward = T.tensor(reward,dtype = T.float).to(self.actor_critic.device)
        
        _,critic_value = self.actor_critic.forward(state)
        _,next_critic_value = self.actor_critic.forward(next_state)
        delta = reward + self.gamma*next_critic_value*(1-int(done)) - critic_value
        
        actor_loss = -self.log_prob*delta
        critic_loss = delta**2
        (actor_loss + critic_loss).backward()
        self.actor_critic.optimizer.step()

In [None]:
env = gym.make('CartPole-v0')
agent = AC_Agent(gamma = 0.99,lr = 1e-5,input_dims=[4],num_actions=2,
                fc1_dims=2048,fc2_dims = 1024)
num_epochs = 1000
scores = []
for epoch in range(num_epochs):
    done = False
    obs = env.reset()
    score = 0
    while not done:
        action = agent.select_action(obs)
        next_obs,reward,done,info = env.step(action)
        score += reward
        agent.train(obs,reward,next_obs,done)
        obs = next_obs
    scores.append(score)
    avg_score = np.mean(scores[-100:])
    print('episode: {} score: {} average score: {}'.format(epoch,score,avg_score))

  state = T.tensor([obs],dtype = T.float).to(self.actor_critic.device)


episode: 0 score: 14.0 average score: 14.0
episode: 1 score: 36.0 average score: 25.0
episode: 2 score: 15.0 average score: 21.666666666666668
episode: 3 score: 18.0 average score: 20.75
episode: 4 score: 19.0 average score: 20.4
episode: 5 score: 12.0 average score: 19.0
episode: 6 score: 11.0 average score: 17.857142857142858
episode: 7 score: 18.0 average score: 17.875
episode: 8 score: 11.0 average score: 17.11111111111111
episode: 9 score: 16.0 average score: 17.0
episode: 10 score: 17.0 average score: 17.0
episode: 11 score: 18.0 average score: 17.083333333333332
episode: 12 score: 19.0 average score: 17.23076923076923
episode: 13 score: 45.0 average score: 19.214285714285715
episode: 14 score: 39.0 average score: 20.533333333333335
episode: 15 score: 26.0 average score: 20.875
episode: 16 score: 27.0 average score: 21.235294117647058
episode: 17 score: 45.0 average score: 22.555555555555557
episode: 18 score: 71.0 average score: 25.105263157894736
episode: 19 score: 48.0 average

episode: 160 score: 21.0 average score: 27.61
episode: 161 score: 24.0 average score: 27.22
episode: 162 score: 30.0 average score: 26.96
episode: 163 score: 25.0 average score: 26.79
episode: 164 score: 39.0 average score: 26.8
episode: 165 score: 35.0 average score: 26.82
episode: 166 score: 28.0 average score: 26.75
episode: 167 score: 36.0 average score: 26.52
episode: 168 score: 41.0 average score: 26.41
episode: 169 score: 36.0 average score: 26.36
episode: 170 score: 32.0 average score: 26.35
episode: 171 score: 34.0 average score: 26.3
episode: 172 score: 38.0 average score: 26.22
episode: 173 score: 45.0 average score: 26.34
episode: 174 score: 43.0 average score: 26.32
episode: 175 score: 34.0 average score: 26.24
episode: 176 score: 43.0 average score: 26.38
episode: 177 score: 38.0 average score: 26.49
episode: 178 score: 38.0 average score: 26.59
episode: 179 score: 42.0 average score: 26.75
episode: 180 score: 28.0 average score: 26.73
episode: 181 score: 27.0 average sco

episode: 339 score: 19.0 average score: 41.89
episode: 340 score: 13.0 average score: 41.32
episode: 341 score: 14.0 average score: 40.74
episode: 342 score: 17.0 average score: 40.38
episode: 343 score: 18.0 average score: 39.67
episode: 344 score: 16.0 average score: 39.3
episode: 345 score: 20.0 average score: 38.65
episode: 346 score: 14.0 average score: 38.11
episode: 347 score: 16.0 average score: 37.62
episode: 348 score: 17.0 average score: 37.08
episode: 349 score: 19.0 average score: 36.56
episode: 350 score: 20.0 average score: 35.99
episode: 351 score: 15.0 average score: 35.28
episode: 352 score: 17.0 average score: 34.47
episode: 353 score: 18.0 average score: 33.82
episode: 354 score: 14.0 average score: 31.96
episode: 355 score: 20.0 average score: 30.91
episode: 356 score: 15.0 average score: 29.06
episode: 357 score: 18.0 average score: 28.01
episode: 358 score: 19.0 average score: 26.75
episode: 359 score: 15.0 average score: 25.93
episode: 360 score: 15.0 average sc

episode: 518 score: 12.0 average score: 12.83
episode: 519 score: 15.0 average score: 12.81
episode: 520 score: 10.0 average score: 12.75
episode: 521 score: 12.0 average score: 12.74
episode: 522 score: 11.0 average score: 12.7
episode: 523 score: 11.0 average score: 12.68
episode: 524 score: 11.0 average score: 12.64
episode: 525 score: 13.0 average score: 12.64
episode: 526 score: 11.0 average score: 12.63
episode: 527 score: 12.0 average score: 12.62
episode: 528 score: 12.0 average score: 12.59
episode: 529 score: 11.0 average score: 12.56
episode: 530 score: 16.0 average score: 12.6
episode: 531 score: 12.0 average score: 12.57
episode: 532 score: 10.0 average score: 12.55
episode: 533 score: 12.0 average score: 12.55
episode: 534 score: 12.0 average score: 12.53
episode: 535 score: 14.0 average score: 12.52
episode: 536 score: 14.0 average score: 12.54
episode: 537 score: 14.0 average score: 12.56
episode: 538 score: 12.0 average score: 12.55
episode: 539 score: 11.0 average sco