<a href="https://colab.research.google.com/github/Sourav1429/Reinforcement_Learning/blob/main/CartPole_Reinforce_Actor_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import torch
import gym
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
from torchsummary import summary

# Discount factor: update_policy weights a reward k steps in the future by GAMMA**k.
GAMMA = 0.9

In [None]:
class PolicyNetwork(nn.Module):
  """Two-layer softmax policy over a discrete action set.

  The network maps a state vector to action probabilities through one
  ReLU hidden layer. It owns its Adam optimizer (``self.optimizer``) so
  that training code can step it directly.

  Args:
    num_inputs: size of the state vector fed to the first layer.
    num_actions: number of discrete actions (size of the output layer).
    hidden_size: width of the hidden layer.
    LR: learning rate passed to Adam.
  """

  def __init__(self,num_inputs,num_actions,hidden_size,LR=3e-4):
    super(PolicyNetwork,self).__init__()

    self.num_actions = num_actions
    self.linear1 = nn.Linear(num_inputs,hidden_size)
    self.linear2 = nn.Linear(hidden_size,num_actions)
    # Built after the layers so that self.parameters() already contains them.
    self.optimizer = optim.Adam(self.parameters(),lr=LR)
    # Echo the layer layout on construction (kept: notebook cells rely on it).
    print(self)

  def forward(self,state):
    """Return action probabilities for ``state``.

    Args:
      state: float tensor whose last dimension has ``num_inputs`` entries;
        may be a single state (1-D) or a batch (2-D).

    Returns:
      Tensor of the same leading shape with ``num_actions`` probabilities
      along the last dimension.
    """
    hidden = F.relu(self.linear1(state))
    # Normalise over the last axis: identical to dim=1 for batched input,
    # but also valid for an unbatched 1-D state.
    return F.softmax(self.linear2(hidden),dim=-1)

  def get_action(self,state,cond=0):
    """Sample one action for a single state.

    Args:
      state: one state as a numpy array (or any array-like numpy can convert).
      cond: debug switch; when equal to 1 the raw state and its type are printed.

    Returns:
      Tuple ``(action, log_prob, probs)``: the sampled action index, the
      log-probability of that action (attached to the autograd graph), and
      the full probability tensor of shape ``(1, num_actions)``.
    """
    if cond==1:
      print("Printing type of our state:",type(state));
      print(state);
    # as_tensor + asarray accepts lists as well as ndarrays; add the batch axis.
    state = torch.as_tensor(np.asarray(state),dtype=torch.float32).unsqueeze(0)
    # Call the module (not .forward) so registered hooks run.
    probs = self(state)
    # Drop only the batch axis; an axis-less squeeze would collapse a
    # single-action distribution to 0-d and break np.random.choice.
    sampling_probs = np.squeeze(probs.detach().numpy(),axis=0)
    sampled_action = np.random.choice(self.num_actions,p=sampling_probs)
    log_prob = torch.log(probs.squeeze(0)[sampled_action])
    return sampled_action,log_prob,probs

In [None]:
def update_policy(policy_network,rewards,log_probs,gamma=None):
  """Run one REINFORCE gradient step from a finished episode.

  Computes the discounted return for every time step, standardises the
  returns, builds the loss ``sum(-log_prob_t * return_t)`` and steps
  ``policy_network.optimizer`` once.

  Args:
    policy_network: object exposing an ``optimizer`` attribute whose
      parameters produced ``log_probs``.
    rewards: per-step rewards of the episode, in time order.
    log_probs: per-step log-probability tensors of the actions taken.
    gamma: discount factor; defaults to the module-level ``GAMMA``.

  Returns:
    None. An empty episode is a no-op.
  """
  if gamma is None:
    gamma = GAMMA
  if len(rewards)==0:
    # Nothing to learn from; torch.stack on an empty list would raise.
    return

  # G_t = r_t + gamma * G_{t+1}, accumulated backwards in O(n) instead of
  # re-summing the tail of the episode for every t.
  discounted_rewards = []
  running_return = 0.0
  for r in reversed(rewards):
    running_return = r + gamma*running_return
    discounted_rewards.append(running_return)
  discounted_rewards.reverse()

  # Pin the dtype so float64 rewards do not silently promote the loss.
  discounted_rewards = torch.tensor(discounted_rewards,dtype=torch.float32)
  centered = discounted_rewards - discounted_rewards.mean()
  if discounted_rewards.numel()>1:
    discounted_rewards = centered/(discounted_rewards.std()+1e-9)
  else:
    # The sample std of a single value is NaN and would poison every weight;
    # a lone return carries no relative signal, so only centre it.
    discounted_rewards = centered

  policy_gradient = [-log_prob*Gt for log_prob,Gt in zip(log_probs,discounted_rewards)]
  if not policy_gradient:
    return

  policy_network.optimizer.zero_grad()
  loss = torch.stack(policy_gradient).sum()
  loss.backward()
  policy_network.optimizer.step()

In [None]:
def main(max_episode_num=5000,max_steps=10000):
  """Train a PolicyNetwork on CartPole-v0 with REINFORCE and plot progress.

  Each episode is rolled out with the current policy; when the environment
  signals the end of the episode the policy is updated once from the
  collected rewards and log-probabilities, and a progress line is written
  to stdout. After training, episode lengths (raw and 10-episode mean) and
  total rewards are plotted.

  Args:
    max_episode_num: number of episodes to run.
    max_steps: upper bound on steps per episode; an episode that reaches
      it without ending is dropped without an update.
  """
  env = gym.make('CartPole-v0')
  policy_net = PolicyNetwork(env.observation_space.shape[0],env.action_space.n,128)
  numsteps = []
  avg_numsteps = []
  all_rewards = []

  try:
    for episode in range(max_episode_num):
      # Newer gym returns (observation, info) from reset(); older returns the observation.
      reset_result = env.reset()
      state = reset_result[0] if isinstance(reset_result,tuple) else reset_result
      log_probs = []
      rewards = []
      # Dump the very first state of the run for debugging, then stay quiet.
      cond = 1 if episode==0 else 0
      for step in range(max_steps):
        action,log_prob,_ = policy_net.get_action(state,cond)
        cond = 0
        # Newer gym returns 5 values (terminated/truncated split); older returns 4.
        step_result = env.step(action)
        if len(step_result)==5:
          new_state,reward,terminated,truncated,_ = step_result
          done = terminated or truncated
        else:
          new_state,reward,done,_ = step_result
        log_probs.append(log_prob)
        rewards.append(reward)
        if done:
          update_policy(policy_net,rewards,log_probs)
          # step is 0-based, so the episode lasted step+1 transitions.
          episode_length = step+1
          numsteps.append(episode_length)
          avg_numsteps.append(np.mean(numsteps[-10:]))
          all_rewards.append(np.sum(rewards))
          sys.stdout.write("episode:{},total_reward:{},average_reward:{},length:{},\n".format(episode,np.round(np.sum(rewards),decimals=3),np.round(np.mean(all_rewards[-10:]),decimals=3),episode_length))
          break
        state = new_state
  finally:
    # Release the environment even if training is interrupted.
    env.close()

  plt.plot(numsteps)
  plt.plot(avg_numsteps)
  plt.xlabel('Episode')
  plt.ylabel('Steps')
  plt.legend(['Number of steps','Average_Numeber_of steps'])
  plt.show()
  plt.figure()
  plt.plot(all_rewards)
  plt.xlabel('Episode')
  plt.ylabel('Total reward')
  plt.show()

In [None]:
# Build the policy once for CartPole's spaces; the constructor prints the layer layout.
env = gym.make('CartPole-v0')
PolicyNetwork(
  num_inputs=env.observation_space.shape[0],
  num_actions=env.action_space.n,
  hidden_size=128,
);

PolicyNetwork(
  (linear1): Linear(in_features=4, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=2, bias=True)
)


In [None]:
# Start training only when this cell/module is executed directly.
if __name__ == "__main__":
  main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
I am entering
[tensor(0.7899, grad_fn=<MulBackward0>), tensor(0.6751, grad_fn=<MulBackward0>), tensor(0.6940, grad_fn=<MulBackward0>), tensor(0.5676, grad_fn=<MulBackward0>), tensor(0.5736, grad_fn=<MulBackward0>), tensor(0.5898, grad_fn=<MulBackward0>), tensor(0.5696, grad_fn=<MulBackward0>), tensor(0.2081, grad_fn=<MulBackward0>), tensor(0.1621, grad_fn=<MulBackward0>), tensor(0.1347, grad_fn=<MulBackward0>), tensor(-0.0228, grad_fn=<MulBackward0>), tensor(-0.0952, grad_fn=<MulBackward0>), tensor(-0.4524, grad_fn=<MulBackward0>), tensor(-0.2868, grad_fn=<MulBackward0>), tensor(-0.4481, grad_fn=<MulBackward0>), tensor(-1.2173, grad_fn=<MulBackward0>), tensor(-0.7165, grad_fn=<MulBackward0>), tensor(-1.9773, grad_fn=<MulBackward0>)]
episode:148,total_reward:18.0,average_reward:23.5,length:17,
Discounted_rewards: tensor([ 0.8291,  0.8236,  0.8174,  0.8106,  0.8030,  0.7946,  0.7853,  0.7749,
         0.7633,  0.7505,  0.73

KeyboardInterrupt: ignored