In [1]:
#!pip install box2d
#!pip install wandb -q

import wandb
import torch     
from torch import Tensor        
import torch.autograd as autograd           
import torch.nn as nn                   
import torch.nn.functional as F
import torch.nn.init as init        
import torch.optim as optim      
from torch.distributions import Categorical         
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from collections import deque
import collections, itertools

In [2]:
# Authenticate with Weights & Biases up front; train() later creates a run and
# logs metrics through the same wandb session.
wandb.login()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Currently logged in as: olayemiy (use `wandb login --relogin` to force relogin)


True

In [3]:
def init_fanin(tensor):
  """Fill ``tensor`` in place with fan-in scaled uniform values.

  Samples are drawn from U(-1/sqrt(f), 1/sqrt(f)), where ``f`` is
  ``tensor.size(1)`` (the second dimension of the tensor). This is the
  hidden-layer initialisation referenced in section 7 ("Experiment details")
  of the DDPG paper.

  Args:
    tensor: a tensor with at least two dimensions; modified in place.

  Returns:
    None.
  """
  bound = 1.0 / np.sqrt(tensor.size(1))
  init.uniform_(tensor, a=-bound, b=bound)

In [4]:
 class DDPG_AC(nn.Module):
   """Deterministic DDPG actor network.

   Two hidden linear layers (fan-in initialised weights, each followed by
   LayerNorm and ReLU) feed a final linear layer whose output is squashed with
   tanh, so every action component lies in [-1, 1].

   Note: ``bn1`` / ``bn2`` are ``nn.LayerNorm`` modules despite the "bn" prefix.
   The attribute names are kept as they are because they appear as keys in the
   module's ``state_dict``.

   Args:
     obs_size: width of the observation vector.
     num_actions: width of the action vector.
     linear_dim1: width of the first hidden layer.
     linear_dim2: width of the second hidden layer.
   """

   def __init__(self, obs_size, num_actions, linear_dim1, linear_dim2):
     super().__init__()

     # Hidden layer 1.
     self.actor_fc1 = nn.Linear(obs_size, linear_dim1)
     init_fanin(self.actor_fc1.weight)
     self.bn1 = nn.LayerNorm(linear_dim1)

     # Hidden layer 2.
     self.actor_fc2 = nn.Linear(linear_dim1, linear_dim2)
     init_fanin(self.actor_fc2.weight)
     self.bn2 = nn.LayerNorm(linear_dim2)

     # Output layer: weight and bias both start in a narrow band around zero.
     final_bound = 3e-3
     self.actor_fc3 = nn.Linear(linear_dim2, num_actions)
     for param in (self.actor_fc3.weight, self.actor_fc3.bias):
       init.uniform_(param, -final_bound, final_bound)

   def forward(self, obs):
     """Return the tanh-bounded action for ``obs`` (last dim = ``obs_size``)."""
     hidden = self.bn1(self.actor_fc1(obs))
     hidden = F.relu(hidden)
     hidden = self.bn2(self.actor_fc2(hidden))
     hidden = F.relu(hidden)
     return torch.tanh(self.actor_fc3(hidden))


In [None]:
# Disabled smoke test for the actor; it is wrapped in a string literal so
# nothing inside it executes.
# NOTE(review): the DDPG_AC call inside passes three arguments, but the
# constructor takes (obs_size, num_actions, linear_dim1, linear_dim2) - fix the
# call before re-enabling this cell.
'''
env = gym.make('LunarLanderContinuous-v2')
obs = env.reset()
print(obs)
test = DDPG_AC(obs.shape[0],env.action_space.shape[0], 25).float() 
print(obs.shape)
print(env.action_space.shape)
print(test.forward(torch.tensor(obs).float()))
'''



"\nenv = gym.make('LunarLanderContinuous-v2')\nobs = env.reset()\nprint(obs)\ntest = DDPG_AC(obs.shape[0],env.action_space.shape[0], 25).float() \nprint(obs.shape)\nprint(env.action_space.shape)\nprint(test.forward(torch.tensor(obs).float()))\n"

In [5]:
class DDPG_CR(nn.Module):
  """DDPG critic network estimating Q(obs, action).

  The observation alone goes through the first hidden layer; the action is
  concatenated to that layer's (LayerNorm + ReLU) output and enters at the
  second hidden layer. The final linear layer emits one unbounded value per
  input row.

  Note: ``bn1`` / ``bn2`` are ``nn.LayerNorm`` modules despite the "bn" prefix.
  The attribute names are kept as they are because they appear as keys in the
  module's ``state_dict``.

  Args:
    obs_size: width of the observation vector.
    num_actions: width of the action vector.
    linear_dim1: width of the first hidden layer.
    linear_dim2: width of the second hidden layer.
  """

  def __init__(self, obs_size, num_actions, linear_dim1, linear_dim2):
    super().__init__()

    # Observation-only hidden layer.
    self.critic_fc1 = nn.Linear(obs_size, linear_dim1)
    init_fanin(self.critic_fc1.weight)
    self.bn1 = nn.LayerNorm(linear_dim1)

    # Second hidden layer also receives the action, hence the wider input.
    joint_width = linear_dim1 + num_actions
    self.critic_fc2 = nn.Linear(joint_width, linear_dim2)
    init_fanin(self.critic_fc2.weight)
    self.bn2 = nn.LayerNorm(linear_dim2)

    # Scalar Q head: weight and bias both start in a narrow band around zero.
    final_bound = 3e-3
    self.critic_fc3 = nn.Linear(linear_dim2, 1)
    for param in (self.critic_fc3.weight, self.critic_fc3.bias):
      init.uniform_(param, -final_bound, final_bound)

  def forward(self, obs, action):
    """Return the Q-value for each (obs, action) pair; last dim has size 1."""
    obs_features = F.relu(self.bn1(self.critic_fc1(obs)))
    joint = torch.cat((obs_features, action), -1)
    joint = F.relu(self.bn2(self.critic_fc2(joint)))
    return self.critic_fc3(joint)

In [None]:
# Disabled smoke test for the critic; it is wrapped in a string literal so
# nothing inside it executes.
# NOTE(review): both constructor calls inside pass three arguments, but
# DDPG_AC / DDPG_CR take (obs_size, num_actions, linear_dim1, linear_dim2) -
# fix the calls before re-enabling this cell.
'''
env = gym.make('LunarLanderContinuous-v2')
obs = env.reset()
test = DDPG_AC(obs.shape[0],env.action_space.shape[0], 25) 
cr = DDPG_CR(obs.shape[0],env.action_space.shape[0], 25) 
obs = torch.tensor(obs)
action = test.forward(obs)
print(obs.shape, action.shape)
print(cr.forward(obs, action))
'''

"\nenv = gym.make('LunarLanderContinuous-v2')\nobs = env.reset()\ntest = DDPG_AC(obs.shape[0],env.action_space.shape[0], 25) \ncr = DDPG_CR(obs.shape[0],env.action_space.shape[0], 25) \nobs = torch.tensor(obs)\naction = test.forward(obs)\nprint(obs.shape, action.shape)\nprint(cr.forward(obs, action))\n"

In [None]:
class OUActionNoise(object):
    """Temporally correlated noise from a discretised Ornstein-Uhlenbeck process.

    Each call advances the internal state by one step of size ``dt``:

        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)

    and returns the new state.

    Args:
        mu: value the process is pulled towards; its ``shape`` sets the shape
            of the Gaussian sample drawn on every call.
        sigma: scale of the random term.
        theta: strength of the pull towards ``mu``.
        dt: step size.
        x0: optional starting state; when ``None`` the state starts at zeros
            shaped like ``mu``.
    """

    def __init__(self, mu, sigma=0.15, theta=.2, dt=1e-2, x0=None):
        self.theta, self.mu, self.sigma = theta, mu, sigma
        self.dt, self.x0 = dt, x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        pull = self.theta * (self.mu - self.x_prev) * self.dt
        kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + pull + kick
        return self.x_prev

    def reset(self):
        """Put the state back to ``x0``, or to zeros when no ``x0`` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

    def __repr__(self):
        """Short textual description showing ``mu`` and ``sigma``."""
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)


In [None]:
class Agent():
  """DDPG agent bundling the environment, networks, optimisers and replay memory.

  The agent owns an online actor/critic pair plus target copies of each, an
  Ornstein-Uhlenbeck exploration noise source, and a bounded replay memory of
  ``Experience(state, action, reward, done, state1)`` tuples.

  The code uses the gym API in which ``env.reset()`` returns the observation
  and ``env.step()`` returns ``(obs, reward, done, info)``.

  Args:
    ENV_NAME: id passed to ``gym.make``.
    LINEAR_SIZE1: width of the first hidden layer of both networks.
    LINEAR_SIZE2: width of the second hidden layer of both networks.
    REPLAY_SIZE: maximum number of experiences kept in memory.
    BATCH_SIZE: number of experiences sampled per update.
    TAU: interpolation factor for the soft target-network update.
    AC_LR: Adam learning rate for the actor.
    CR_LR: Adam learning rate for the critic.
  """

  def __init__(self, ENV_NAME='LunarLanderContinuous-v2', LINEAR_SIZE1=400, LINEAR_SIZE2=300, REPLAY_SIZE=int(1e6), BATCH_SIZE=64, TAU=0.001, AC_LR=0.001, CR_LR=0.001):
    self.GAMMA = 0.99  # discount factor
    self.TAU = TAU     # soft-update rate for the target networks
    self.batch_size = BATCH_SIZE

    self.env = gym.make(ENV_NAME)
    self.obs = self.env.reset()

    obs_size = self.obs.shape[0]
    num_actions = self.env.action_space.shape[0]

    self.noise = OUActionNoise(mu=np.zeros(num_actions))

    self.net_ac = DDPG_AC(obs_size, num_actions, LINEAR_SIZE1, LINEAR_SIZE2).float()
    self.net_cr = DDPG_CR(obs_size, num_actions, LINEAR_SIZE1, LINEAR_SIZE2).float()

    self.target_net_ac = DDPG_AC(obs_size, num_actions, LINEAR_SIZE1, LINEAR_SIZE2).float()
    self.target_net_cr = DDPG_CR(obs_size, num_actions, LINEAR_SIZE1, LINEAR_SIZE2).float()

    # Target networks start as exact copies of the online networks.
    self.target_net_ac.load_state_dict(self.net_ac.state_dict())
    self.target_net_cr.load_state_dict(self.net_cr.state_dict())

    self.optimizer_ac = optim.Adam(self.net_ac.parameters(), lr=AC_LR)
    self.optimizer_cr = optim.Adam(self.net_cr.parameters(), lr=CR_LR)

    self.Experience = collections.namedtuple('Experience',['state', 'action', 'reward', 'done', 'state1'])
    self.memory = collections.deque(maxlen = REPLAY_SIZE)
    self.max_replay = REPLAY_SIZE

  def init_replay_buffer(self, num_steps=5000, learn_start=1000, learn_every=50):
    """Seed the replay memory with uniformly random actions.

    Takes ``num_steps`` environment steps using ``action_space.sample()`` and
    stores every transition. From step ``learn_start`` onwards, one gradient
    update and one soft target update are performed every ``learn_every``
    steps (provided the memory already holds at least one batch).

    Args:
      num_steps: number of random environment steps to take.
      learn_start: first step index at which updates may happen.
      learn_every: positive spacing, in steps, between warm-up updates.
    """
    for i in range(num_steps):
      action = self.env.action_space.sample()
      obs1, reward, done, info = self.env.step(action)
      self.memory.append(self.Experience(self.obs, action, reward, done, obs1))
      self.obs = self.env.reset() if done else obs1

      due = i >= learn_start and i % learn_every == 0
      if due and len(self.memory) >= self.batch_size:
        self.calc_loss_update(*self.get_batch())
        self.update_target()

  def act(self, episode_step_count, max_episode_steps):
    """Take one exploratory step in the environment.

    The actor's output plus one sample of exploration noise is clipped to the
    environment's action bounds; the clipped action is both executed and
    stored, so the replay memory only ever holds actions that were actually
    applied. ``action_log`` / ``action_noised_log`` record the first component
    of the raw and the noisy (pre-clip) action for logging.

    Args:
      episode_step_count: steps already taken in the current episode.
      max_episode_steps: the episode is forced to end once
        ``episode_step_count`` reaches this value.

    Returns:
      ``(experience, reward, done)`` for the step just taken.
    """
    state = np.array(self.obs, dtype=np.float32)

    # Pure inference: no autograd graph is needed to pick an action.
    with torch.no_grad():
      action = self.net_ac(torch.from_numpy(state))
    self.action_log = action[0].item()

    noisy_action = action + torch.as_tensor(self.noise(), dtype=torch.float32)
    self.action_noised_log = noisy_action[0].item()

    executed = np.clip(noisy_action.numpy(),
                       self.env.action_space.low,
                       self.env.action_space.high).astype(np.float32)

    obs1, reward, done, info = self.env.step(executed)

    # Force termination at the step cap so the observation is reset and the
    # TD target stops bootstrapping for this transition.
    if episode_step_count >= max_episode_steps:
      done = True

    experience = self.Experience(state, executed, reward, done, obs1)

    self.obs = self.env.reset() if done else obs1

    return experience, reward, done  # done lets the caller count episodes

  def store_in_batch(self, experience):
    """Append one experience to the replay memory."""
    self.memory.append(experience)

  @staticmethod
  def _to_float_tensor(values):
    """Stack a sequence of scalars/arrays into one float32 tensor."""
    return torch.from_numpy(np.asarray(values, dtype=np.float32))

  def get_batch(self):
    """Sample ``batch_size`` distinct experiences uniformly at random.

    Returns:
      ``(states, actions, rewards, dones, state1s)`` as float32 tensors whose
      first dimension is the batch; ``dones`` holds 0.0 / 1.0.

    Raises:
      ValueError: from ``np.random.choice`` when the memory holds fewer than
        ``batch_size`` experiences.
    """
    random_indexes = np.random.choice(len(self.memory), self.batch_size, replace = False)

    # zip(*) transposes the list of experiences into one tuple per field.
    columns = zip(*[self.memory[index] for index in random_indexes])

    states, actions, rewards, dones, state1s = (self._to_float_tensor(column) for column in columns)

    return states, actions, rewards, dones, state1s

  def calc_loss_update(self, states, actions, rewards, dones, state1s):
    """Run one critic update followed by one actor update.

    Critic target: ``y = r + GAMMA * Q'(s1, mu'(s1)) * (1 - done)``, computed
    with the target networks and without gradient tracking. The critic
    minimises the mean squared error to ``y``; the actor minimises
    ``-Q(s, mu(s))``.

    Args:
      states, actions, rewards, dones, state1s: batch tensors as returned by
        ``get_batch``; any batch size (including 1) is accepted.

    Returns:
      ``(critic_loss, actor_loss)`` as scalar tensors.
    """
    states, actions, state1s = states.float(), actions.float(), state1s.float()
    rewards, dones = rewards.float(), dones.float()

    with torch.no_grad():
      next_actions = self.target_net_ac(state1s)
      # squeeze(-1) drops only the trailing size-1 Q dimension, so a batch of
      # one stays 1-D instead of collapsing to a 0-d tensor.
      next_q = self.target_net_cr(state1s, next_actions).squeeze(-1)
      y = rewards + self.GAMMA * next_q * (1 - dones)

    q = self.net_cr(states, actions).squeeze(-1)

    #Critic Loss
    critic_loss = F.mse_loss(q, y)

    self.optimizer_cr.zero_grad()
    critic_loss.backward()
    self.optimizer_cr.step()

    #Actor Loss
    # This backward pass also leaves gradients on the critic's parameters;
    # they are cleared by optimizer_cr.zero_grad() before the next critic step.
    actor_loss = -self.net_cr(states, self.net_ac(states)).mean()

    self.optimizer_ac.zero_grad()
    actor_loss.backward()
    self.optimizer_ac.step()

    return critic_loss, actor_loss

  def _soft_update(self, target_net, source_net):
    """Move ``target_net`` towards ``source_net``: t <- TAU * s + (1 - TAU) * t."""
    with torch.no_grad():
      for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.copy_(self.TAU * param + (1 - self.TAU) * target_param)

  def update_target(self):
    """Soft-update both target networks from their online counterparts."""
    self._soft_update(self.target_net_ac, self.net_ac)
    self._soft_update(self.target_net_cr, self.net_cr)

  def close_agent(self):
    """Close the underlying environment."""
    self.env.close()

In [None]:
def train(max_episodes=7000, max_episode_steps=1000, solved_score=200, checkpoint_every=20, checkpoint_path='lunar_saved.pth'):
  """Train a DDPG agent and log per-episode metrics to Weights & Biases.

  Every environment step performs one replay-sampled gradient update and one
  soft target update. At the end of each episode the episode return and the
  mean over the last 100 returns are logged. Training stops when that mean
  reaches ``solved_score`` or after ``max_episodes`` episodes. The environment
  is closed on every exit path, including interruption.

  Anomaly detection is not enabled in this loop; wrap the call in
  ``torch.autograd.set_detect_anomaly(True)`` only while debugging.

  Args:
    max_episodes: upper bound on the number of training episodes.
    max_episode_steps: per-episode step cap forwarded to ``Agent.act``.
    solved_score: 100-episode mean return at which training stops.
    checkpoint_every: save the actor's weights every this many episodes.
    checkpoint_path: file the actor's ``state_dict`` is written to.
  """
  agent = Agent()
  try:
    agent.init_replay_buffer()

    episode_count = 0
    episode_step_count = 0
    episode_return = 0
    recent_returns = collections.deque(maxlen = 100)

    wandb.init(project="ddpg")
    wandb.watch(agent.net_ac, log ="all")
    wandb.watch(agent.net_cr, log ="all")

    while episode_count < max_episodes:
      experience, reward, done = agent.act(episode_step_count, max_episode_steps)
      agent.store_in_batch(experience)
      critic_loss, actor_loss = agent.calc_loss_update(*agent.get_batch())
      agent.update_target()

      episode_return += reward
      episode_step_count += 1

      if not done:
        continue

      recent_returns.append(episode_return)
      trail = sum(recent_returns) / len(recent_returns)

      # Log plain floats so no autograd graph is kept alive by the logger.
      wandb.log({"Reward": episode_return,
                 "Average 100 Rewards": trail,
                 "Actor Loss": actor_loss.item(),
                 "Critic Loss": critic_loss.item(),
                 "Action": agent.action_log,
                 "Noise + Action": agent.action_noised_log,
                 "Episode Step Count": episode_step_count,
                 "Replay Memory Size": len(agent.memory)})

      if trail >= solved_score:
        torch.save(agent.net_ac.state_dict(), checkpoint_path)
        break
      if episode_count % checkpoint_every == 0:
        torch.save(agent.net_ac.state_dict(), checkpoint_path)

      episode_count += 1
      episode_step_count = 0
      print('episode: ',episode_count, 'reward: ', episode_return)

      episode_return = 0
  finally:
    # Release the environment whether training finished, stalled or was interrupted.
    agent.close_agent()


In [11]:
# Kick off training. The run below was stopped by hand (see the
# KeyboardInterrupt at the end of the captured output).
train()

[34m[1mwandb[0m: Currently logged in as: [33molayemiy[0m (use `wandb login --relogin` to force relogin)


episode:  1 reward:  -734.8383805142325
episode:  2 reward:  -1160.857249845829
episode:  3 reward:  -1141.2303304683342
episode:  4 reward:  -577.081478154832
episode:  5 reward:  -1268.9980017015998
episode:  6 reward:  -1221.9929946269672
episode:  7 reward:  -1073.4173879534414
episode:  8 reward:  -468.1014564647585
episode:  9 reward:  -455.1771287013089
episode:  10 reward:  -483.8726421532127
episode:  11 reward:  -813.3322317873767
episode:  12 reward:  -681.1875754788256
episode:  13 reward:  -904.4723651416334
episode:  14 reward:  -583.6041791546422
episode:  15 reward:  -412.0855159719436
episode:  16 reward:  -328.84452742125984
episode:  17 reward:  -1055.0195683651368
episode:  18 reward:  -158.078724882326
episode:  19 reward:  -241.55167662014622
episode:  20 reward:  -115.29407536102138
episode:  21 reward:  -106.43071561003123
episode:  22 reward:  -138.72689989226689
episode:  23 reward:  -127.0810896248788
episode:  24 reward:  -241.5872518886905
episode:  25 rewa

KeyboardInterrupt: ignored

In [8]:

# Disabled evaluation snippet; it is wrapped in a string literal so nothing
# inside it executes. When enabled it loads the actor weights saved in
# 'lunar_saved.pth' and renders 10 episodes, printing each episode's length
# and total reward.
'''
#use to test model on computer
import gym
env = gym.make('LunarLanderContinuous-v2')
observation = env.reset()
net = DDPG_AC(observation.shape[0], env.action_space.shape[0], 400, 300).float()
net.load_state_dict(torch.load('lunar_saved.pth'))
net.eval()
for i_episode in range(10):
    observation = env.reset()
    done = False
    t = 0
    total_r = 0
    while not done:
        env.render()
        action = net(torch.tensor(observation).float())##
        observation, reward, done, info = env.step(action.detach().numpy())
        total_r+=reward
        if done:
            print("Episode finished after {} timesteps.".format(t+1), "Reward: ",total_r)
            break
        t+=1
env.close()
'''

Episode finished after 342 timesteps .Reward:  205.76836661123698
Episode finished after 1000 timesteps .Reward:  -45.21964558978234
Episode finished after 219 timesteps .Reward:  193.13217254422935
Episode finished after 261 timesteps .Reward:  203.09862346930754
Episode finished after 453 timesteps .Reward:  251.6061254850905
Episode finished after 495 timesteps .Reward:  199.5622514090789
Episode finished after 219 timesteps .Reward:  230.23015699397402
Episode finished after 308 timesteps .Reward:  229.800924026298
Episode finished after 192 timesteps .Reward:  219.18580547475423
Episode finished after 216 timesteps .Reward:  264.36335868571905
