In [1]:
#Install pybind11
!git clone https://github.com/pybind/pybind11.git
!cd pybind11 && mkdir build && cd build && cmake .. && make install

Cloning into 'pybind11'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 13925 (delta 6), reused 11 (delta 1), pack-reused 13897[K
Receiving objects: 100% (13925/13925), 5.44 MiB | 23.82 MiB/s, done.
Resolving deltas: 100% (9467/9467), done.
-- The CXX compiler identification is GNU 7.5.0
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- pybind11 v2.6.2 dev1
-- CMake 3.12.0
-- Found PythonInterp: /usr/bin/python3.6 (found version "3.6.9") 
-- Found PythonLibs: /usr/lib/x86_64-linux-gnu/libpython3.6m.so
-- PYTHON 3.6.9
-- Performing Test HAS_FLTO
-- Performing Test HAS_FLTO - Success
-- pybind11::lto enabled
-- pybind11::thin_lto enabled
-- Setting tests build type to Mi

In [2]:
#Install Eigen
!apt install libeigen3-dev
!ln -sf /usr/include/eigen3/Eigen /usr/include/Eigen

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  libeigen3-doc libmrpt-dev
The following NEW packages will be installed:
  libeigen3-dev
0 upgraded, 1 newly installed, 0 to remove and 14 not upgraded.
Need to get 810 kB of archives.
After this operation, 7,128 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libeigen3-dev all 3.3.4-4 [810 kB]
Fetched 810 kB in 1s (1,162 kB/s)
Selecting previously unselected package libeigen3-dev.
(Reading database ... 144793 files and directories currently installed.)
Preparing to unpack .../libeigen3-dev_3.3.4-4_all.deb ...
Unpacking libeigen3-dev (3.3.4-4) ...
Setting up libeigen3-dev (3.3.4-4) ...


In [3]:
# Install dependencies on colab
!git clone https://github.com/OttoJursch/DRL_robot_exploration.git

Cloning into 'DRL_robot_exploration'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 11108 (delta 31), reused 31 (delta 12), pack-reused 11049[K
Receiving objects: 100% (11108/11108), 284.76 MiB | 32.75 MiB/s, done.
Resolving deltas: 100% (124/124), done.
Checking out files: 100% (10919/10919), done.


In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

class SizeEstimator(object):

    def __init__(self, model, input_size=(1,1,32,32), bits=32):
        '''
        Estimates the size of PyTorch models in memory
        for a given input size
        '''
        self.model = model
        self.input_size = input_size
        self.bits = bits
        return

    def get_parameter_sizes(self):
        '''Get sizes of all parameters in `model`'''
        mods = list(self.model.modules())
        sizes = []
        
        for i in range(1,len(mods)):
            m = mods[i]
            p = list(m.parameters())
            for j in range(len(p)):
                sizes.append(np.array(p[j].size()))

        self.param_sizes = sizes
        return

    def get_output_sizes(self):
        '''Run sample input through each layer to get output sizes'''
        input_ = Variable(torch.FloatTensor(*self.input_size), volatile=True)
        mods = list(self.model.modules())
        out_sizes = []
        for i in range(1, len(mods)):
            m = mods[i]
            out = m(input_)
            out_sizes.append(np.array(out.size()))
            input_ = out

        self.out_sizes = out_sizes
        return

    def calc_param_bits(self):
        '''Calculate total number of bits to store `model` parameters'''
        total_bits = 0
        for i in range(len(self.param_sizes)):
            s = self.param_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.param_bits = total_bits
        return

    def calc_forward_backward_bits(self):
        '''Calculate bits to store forward and backward pass'''
        total_bits = 0
        for i in range(len(self.out_sizes)):
            s = self.out_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        # multiply by 2 for both forward AND backward
        self.forward_backward_bits = (total_bits*2)
        return

    def calc_input_bits(self):
        '''Calculate bits to store input'''
        self.input_bits = np.prod(np.array(self.input_size))*self.bits
        return

    def estimate_size(self):
        '''Estimate model size in memory in megabytes and bits'''
        self.get_parameter_sizes()
        self.get_output_sizes()
        self.calc_param_bits()
        self.calc_forward_backward_bits()
        self.calc_input_bits()
        total = self.param_bits + self.forward_backward_bits + self.input_bits

        total_megabytes = (total/8)/(1024**2)
        return total_megabytes, total

In [45]:
import torch
import torch.nn as nn
import torchsummary
import numpy as np

def build_conv_feature_extractor(conv_dims, act):
  #Create Conv2D + MaxPool layers
  conv_layers = [nn.Conv2d(*conv_dim) if len(conv_dim) == 3 else nn.MaxPool2d(conv_dim) for conv_dim in conv_dims]
  total_layers = []

  #Add ReLU activations after each conv layer
  for layer in conv_layers:
    total_layers.append(layer)
    if type(layer) == nn.Conv2d:
      total_layers.append(act())
  return nn.Sequential(*total_layers)
  

def get_output_shape(model, image_dim):
    return model(torch.rand(*(image_dim))).data.shape

class RNNActor(nn.Module):
  #TODO Determine if the action space allows negative numbers
  #Potentially replace tanh with sigmoid
  def __init__(self, conv_dims, lstm_hidden, lstm_out, train_length, input_size=(1, 1,224,224), act=nn.ReLU, final_act=nn.Tanh):
    super(RNNActor, self).__init__()

    self.conv_mod = build_conv_feature_extractor(conv_dims, act)
    
    #Silly way to determine the size going into the RNN
    with torch.no_grad():
      feature_size = get_output_shape(self.conv_mod, input_size)

    print('LSTM Input Size', feature_size)

    #Construct LSTM
    self.lstm_hidden = lstm_hidden
    self.lstm = nn.LSTM(np.prod(list(feature_size)), lstm_hidden, lstm_out)
    self.train_length = train_length
    self.final_act = final_act

  def forward(self, image, positions, state):
    batch_size = x.size()[0]
    conv = self.conv_mod(x)
    flat = torch.reshape(conv.flatten(), (batch_size, self.train_length, self.lstm_hidden))
    state = torch.hstack((flat, positions))
    action, hidden = self.lstm(state, state)

    #Scale by the max/min bounds of the action space later
    return self.final_act(action), hidden

In [35]:
conv_dims = [(1, 32, 8), (32, 64, 4), (2, 2), (64, 64, 3), (64, 512, 7), (2, 2), (512, 64, 1)]

lstm_hidden = 512
lstm_out = 2
train_length = 50

rnn = RNNActor(conv_dims, lstm_hidden, lstm_out, train_length)

LSTM Input Size torch.Size([1, 64, 49, 49])


In [55]:
def build_dense_regression(linear_dims, act, final_act=None):
  print(linear_dims)
  linear_layers = [nn.Linear(*linear_dim)  for linear_dim in linear_dims]
  print(linear_layers)
  activations = [act() for layer in range(len(linear_layers) - 1)]
  if final_act is not None:
    activations.append(final_act)
  return nn.Sequential(*[val for tup in zip(*[linear_layers, activations]) for val in tup]
)

class CNNCritic(nn.Module):
  def __init__(self, conv_dims, fc_dims, input_size=(1, 1,224,224), conv_act=nn.ReLU, fc_act=nn.ReLU):
    super(CNNCritic, self).__init__()
    self.conv_mod = build_conv_feature_extractor(conv_dims, conv_act)

    #Silly way to determine the size going into the RNN
    with torch.no_grad():
      feature_size = get_output_shape(self.conv_mod, input_size)

    #Add 4 for action + position
    feature_size = np.prod(list(feature_size)) + 4
    first_output = fc_dims[0][0]
    fc_dims.insert(0, (feature_size, first_output))

    self.fc = build_dense_regression(fc_dims, fc_act)

    


  def forward(self, map, pos, action):
    batch_size = x.size()[0]
    map_feats = self.conv_mod(map)
    all_feats = torch.hstack([map_feats, pos, action])
    return self.fc(total_feats)

In [56]:
linear_dims = [(256, 128), (128, 1)]
conv_dims = [(1, 32, 8), (32, 64, 4), (2, 2), (64, 64, 3), (64, 512, 7), (2, 2), (512, 64, 1)]

critic = CNNCritic(conv_dims, linear_dims)

[(153664, 256), (256, 128), (128, 1)]
[Linear(in_features=153664, out_features=256, bias=True), Linear(in_features=256, out_features=128, bias=True), Linear(in_features=128, out_features=1, bias=True)]


In [None]:
""" Learn a policy using DDPG for the reach task"""
import numpy as np
import torch
import time
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import MultivariateNormal
from torch.nn import MSELoss
import random

import gym
import pybullet
import pybulletgym.envs

import matplotlib.pyplot as plt
import copy

np.random.seed(1000)


# TODO: A function to soft update target networks
def weighSync(target_model, source_model, tau=0.001):
  for (target, src) in zip(target_model.parameters(), source_model.parameters()):
    target.data = (1-tau) * target.data + tau * src.data 

# TODO: Write the ReplayBuffer
class Replay():
    def __init__(self, buffer_size, init_length, state_dim, action_dim, env):
        """
        A function to initialize the replay buffer.

        param: init_length : Initial number of transitions to collect
        param: state_dim : Size of the state space
        param: action_dim : Size of the action space
        param: env : gym environment object
        """
        self.buffer = np.zeros((buffer_size, 2 * state_dim + action_dim + 1))
        state = env.reset()
        self.buffer_idx = 0
        self.total_steps = 0
        last_state = env.reset()
        for i in range(init_length):
          action = np.random.uniform(-1, 1, (2,))
          state, reward, _, _ = env.step(action)
          self.buffer[self.buffer_idx, :] = np.hstack([state, last_state, action, reward])
          self.total_steps = min(self.total_steps + 1, len(self.buffer))
          self.buffer_idx = (self.buffer_idx + 1) % len(self.buffer)
          last_state = state



    # TODO: Complete the function
    def buffer_add(self, exp):
        """
        A function to add a dictionary to the buffer
        param: exp : A dictionary consisting of state, action, reward , next state and done flag
        """
        self.buffer[self.buffer_idx, :] = np.hstack([exp['state'], exp['last_state'], exp['action'].cpu().detach().numpy(), np.array([exp['reward']])])
        self.buffer_idx = (self.buffer_idx + 1) % len(self.buffer)
        self.total_steps = min(self.total_steps + 1, len(self.buffer))

    #TODO: Complete the function
    def buffer_sample(self, N):
        """
        A function to sample N points from the buffer
        param: N : Number of samples to obtain from the buffer
        """
        if N > self.total_steps:
          return torch.from_numpy(self.buffer[:self.total_steps, :]).float().to(device='cuda')

        perm = np.random.permutation(min(self.buffer.shape[0], self.total_steps))
        idx = perm[:N]
        samples = self.buffer[idx,:]
        self.total_steps += 1
        return torch.from_numpy(samples).float().to(device='cuda')


# TODO: Implement a DDPG class
class DDPG():
    def __init__(
            self,
            env,
            action_dim,
            state_dim,
            critic_lr=3e-4,
            actor_lr=3e-4,
            gamma=0.99,
            batch_size=100,
            seed=1000
    ):
        """
        param: env: An gym environment
        param: action_dim: Size of action space
        param: state_dim: Size of state space
        param: critic_lr: Learning rate of the critic
        param: actor_lr: Learning rate of the actor
        param: gamma: The discount factor
        param: batch_size: The batch size for training
        """
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.gamma = gamma
        self.batch_size = batch_size
        self.env = env
        self.state_dim = state_dim

        # TODO: Create a actor and actor_target
        self.actor = Actor(state_dim, action_dim).to(device='cuda')
        self.actor_target = copy.deepcopy(self.actor)

        # TODO: Make sure that both networks have the same initial weights

        # TODO: Create a critic and critic_target object
        self.critic = Critic(state_dim, action_dim).to(device='cuda')
        self.critic_target = copy.deepcopy(self.critic)
        # TODO: Make sure that both networks have the same initial weights

        # TODO: Define the optimizer for the actor
        self.optimizer_actor = optim.Adam(self.actor.parameters(), actor_lr)
        # TODO: Define the optimizer for the critic
        self.optimizer_critic = optim.Adam(self.critic.parameters(), critic_lr)

        # TODO: define a replay buffer
        self.ReplayBuffer = Replay(10000, 1000, state_dim, action_dim, env)

    # TODO: Complete the function
    def update_target_networks(self):
        """
        A function to update the target networks
        """
        weighSync(self.actor_target, self.actor)
        weighSync(self.critic_target, self.critic)

    # TODO: Complete the function
    def update_network(self, y_i, obs, actions):
        """
        A function to update the function just once
        """
        qs = self.critic(obs, actions).squeeze(-1)
        critic_loss = self.critic_criterion(y_i, qs)
        critic_loss.backward()

        self.optimizer_critic.step()

        act = self.actor(obs)
        qs = self.critic(obs, act)
        #print(actions.size())
        (-qs).mean().backward()
        self.optimizer_actor.step()

  
    # TODO: Complete the function
    def train(self, num_steps):
        """
        Train the policy for the given number of iterations
        :param num_steps:The number of steps to train the policy for
        """
        self.critic_criterion = MSELoss()
        noise = MultivariateNormal(torch.zeros(2), torch.diag(torch.tensor([0.1, 0.1])))
        num_episodes = 0
        i = 0
        total_reward = 0
        total_steps = 0
        episode_reward = 0
        test_env = gym.make('modified_gym_env:ReacherPyBulletEnv-v1')
        steps_list = []
        rewards = []
        while i < num_steps:
          done = False
          last_state = self.env.reset()
          num_episodes += 1
          while not done:
            i += 1
            self.optimizer_critic.zero_grad()
            self.optimizer_actor.zero_grad()

            action = self.actor(torch.from_numpy(last_state).float().to(device='cuda')) + noise.sample().to(device='cuda')
            state, reward, done, _ = self.env.step(action.cpu().detach().numpy())
            self.ReplayBuffer.buffer_add({'state':state, 'last_state':last_state, 'reward':reward, 'action':action})
            batch = self.ReplayBuffer.buffer_sample(self.batch_size)
            #print(batch.size())
            r_i = batch[:, -1]
            actions = batch[:,2*self.state_dim:2*self.state_dim+2]
            states = batch[:,self.state_dim:2*self.state_dim]
            next_states = batch[:, :self.state_dim]
            with torch.no_grad():
              crit = self.critic_target(next_states, self.actor_target(next_states)).squeeze(-1)
              y_i = r_i + self.gamma * crit

            self.update_network(y_i, states, actions)

            self.update_target_networks()

            if i % 100 == 0:
              test_done = False
              episode_reward = 0
              the_steps = 0
              s = test_env.reset()
              while not test_done:
                total_steps += 1
                the_steps += 1
                action = self.actor(torch.from_numpy(s).float().to(device='cuda')).detach().squeeze().cpu().numpy()
                n_state, r, test_done, _ = test_env.step(action)
                s = n_state
                episode_reward += r

              rewards.append(episode_reward)
              steps_list.append(the_steps)
              print('Episode reward')
              print(episode_reward)

            last_state = state

          state = self.env.reset()

        return rewards, steps_list