In [1]:
# Train a BipedalWalker agent using DDPG (Deep Deterministic Policy Gradient).

In [4]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [7]:
# Create the BipedalWalker-v3 environment and seed it with 10.
# NOTE(review): `env.seed(...)` is the older gym seeding API; newer gym releases
# expect `env.reset(seed=...)` instead — confirm against the pinned gym version.
env_id = 'BipedalWalker-v3'
env = gym.make(env_id)
env.seed(10)

[10]

In [13]:
# Length of the observation vector; used as the input width of both networks.
state_size = env.observation_space.shape[0] # state size = 24 for this environment

In [14]:
# Length of the action vector; used as the actor's output width and the critic's action input width.
action_size = env.action_space.shape[0] # action size = 4 for this environment

In [10]:
### Since the action space is continuous, we can use DDPG (Deep Deterministic Policy Gradient), an actor-critic algorithm.

### Part 1. Actor-Critic

In [17]:
class Actor(nn.Module):
    """
    Actor (policy) model.

    One ReLU hidden layer followed by a tanh output layer, so every component
    of the returned action lies in [-1, 1].
    """
    def __init__(self, state_size, action_size, seed=10, fc_units=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed, passed to ``torch.manual_seed`` (this
                reseeds the global PyTorch RNG as a side effect)
            fc_units (int): Number of nodes in the single hidden layer
        """
        super(Actor, self).__init__()
        # The return value of torch.manual_seed is kept on the instance as `seed`.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc_units)
        self.fc2 = nn.Linear(fc_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights of both layers from small uniform ranges.

        Only the weight matrices are overwritten; the biases keep the values
        assigned by ``nn.Linear``.
        """
        self.fc1.weight.data.uniform_(-1.5e-3, 1.5e-3)
        self.fc2.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = F.relu(self.fc1(state))
        # torch.tanh is numerically identical to F.tanh but is not deprecated,
        # so no deprecation warning is emitted on every forward pass.
        return torch.tanh(self.fc2(x))

In [18]:
class Critic(nn.Module):
    """Critic (value) model: scores a (state, action) pair with one Q-value."""

    def __init__(self, state_size, action_size, seed=10, fcs1_units=600, fcs2_units=300, fca1_units=300):
        """Create the layers and apply the custom weight initialisation.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed, passed to ``torch.manual_seed``
            fcs1_units (int): Width of the first state-branch layer (batch-normalised)
            fcs2_units (int): Width of the second state-branch layer
            fca1_units (int): Width of the action-branch layer; its output is
                added to the state branch, so it has to be compatible with
                ``fcs2_units``
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # State branch.
        self.fcs1 = nn.Linear(state_size, fcs1_units)
        self.fcs2 = nn.Linear(fcs1_units, fcs2_units)
        # Action branch.
        self.fca1 = nn.Linear(action_size, fca1_units)
        # Scalar Q-value head.
        self.fc1 = nn.Linear(fcs2_units, 1)
        # Normalises the output of fcs1 before the first ReLU.
        self.bn1 = nn.BatchNorm1d(fcs1_units)

        self.reset_parameters()

    def reset_parameters(self):
        """Overwrite the weights of ``fcs2`` and ``fc1`` with small uniform samples.

        ``fcs1`` and ``fca1`` (and all biases) keep their ``nn.Linear`` defaults.
        """
        for layer, bound in ((self.fcs2, 1.5e-3), (self.fc1, 3e-3)):
            layer.weight.data.uniform_(-bound, bound)

    def forward(self, state, action):
        """Map a batch of (state, action) pairs to Q-values."""
        # NOTE(review): bn1 is a BatchNorm1d layer, so its behaviour differs
        # between train() and eval() mode — confirm callers set the mode.
        state_features = self.bn1(self.fcs1(state))
        state_features = self.fcs2(F.relu(state_features))
        action_features = self.fca1(action)
        # The two branches are merged by element-wise addition, then rectified.
        merged = state_features + action_features
        return self.fc1(F.relu(merged))

In [19]:
# Instantiate both networks with their default seed and layer widths.
actor, critic = Actor(state_size, action_size), Critic(state_size, action_size)

In [20]:
# Display the actor's layer structure.
actor

Actor(
  (fc1): Linear(in_features=24, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=4, bias=True)
)

In [21]:
# Display the critic's layer structure.
critic

Critic(
  (fcs1): Linear(in_features=24, out_features=600, bias=True)
  (fcs2): Linear(in_features=600, out_features=300, bias=True)
  (fca1): Linear(in_features=4, out_features=300, bias=True)
  (fc1): Linear(in_features=300, out_features=1, bias=True)
  (bn1): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)