In [None]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
class ReplayBuffer():
    """Fixed-capacity circular buffer of environment transitions.

    Each transition is stored column-wise in pre-allocated NumPy arrays:
    ``state``, ``action``, ``reward``, ``next_state`` and ``terminal_memory``.
    Once ``max_size`` transitions have been written, new ones overwrite the
    oldest entries.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate storage.

        Args:
            max_size: Maximum number of transitions kept in the buffer.
            input_shape: Number of features in a single state vector (int).
            n_actions: Number of components in a single action vector (int).
        """
        self.mem_size = max_size
        self.mem_cntr = 0  # total number of transitions ever stored
        self.state = np.zeros((self.mem_size, input_shape))
        self.next_state = np.zeros((self.mem_size, input_shape))
        self.action = np.zeros((self.mem_size, n_actions))
        self.reward = np.zeros(self.mem_size)
        # Use the builtin ``bool``: the ``np.bool`` alias was removed in
        # NumPy 1.24 and raises AttributeError there.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, reward, action, next_state, done):
        """Write one transition, overwriting the oldest slot when full.

        Note the argument order: ``reward`` comes before ``action``.
        """
        index = self.mem_cntr % self.mem_size
        self.state[index] = state
        self.action[index] = action
        self.reward[index] = reward
        self.next_state[index] = next_state
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def Sample(self, batch_size):
        """Return a uniformly sampled batch of distinct stored transitions.

        Args:
            batch_size: Number of transitions to draw (without replacement).

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` whose
            first axis has length ``batch_size`` and whose rows correspond
            to the same sampled transitions.

        Raises:
            ValueError: Raised by ``np.random.choice`` when fewer than
                ``batch_size`` transitions are currently stored.
        """
        # Only the filled part of the buffer is eligible for sampling.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        # Index with the sampled indices themselves so that every returned
        # array refers to the same set of transitions.
        return (
            self.state[batch],
            self.action[batch],
            self.reward[batch],
            self.next_state[batch],
            self.terminal_memory[batch],
        )

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: state -> action in ``[-1, 1]``.

    Two ReLU hidden layers of width ``hidden_dim`` followed by a ``tanh``
    output layer with ``out_size`` units.
    """

    def __init__(self, in_dim, out_size, hidden_dim = 512):
        """Build the layers.

        Args:
            in_dim: Size of the state vector.
            out_size: Number of action components produced.
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, out_size)

    def forward(self, state):
        """Map a state tensor (last dim ``in_dim``) to tanh-squashed actions."""
        # The debug ``print(state)`` that used to live here was removed: it
        # wrote the full tensor to stdout on every forward pass.
        hidden = torch.relu(self.fc1(state))
        hidden = torch.relu(self.fc2(hidden))
        return torch.tanh(self.out(hidden))

In [None]:
class Critic(nn.Module):
    """Action-value network: (state, action) -> Q estimate.

    ``in_dim`` is the width of the tensor obtained by concatenating the
    state and the action along dim 1, since that concatenation is what is
    fed to the first layer.
    """

    def __init__(self, in_dim, out_size = 1, hidden_dim = 512):
        """Create two hidden linear layers and a linear output head.

        Args:
            in_dim: Width of the concatenated state-action input.
            out_size: Number of output units.
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, out_size)

    def forward(self, state, action):
        """Concatenate ``state`` and ``action`` on dim 1 and return the head output."""
        features = torch.cat((state, action), dim=1)
        # Both hidden layers share the same activation, so apply them in a loop.
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))
        return self.out(features)

In [None]:
class Agent():
    """DDPG-style agent: actor/critic networks, target copies and a replay buffer."""

    def __init__(self, input_dims, alpha = 0.001, beta = 0.002, env = None, gamma = 0.99, n_actions = 2, max_size = 1000000, tau = 0.005, fc1 = 400, fc2 = 300, batch_size = 64, noise = 0.1):
        """Build networks, optimizers and replay memory.

        Args:
            input_dims: Size of the state vector.
            alpha: Learning rate of the actor optimizer.
            beta: Learning rate of the critic optimizer.
            env: Environment whose ``action_space.low[0]`` / ``high[0]`` give
                the clipping bounds for actions. When ``None`` the bounds
                default to ``[-1, 1]`` (the range of the actor's tanh output).
            gamma: Discount factor used in the critic target.
            n_actions: Number of action components.
            max_size: Replay buffer capacity.
            tau: Soft-update coefficient for the target networks.
            fc1, fc2: Accepted for backward compatibility; not used by this
                class (the networks use their own default hidden width).
            batch_size: Number of transitions sampled per learning step.
            noise: Standard deviation of the Gaussian exploration noise.
        """
        self.input_dim = input_dims
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.gamma = gamma
        self.n_action = n_actions  # kept alongside ``n_actions`` for existing callers
        self.n_actions = n_actions
        self.noise = noise
        self.batch_size = batch_size

        self.actor = Actor(input_dims, n_actions)
        self.target_actor = Actor(input_dims, n_actions)
        # The critic concatenates state and action before its first layer,
        # so its input width is the sum of both sizes.
        self.critic = Critic(input_dims + n_actions)
        self.target_critic = Critic(input_dims + n_actions)

        self.A_opt = torch.optim.Adam(self.actor.parameters(), lr = alpha)
        self.C_opt = torch.optim.Adam(self.critic.parameters(), lr = beta)
        self.critic_criterion = nn.MSELoss()

        if env is None:
            self.min_action, self.max_action = -1.0, 1.0
        else:
            # Plain Python floats are valid clamp bounds regardless of the
            # scalar type the environment reports.
            self.min_action = float(env.action_space.low[0])
            self.max_action = float(env.action_space.high[0])

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        # Start the target networks as exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        """Polyak-average online parameters into the target networks.

        ``target <- tau * online + (1 - tau) * target``. ``tau=1`` performs a
        hard copy; ``tau=None`` uses ``self.tau``.
        """
        if tau is None:
            tau = self.tau

        with torch.no_grad():
            for param_actor, param_target_actor in zip(self.actor.parameters(), self.target_actor.parameters()):
                param_target_actor.copy_(tau * param_actor + (1 - tau) * param_target_actor)

            for param_critic, param_target_critic in zip(self.critic.parameters(), self.target_critic.parameters()):
                param_target_critic.copy_(tau * param_critic + (1 - tau) * param_target_critic)

    def remember(self, state, reward, action, next_state, done):
        """Store one transition in replay memory (note: ``reward`` precedes ``action``)."""
        self.memory.store_transition(state, reward, action, next_state, done)

    def choose_action(self, state, evaluate = False):
        """Return the actor's action for ``state`` as a NumPy array.

        Gaussian noise with std ``self.noise`` is added unless ``evaluate`` is
        true; the result is clipped to ``[min_action, max_action]``.
        """
        state = torch.as_tensor(state, dtype=torch.float32)

        # Action selection needs no autograd graph.
        with torch.no_grad():
            actions = self.actor(state)

            if not evaluate:
                noise = torch.normal(mean=0.0,
                         std=self.noise,
                         size=(self.n_actions,))
                actions = actions + noise

            actions = torch.clamp(actions, self.min_action, self.max_action)

        return actions.detach().cpu().numpy()

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        states, action, reward, new_state, done = self.memory.Sample(self.batch_size)
        # The buffer stores float64 / bool arrays; the networks use float32.
        states = torch.as_tensor(states, dtype=torch.float32)
        new_state = torch.as_tensor(new_state, dtype=torch.float32)
        action = torch.as_tensor(action, dtype=torch.float32)
        # Reshape to (batch, 1) so these line up with the critic output
        # instead of broadcasting to (batch, batch).
        reward = torch.as_tensor(reward, dtype=torch.float32).reshape(-1, 1)
        done = torch.as_tensor(done, dtype=torch.float32).reshape(-1, 1)

        # --- critic update -------------------------------------------------
        # The TD target is a constant w.r.t. the parameters being optimized.
        with torch.no_grad():
            target_action = self.target_actor(new_state)
            target_value = self.target_critic(new_state, target_action)
            y_target = reward + self.gamma * target_value * (1.0 - done)

        y_pred = self.critic(states, action)
        critic_loss = self.critic_criterion(y_pred, y_target)

        self.C_opt.zero_grad()
        critic_loss.backward()
        self.C_opt.step()

        # --- actor update --------------------------------------------------
        # Maximize the critic's value of the actor's own actions.
        new_action = self.actor(states)
        actor_loss = -torch.mean(self.critic(states, new_action))

        self.A_opt.zero_grad()
        actor_loss.backward()
        self.A_opt.step()

        # --- target networks -----------------------------------------------
        self.update_network_parameters()

In [None]:
import gym
from gym import wrappers
import numpy as np

# Re-create the ``np.bool8`` / ``np.int`` / ``np.float`` aliases when the
# installed NumPy does not provide them (presumably for code that still
# references the old names -- TODO confirm which dependency needs this).
for _alias, _replacement in (("bool8", np.bool_), ("int", int), ("float", float)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _replacement)
del _alias, _replacement

# Training script: run the agent on Pendulum-v1 and report per-episode scores.
if __name__ == '__main__':
    env = gym.make('Pendulum-v1')
    agent = Agent(input_dims=env.observation_space.shape[0], env=env,
            n_actions=env.action_space.shape[0])
    n_games = 250

    best_score = env.reward_range[0]
    score_history = []

    # When True, actions are chosen without exploration noise and no
    # learning step is performed.
    evaluate = False

    for i in range(n_games):
        # reset() returns (observation, info); unpack once here so that
        # ``observation`` is always the full state vector inside the loop.
        observation, _ = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation, evaluate)
            observation_, reward, terminated, truncated, info = env.step(action)

            # The episode ends on either signal ...
            done = terminated or truncated
            score += reward
            # ... but only true termination is stored as terminal, so a
            # time-limit cut-off does not zero the bootstrapped value.
            # Argument order follows Agent.remember: reward before action.
            agent.remember(observation, reward, action, observation_, terminated)
            if not evaluate:
                agent.learn()
            observation = observation_

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score

        print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)