In [1]:
from reinforcement_learning.deep_deterministic_policy_gradient import DDPG
from reinforcement_learning.utils.memory import Memory, Transition
from reinforcement_learning.env.arm import Arm

import torch
import os

In [2]:
# Training-loop settings, gathered in one cell so they are easy to tune.
EPISODES = 1000  # number of episodes train() iterates over
STEPS = 200  # upper bound on environment steps within one training episode
BATCH_SIZE = 32  # transitions drawn from the replay memory per model.train() call

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Create the arm environment and read the sizes the agent is built from.
env = Arm()
s_dim, a_dim, a_bound = env.observation_dim, env.action_dim, env.action_space

# Agent and replay memory shared by train() and test().
# NOTE(review): 0.9 and 0.01 are presumably the discount factor and the
# soft-update rate of DDPG -- confirm against the DDPG constructor.
model = DDPG(a_dim, s_dim, a_bound, 0.9, 0.01)
# NOTE(review): 30000 is presumably the buffer capacity -- confirm in Memory.
memory = Memory(30000)

In [5]:
def train():
    """Train the global ``model`` on ``env`` for ``EPISODES`` episodes.

    If a checkpoint file already exists it is loaded first, so training
    resumes from it. Each episode runs at most ``STEPS`` environment steps.
    Every transition is pushed into ``memory``, and once the memory holds
    more than ``BATCH_SIZE`` transitions a batch is sampled and passed to
    ``model.train`` after every step.

    The model is saved whenever an episode ends with ``done`` set. One
    status line is printed per episode; the losses shown are those returned
    by the last ``model.train`` call of that episode (0 if none ran).
    """
    checkpoint_path = "./models/rl-ddpg-arm.tar"
    checkpoint_dir = os.path.dirname(checkpoint_path)

    if os.path.isfile(checkpoint_path):
        model.load(checkpoint_path)

    for i in range(EPISODES):
        value_loss, policy_loss = 0, 0
        # Bound up front so the checks after the inner loop are well defined
        # even if the loop body never executes (STEPS == 0).
        done = False
        state = torch.Tensor([env.reset()]).to(device)
        for j in range(STEPS):
            action = model.predict(state)
            next_state, reward, done = env.step(action.cpu().numpy()[0])

            # NOTE(review): the mask stores the raw `done` flag; confirm that
            # DDPG.train expects `done` here rather than `not done`.
            mask = torch.Tensor([done]).to(device)
            reward = torch.Tensor([reward]).to(device)
            next_state = torch.Tensor([next_state]).to(device)
            memory.push(state, action, mask, next_state, reward)
            state = next_state

            if len(memory) > BATCH_SIZE:
                transitions = memory.sample(BATCH_SIZE)
                # Turn a list of Transition records into one Transition of tuples.
                batch = Transition(*zip(*transitions))
                value_loss, policy_loss = model.train(batch)
            if done:
                break

        if done:
            # Nothing else creates the checkpoint directory, so make sure it
            # exists before the first save on a fresh checkout.
            if checkpoint_dir and not os.path.isdir(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            model.save(checkpoint_path)
        print('Episode %d:%s Critic Loss %.2f, Action Loss %.2f' % (i+1, ' Finished Step %d,' % (j+1) if done else '', value_loss, policy_loss))

In [6]:
def test():
    """Load the saved checkpoint and replay the policy for ten rendered episodes.

    Each episode starts from ``env.reset()`` and keeps stepping with the
    action returned by ``model.predict`` until ``env.step`` reports ``done``.
    There is no step limit, so an episode lasts until the environment
    signals completion.
    """
    model.load("./models/rl-ddpg-arm.tar")
    for _ in range(10):
        observation = torch.Tensor([env.reset()]).to(device)
        finished = False
        while not finished:
            env.render()
            chosen = model.predict(observation)
            # The reward is returned by the environment but not used here.
            raw_next, _reward, finished = env.step(chosen.cpu().numpy()[0])
            observation = torch.Tensor([raw_next]).to(device)

In [7]:
# Start training; train() prints one status line per episode.
train()

-0.19
Episode 659: Critic Loss 0.01, Action Loss -0.28
Episode 660: Critic Loss 0.00, Action Loss -0.15
Episode 661: Finished Step 51, Critic Loss 0.03, Action Loss -0.34
Episode 662: Finished Step 83, Critic Loss 0.04, Action Loss -0.29
Episode 663: Finished Step 96, Critic Loss 0.00, Action Loss -0.10
Episode 664: Critic Loss 0.04, Action Loss -0.44
Episode 665: Finished Step 114, Critic Loss 0.02, Action Loss -0.16
Episode 666: Critic Loss 0.01, Action Loss -0.48
Episode 667: Finished Step 54, Critic Loss 0.02, Action Loss -0.20
Episode 668: Critic Loss 0.00, Action Loss -0.31
Episode 669: Finished Step 101, Critic Loss 0.01, Action Loss -0.34
Episode 670: Critic Loss 0.00, Action Loss -0.40
Episode 671: Finished Step 60, Critic Loss 0.01, Action Loss -0.44
Episode 672: Finished Step 78, Critic Loss 0.01, Action Loss -0.20
Episode 673: Critic Loss 0.05, Action Loss -0.49
Episode 674: Critic Loss 0.00, Action Loss -0.12
Episode 675: Finished Step 101, Critic Loss 0.00, Action Loss -0

In [8]:
# Replay the saved policy with rendering; test() runs ten episodes.
test()