In [1]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
import time
import random
import torch
from torch import nn, optim
from common.network import DuelingNetwork
from common.replay import PrioritizedReplayBuffer
from common.trainer import Trainer
from common.hparameter import *

""" seed """
seed=0
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

''' divice '''
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [4]:
""" Network """
net_d = DuelingNetwork(28, 13).to(device)
target_net_d = DuelingNetwork(28, 13).to(device)
optimizer_d = optim.Adam(net_d.parameters(), lr=learning_rate)
loss_func = nn.SmoothL1Loss(reduction='none')
net_a = DuelingNetwork(20, 3).to(device) #dummy

""" Replay buffer """
replay_buffer_d = PrioritizedReplayBuffer(buffer_size)
beta_func = lambda step: min(beta_end, beta_begin + (beta_end - beta_begin) * (step / beta_decay))

""" Epsilon """
epsilon_func = lambda step: max(epsilon_end, epsilon_begin - (epsilon_begin - epsilon_end) * (step / epsilon_decay))

""" Trainer """
trainer_d = Trainer(net_d, target_net_d, optimizer_d, loss_func, replay_buffer_d, gamma, device)

""" Environment """
from three_on_one import ThreeOnOne
accel_d = 2
accel_a = 2
env = ThreeOnOne(accel_defender=accel_d, accel_attacker1=accel_a, accel_attacker2=accel_a, accel_attacker3=accel_a, max_step=max_step_episode)

filename_model_d = "../model/defender_test.pth"

"""for test"""
num_episodes = 1000
print_interval_episode = 100
save_interval_episode = 100
initial_buffer_size = 1000

print('Start!')
start_time = time.time()

step = 0
for episode in range(num_episodes):
    step_episode = 0
    obs_d, obs_a1, obs_a2, obs_a3, with_b_a1, with_b_a2, with_b_a3 = env.reset()
    obs_d, obs_a1, obs_a2, obs_a3 = torch.Tensor(obs_d), torch.Tensor(obs_a1), torch.Tensor(obs_a2), torch.Tensor(obs_a3)
    done = False
    total_reward_d = 0
    total_reward_a1 = 0
    total_reward_a2 = 0
    total_reward_a3 = 0
    
    while not done:
        
        action_d = net_d.act(obs_d.float().to(device), epsilon_func(step))
        
        action_a1 = net_a.act(obs_a1.float().to(device), epsilon_func(step)) #dummy
        action_a2 = net_a.act(obs_a2.float().to(device), epsilon_func(step)) #dummy
        action_a3 = net_a.act(obs_a3.float().to(device), epsilon_func(step)) #dummy
                
        with_b_a1_tmp, with_b_a2_tmp, with_b_a3_tmp = with_b_a1, with_b_a2, with_b_a3
        
        next_obs_d, next_obs_a1, next_obs_a2, next_obs_a3, reward_d, reward_a1, reward_a2, reward_a3, done, obs_a, action_a, reward_a, next_obs_a, push_a, with_b_a1, with_b_a2, with_b_a3, to_a1, to_a2, to_a3 = \
            env.step(obs_d, obs_a1, obs_a2, obs_a3, action_d, action_a1, action_a2, action_a3, step_episode)
        
        next_obs_d, next_obs_a1, next_obs_a2, next_obs_a3 = torch.Tensor(next_obs_d), torch.Tensor(next_obs_a1), torch.Tensor(next_obs_a2), torch.Tensor(next_obs_a3)
                
        total_reward_d += reward_d
        total_reward_a1 += reward_a1
        total_reward_a2 += reward_a2
        total_reward_a3 += reward_a3
        
        replay_buffer_d.push([obs_d, action_d, reward_d, next_obs_d, done])
                
        obs_d = next_obs_d
        obs_a1 = next_obs_a1
        obs_a2 = next_obs_a2
        obs_a3 = next_obs_a3

        if len(replay_buffer_d) >= initial_buffer_size:
            trainer_d.update(batch_size, beta_func(step))
            
        if (step + 1) % target_update_interval == 0:
            target_net_d.load_state_dict(net_d.state_dict())
        
        step += 1
        step_episode += 1

    if (episode + 1) % print_interval_episode == 0:
        print(f'Episode: {episode + 1},  Step: {step + 1}, Episode_step: {step_episode + 1}, Reward_d: {round(total_reward_d, 2)}')

#     if (episode + 1) % save_interval_episode == 0:
#         torch.save(net_d.state_dict(), filename_model_d)
        
end_time = time.time()
diff_time = (end_time - start_time)/3600
print('Finish!', round(diff_time, 2), '[h]')


Start!
Episode: 100,  Step: 19271, Episode_step: 157, Reward_d: 1
Episode: 200,  Step: 29027, Episode_step: 219, Reward_d: 0
Episode: 300,  Step: 33789, Episode_step: 21, Reward_d: 0
Episode: 400,  Step: 43423, Episode_step: 70, Reward_d: 0
Episode: 500,  Step: 50553, Episode_step: 101, Reward_d: 0
Episode: 600,  Step: 60966, Episode_step: 21, Reward_d: 1
Episode: 700,  Step: 69276, Episode_step: 24, Reward_d: 1
Episode: 800,  Step: 75937, Episode_step: 19, Reward_d: 1
Episode: 900,  Step: 82789, Episode_step: 61, Reward_d: 1
Episode: 1000,  Step: 89657, Episode_step: 11, Reward_d: 1
Finish! 0.03 [h]
