In [1]:
import os
import numpy as np

from tianshou.policy import DDPGPolicy
from tianshou.utils.net.common import Net
from tianshou.exploration import GaussianNoise
from tianshou.trainer import offpolicy_trainer
from tianshou.utils.net.continuous import Actor, Critic
from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer, to_numpy
import torch
import datetime
import pickle
import matplotlib.pyplot as plt



In [4]:
from CF_env import CFEnv

# Name of the reward function handed to every CFEnv instance created below.
usedRewardFunction = 'liming'

# Three separate environment instances built with the same reward setting:
# `env` is only inspected for its spaces, the other two are named for
# training and testing.
env, train_envs, test_envs = (CFEnv(usedRewardFunction) for _ in range(3))

# Shapes of the observation and action spaces, printed for a quick sanity check.
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print("Observations shape:", state_shape)
print("Actions shape:", action_shape)
print(env.rewardName)

Observations shape: (3,)
Actions shape: (1,)
liming


In [3]:

# Path of a previously saved policy state dict to resume from (None = start fresh).
resume_path = None

# hyper-parameters
seed = 1
actor_lr = 1e-4
critic_lr = 1e-3
tau = .001
gamma = .99
exploration_noise = .15
buffer_size = 100000
batch_size = 256

# Resolve the torch device once instead of repeating the availability check
# at every call site.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# seed
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# model
net_a = Net(state_shape, hidden_sizes=[256, 256], device=device)
actor = Actor(net_a, action_shape, device=device).to(device)
actor_optim = torch.optim.Adam(actor.parameters(), lr=actor_lr)

# The critic network takes state and action concatenated (concat=True).
net_c = Net(state_shape, action_shape,
            hidden_sizes=[256, 256],
            concat=True, device=device)
critic = Critic(net_c, device=device).to(device)
critic_optim = torch.optim.Adam(critic.parameters(), lr=critic_lr)

# Pass the environment's action *space* object, not its shape tuple.
# NOTE(review): tianshou's policy appears to clip/scale actions to the
# environment bounds only when `action_space` is a gym Box; with a plain
# tuple that mapping is silently skipped. Confirm against the installed
# tianshou version.
policy = DDPGPolicy(
    actor, actor_optim, critic, critic_optim,
    tau=tau, gamma=gamma,
    exploration_noise=GaussianNoise(sigma=exploration_noise),
    estimation_step=1, action_space=env.action_space)

# load a previous policy
if resume_path:
    policy.load_state_dict(torch.load(resume_path, map_location=device))
    print("Loaded agent from: ", resume_path)

# collector
buffer = ReplayBuffer(buffer_size)
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs)
# Pre-fill a quarter of the buffer with random actions before training starts.
train_collector.collect(n_step=buffer_size // 4, random=True)

# log
t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
log_file = f'seed_{seed}_{t0}_ddpg'
log_path = os.path.join('log', 'ddpg')
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(log_path, exist_ok=True)


def save_fn(policy):
    """Write the policy's state dict to `<log_path>/<log_file>_policy.pth`."""
    torch.save(policy.state_dict(), os.path.join(log_path, log_file + '_policy.pth'))


# trainer
result = offpolicy_trainer(
    policy, train_collector, test_collector, max_epoch=15,
    step_per_epoch=10000, step_per_collect=1, episode_per_test=1,
    batch_size=batch_size, save_fn=save_fn,
    update_per_step=1, test_in_train=False)

# Let's watch its performance!
# print('\n start testing')
# policy.eval()
# test_envs.seed(seed)
# test_collector.reset()
# result = test_collector.collect(n_episode=1)
# print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}')

#save policy
# torch.save(policy.state_dict(), os.path.join(log_path, log_file + '_policy.pth'))

Epoch #1: 10001it [02:09, 77.47it/s, env_step=10000, len=1614, loss/actor=7421.031, loss/critic=427594.035, n/ep=0, n/st=1, rew=-3175363.81]                           
Epoch #2:   0%|          | 15/10000 [00:00<02:01, 81.87it/s, env_step=10015, len=1614, loss/actor=7462.664, loss/critic=316008.168, n/ep=0, n/st=1, rew=-3175363.81]

Epoch #1: test_reward: -3167989.250000 ± 0.000000, best_reward: -1734.443604 ± 0.000000 in #0


Epoch #2: 10001it [02:06, 79.05it/s, env_step=20000, len=1614, loss/actor=18602.580, loss/critic=1595386.015, n/ep=0, n/st=1, rew=-3169050.61]                           
Epoch #3:   0%|          | 14/10000 [00:00<01:57, 85.28it/s, env_step=20014, len=1614, loss/actor=18666.871, loss/critic=1348980.447, n/ep=0, n/st=1, rew=-3169050.61]

Epoch #2: test_reward: -3167989.250000 ± 0.000000, best_reward: -1734.443604 ± 0.000000 in #0


Epoch #3: 10001it [02:06, 79.22it/s, env_step=30000, len=1614, loss/actor=30573.313, loss/critic=1964113.387, n/ep=0, n/st=1, rew=-3161062.73]                           
Epoch #4:   0%|          | 15/10000 [00:00<01:51, 89.74it/s, env_step=30015, len=1614, loss/actor=30694.356, loss/critic=2361068.220, n/ep=0, n/st=1, rew=-3161062.73]

Epoch #3: test_reward: -3167989.250000 ± 0.000000, best_reward: -1734.443604 ± 0.000000 in #0


Epoch #4: 10001it [02:06, 79.35it/s, env_step=40000, len=1614, loss/actor=43216.449, loss/critic=7929119.890, n/ep=0, n/st=1, rew=-3168184.35]                           
Epoch #5:   0%|          | 15/10000 [00:00<02:00, 82.96it/s, env_step=40015, len=1614, loss/actor=43413.108, loss/critic=9240920.619, n/ep=0, n/st=1, rew=-3168184.35]

Epoch #4: test_reward: -3167989.250000 ± 0.000000, best_reward: -1734.443604 ± 0.000000 in #0


Epoch #5: 10001it [02:06, 78.78it/s, env_step=50000, len=1614, loss/actor=55289.766, loss/critic=8471543.328, n/ep=0, n/st=1, rew=-3179741.22]                           
Epoch #6:   0%|          | 18/10000 [00:00<01:37, 102.59it/s, env_step=50018, len=1614, loss/actor=55468.651, loss/critic=10623230.201, n/ep=0, n/st=1, rew=-3179741.22]

Epoch #5: test_reward: -3167989.250000 ± 0.000000, best_reward: -1734.443604 ± 0.000000 in #0


Epoch #6:  48%|####8     | 4803/10000 [00:58<01:03, 81.64it/s, env_step=54802, len=1614, loss/actor=60525.509, loss/critic=10525524.827, n/ep=0, n/st=1, rew=-3168994.47]


KeyboardInterrupt: 

In [2]:
# Recomputes the reward and its components so they can be plotted during evaluation
# (plotting is disabled in this version).
# Must be updated by hand whenever the reward function used for training changes.
# TODO: derive this automatically from the training reward function.
def get_reward(acceleration, newSpacing, follower_speed, leader_speed):
    """Compute the step reward together with its four individual terms.

    Args:
        acceleration: value whose square is penalised (weight 4).
        newSpacing: spacing used, with ``follower_speed``, to derive the time gap.
        follower_speed: follower speed; rewarded up to a cap of 33.5.
        leader_speed: leader speed; only used for the speed-difference term.

    Returns:
        Tuple ``(reward, penalty1, reward2, reward3, penalty4)`` where
        ``reward`` is the sum of the terms that were active. ``penalty1`` and
        ``reward3`` are the placeholder ``[[0]]`` when their condition is not
        met (and then contribute nothing to ``reward``).
    """
    target_speed = 33.5
    target_gap = 1
    w_gap, w_speed, w_closing, w_accel = 1, 1, 1, 4

    # The .001 offset is presumably there to avoid dividing by zero at standstill.
    time_gap = newSpacing / (follower_speed + .001)

    total = 0

    # Time-gap penalty: active only while the gap is positive but below target.
    if target_gap > time_gap > 0:
        gap_penalty = - w_gap * (
            100 - 100 * np.sqrt(max(0, target_gap - (target_gap - time_gap) ** 2))
        )
        total += gap_penalty
    else:
        gap_penalty = [[0]]

    # Speed reward, capped at the target speed.
    speed_term = w_speed * min(target_speed, follower_speed)
    total += speed_term

    # Speed-difference reward: gap below target while the leader is faster.
    if time_gap < target_gap and leader_speed > follower_speed:
        closing_term = w_closing * (leader_speed - follower_speed) * (target_gap - time_gap)
        total += closing_term
    else:
        closing_term = [[0]]

    # Quadratic penalty on the acceleration argument.
    accel_penalty = - w_accel * acceleration ** 2
    total += accel_penalty

    return total, gap_penalty, speed_term, closing_term, accel_penalty

In [None]:
from CF_env import vehicle

SIM_RESOLUTION = .1       # step used to integrate the warm-up positions below
USED_HISTORY_STAMP = 1    # number of history samples per signal in one observation
INITIAL_SPACING = 30      # initial leader-minus-follower position gap


def _build_observation(leader, follower):
    """Concatenate leader speeds, follower speeds and their position gaps."""
    gaps = [leader.locT[i] - follower.locT[i] for i in range(USED_HISTORY_STAMP)]
    return np.concatenate((leader.speedT, follower.speedT, gaps))


# Load the policy saved during training. os.path.join keeps the path portable;
# a hard-coded '\\' separator only works on Windows.
policy_path = os.path.join(os.getcwd(), log_path, log_file + '_policy.pth')
policy.load_state_dict(torch.load(
    policy_path, map_location='cuda' if torch.cuda.is_available() else 'cpu'))
print("Loaded agent from: ", log_file)
policy.eval()
rewardHist = [[np.nan] * 4]

# Leader speed trace that drives the evaluation episode.
leadSpeedProfile = np.genfromtxt('test_leader_high.csv', delimiter=',')

# Warm-up history: both vehicles follow the leader's speeds, INITIAL_SPACING apart.
leaderLoc = [0]
followerLoc = [-INITIAL_SPACING]
for v in leadSpeedProfile[1:USED_HISTORY_STAMP]:
    leaderLoc.append(leaderLoc[-1] + v * SIM_RESOLUTION)
    followerLoc.append(followerLoc[-1] + v * SIM_RESOLUTION)
follower = vehicle(leadSpeedProfile[:USED_HISTORY_STAMP], followerLoc)
leader = vehicle(leadSpeedProfile[:USED_HISTORY_STAMP], leaderLoc)
followerSpeedProfile = leadSpeedProfile[:USED_HISTORY_STAMP]
followerSpacing = np.array([INITIAL_SPACING] * USED_HISTORY_STAMP)
observation = _build_observation(leader, follower)

# Roll the policy forward until the leader profile is exhausted or the spacing
# turns negative. Bounding the loop on `t` also avoids an IndexError when the
# profile has no samples beyond the warm-up history.
t = USED_HISTORY_STAMP
while t < len(leadSpeedProfile):
    with torch.no_grad():
        # Named `actor_output` so the trainer's `result` is not overwritten.
        actor_output = policy.actor(np.array([observation]))
    act = to_numpy(actor_output[0])
    action = policy.map_action(act)

    # NOTE(review): action_a / action_v presumably advance the vehicle by an
    # acceleration / a prescribed speed respectively — confirm in CF_env.
    follower.action_a(action)
    leader.action_v(leadSpeedProfile[t])
    newSpacing = leader.location - follower.location
    newStates = _build_observation(leader, follower)
    followerSpeedProfile = np.append(followerSpeedProfile, [follower.speed])
    followerSpacing = np.append(followerSpacing, newSpacing)
    reward, penalty1, reward2, reward3, penalty4 = get_reward(action,
                                                              newSpacing,
                                                              follower.speed,
                                                              leader.speed)
#     rewardHist.append([i[0][0] for i in [penalty1, reward2, reward3, penalty4]])
    # current obs
    t += 1
    observation = newStates
    if newSpacing < 0:
        break




In [3]:

# Sample index -> time axis; the division by 10 presumably assumes 10 samples
# per second (TODO confirm against the simulation step).
leader_time = np.arange(len(leadSpeedProfile)) / 10
follower_time = np.arange(len(followerSpeedProfile)) / 10

# Figure 1: leader and follower speed over time.
fig_speed, ax_speed = plt.subplots(figsize=(10, 3))
ax_speed.plot(leader_time, leadSpeedProfile, label='leader')
ax_speed.plot(follower_time, followerSpeedProfile, label='follower')
ax_speed.set_xlabel('time(s)')
ax_speed.set_ylabel('speed(m/s)')
# ax_speed.set_ylim(22.5, 32.5)
ax_speed.grid(True)
ax_speed.legend()
plt.show()

# Figure 2: spacing over time, on the follower's time axis.
fig_gap, ax_gap = plt.subplots(figsize=(10, 3))
ax_gap.plot(follower_time, followerSpacing)
ax_gap.set_xlabel('time(s)')
ax_gap.set_ylabel('spacing(m)')
ax_gap.grid(True)
# ax_gap.set_ylim(20, 45)

plt.show()



NameError: name 'plt' is not defined

In [None]:
# Minimum follower speed before and after sample 1000. `default` keeps min()
# from raising ValueError when a slice is empty (e.g. the rollout ended before
# sample 1000); nan is printed for that half instead.
SPLIT_INDEX = 1000
min_speed_head = min(followerSpeedProfile[:SPLIT_INDEX], default=float('nan'))
min_speed_tail = min(followerSpeedProfile[SPLIT_INDEX:], default=float('nan'))
print(round(min_speed_head, 2), round(min_speed_tail, 2))

In [None]:
# Persist the evaluation trajectories alongside the training logs.
# The directory is created if missing, and the `with` block closes the file
# even when pickle.dump raises.
os.makedirs(log_path, exist_ok=True)
with open(os.path.join(log_path, log_file + 'trajInfo'), 'wb') as file:
    pickle.dump([leadSpeedProfile, followerSpeedProfile, followerSpacing], file)