<a href="https://colab.research.google.com/github/spdin/time-series-prediction-lstm-pytorch/blob/master/Time_Series_Prediction_with_LSTM_Using_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Human driving - GAIL

GAIL (Generative Adversarial Imitation Learning) borrows its core idea from generative adversarial networks (GANs): training is a competition between a discriminator and a policy.

The discriminator tries to distinguish real (expert) data from fake data generated by the policy.
- Higher $D(s,a)$ value: the discriminator thinks the state-action pair comes from the expert.
- Lower $D(s,a)$ value: the discriminator thinks the state-action pair comes from the policy.
- Policy’s goal: generate pairs that make the discriminator uncertain, ideally resulting in $D(s,a)$ values that are close to those of the expert’s pairs.
 
Real (expert) data is labeled 1, while fake (policy-generated) data is labeled 0.

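The policy tries to fool the discriminator. It is designed to maximize $E[-\log(1 - D(s,a))]$, which has the same effect as maximizing $E[\log D(s,a)]$: both objectives push $D(s,a)$ toward 1 for the pairs the policy generates.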


# Library

In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn

from Utils.Environment_LC import ENVIRONMENT
from Utils.PPO import PPO
from Utils.GAIL import DISCRIMINATOR_FUNCTION   # NEW compare to PPO code

# Directory the kernel is running in; printed so relative paths below can be checked.
Path = os.getcwd()
print(Path)

# Default checkpoint file for the trained GAIL policy.
PATH = "Trained_model/GAIL_0.pth"

# CUDA: use the first GPU when one is available, otherwise fall back to the CPU.
_cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0") if _cuda_available else torch.device("cpu")
print(device)
if _cuda_available:
    # Release cached GPU memory left over from earlier runs in this kernel.
    torch.cuda.empty_cache()

c:\Users\Yannis\OneDrive - Georgia Institute of Technology\Desktop\New folder
cuda:0


In [2]:
def _load_trajectories(path):
    """Load a saved trajectory array from ``path``.

    Parameters
    ----------
    path : str
        Location of a ``.npy`` file produced by the expert-trajectory notebook.

    Returns
    -------
    numpy.ndarray
        The array stored in the file.

    Raises
    ------
    RuntimeError
        If the file is missing or cannot be decoded. The original exception is
        chained so the real cause stays visible in the traceback.
    """
    try:
        # NOTE(security): allow_pickle=True can execute arbitrary code while
        # loading; only use it on trajectory files you generated yourself.
        return np.load(path, allow_pickle=True)
    except Exception as err:  # not a bare except: Ctrl-C / SystemExit must pass through
        print("Train, generate and save expert trajectories")
        raise RuntimeError(f"Could not load trajectories from '{path}'") from err


# Expert demonstrations used to train the discriminator.
expert_traj = _load_trajectories("Expert_trajectory/expert_traj.npy")

# Held-out trajectories (not used by the cells visible in this notebook section).
testing_traj = _load_trajectories("Expert_trajectory/testing_traj.npy")


# Hyperparameters

In [3]:
# Initial driver parameters handed to the simulator. The training loop later
# overrides both through Env.AB_update() for each driver type.
# NOTE(review): the meaning of the '2000' setting is defined in
# Utils.Environment_LC - confirm there.
A_para = 'normal'
B_para = '2000'

# Simulation environment, created with noise switched off.
Env = ENVIRONMENT(para_A=A_para, para_B=B_para, noise=False)

# Model callable supplied by the environment; the training loop uses it to
# compute vehicle B's longitudinal action outside the lane-change phase.
Model_B = Env.model()

In [4]:
# ---- PPO optimisation ----
K_epochs = 16        # number of optimisation epochs per policy update
eps_clip = 0.25      # PPO clipping parameter
gamma    = 0.99      # discount factor

lr_actor  = 0.001    # learning rate of the actor network
lr_critic = 0.001    # learning rate of the critic network

has_continuous_action_space = True

update_episode = 2   # 8 was the alternative value noted by the author

# ---- Training schedule ----
total_episodes  = 400
save_model_freq = 1  # save a checkpoint every `save_model_freq` episodes

# ---- Problem dimensions ----
state_dim    = 11
action_dim   = 2
action_bound = 2

ppo_agent = PPO(
    state_dim,
    action_dim,
    action_bound,
    lr_actor,
    lr_critic,
    gamma,
    K_epochs,
    eps_clip,
    has_continuous_action_space,
    action_std_init=0.4,
)

# List every parameter tensor of the policy together with its shape.
print("Model's state_dict:")
policy_state = ppo_agent.policy.state_dict()
for param_name, param_value in policy_state.items():
    print(param_name, "\t", param_value.size())


Model's state_dict:
actor_acc.0.weight 	 torch.Size([64, 11])
actor_acc.0.bias 	 torch.Size([64])
actor_acc.2.weight 	 torch.Size([64, 64])
actor_acc.2.bias 	 torch.Size([64])
actor_acc.4.weight 	 torch.Size([1, 64])
actor_acc.4.bias 	 torch.Size([1])
actor_yaw.0.weight 	 torch.Size([64, 11])
actor_yaw.0.bias 	 torch.Size([64])
actor_yaw.2.weight 	 torch.Size([64, 64])
actor_yaw.2.bias 	 torch.Size([64])
actor_yaw.4.weight 	 torch.Size([1, 64])
actor_yaw.4.bias 	 torch.Size([1])
critic.0.weight 	 torch.Size([64, 11])
critic.0.bias 	 torch.Size([64])
critic.2.weight 	 torch.Size([64, 64])
critic.2.bias 	 torch.Size([64])
critic.4.weight 	 torch.Size([1, 64])
critic.4.bias 	 torch.Size([1])


In [5]:
# NEW compared to the plain PPO code: the GAIL discriminator.
D_epochs = 16       # number of optimisation epochs per discriminator update
lr_gail  = 0.0001   # learning rate of the discriminator

# Number of expert entries along the first axis of the loaded array
# (equivalent to len(expert_traj)).
expert_sample_size = expert_traj.shape[0]

Discriminator = DISCRIMINATOR_FUNCTION(
    state_dim,
    action_dim,
    lr_gail,
    D_epochs,
    expert_traj,
    expert_sample_size,
)

# Training

In [6]:
# ---- Lane-change (LC) completion thresholds ----
LC_end_pos = 0.5      # max lateral deviation from the target-lane centre (0.5 * lane_wid)
LC_end_yaw = 0.005    # max absolute yaw for the LC to count as finished (0.02 was also noted)
PARAS = ['aggressive', 'normal', 'cautious']

path_idx    = 0
std = 0.7
Time_len = 500
lane_wid = 3.75
veh_len  = 5.0
v_0      = 30

# Make sure the checkpoint directory exists before the first save.
os.makedirs("Trained_model", exist_ok=True)


def _lc_finished(lat_pos, yaw):
    """Return True when vehicle B sits in the centre of the target lane.

    The lane change counts as finished when the lateral position is within
    ``LC_end_pos`` of ``0.5 * lane_wid`` and the absolute yaw is at most
    ``LC_end_yaw``.
    """
    return abs(lat_pos - 0.5*lane_wid) <= LC_end_pos and abs(yaw) <= LC_end_yaw


# GAIL training: each episode rolls the policy out against every driver type,
# then updates the PPO policy and the discriminator with the collected samples.
# Columns of the environment log used below (inferred from the inline notes;
# TODO confirm against Utils.Environment_LC): 24 = lane-change flag,
# 25 = lateral position of B, 26 = yaw of B.
for episode in range(total_episodes):
    print(episode)

    # (state, action) pairs produced by the policy during this episode
    states      = []
    actions     = []

    for A_para in PARAS:  # interact with different driver types iteratively: aggressive, normal, and cautious
        B_para = A_para

        # Environment
        Env.AB_update(A_para, B_para)
        Env.reset()
        LC_start     = False
        LC_starttime = 0
        LC_endtime   = 0
        LC_mid       = 0

        for t in range(1, Time_len):
            s_t, env_t = Env.observe()       # Observation
            if t != env_t + 1:
                print('warning: time inconsistency!')

            Dat = Env.read()

            # Lane change indication: LC was flagged at the end of the last time step
            if Dat[t-1,24] != 0 and not LC_start and LC_starttime == 0:
                LC_start = True
                LC_starttime = t
            # Finish lane change - B stops in the centre of the target lane
            elif LC_start and LC_endtime == 0 and _lc_finished(Dat[t-1,25], Dat[t-1,26]):
                LC_start = False
                LC_endtime = t

            # B crosses the lane line
            if Dat[t-1,25] <= lane_wid and LC_mid == 0:
                LC_mid = t

            # Low-level task: action
            if not LC_start:
                # Longitudinal control comes from the car-following model
                if Dat[t-1,25] > lane_wid:     # B in lane 2, B follows F
                    act_0 = Model_B(Dat[t-1,9] - Dat[t-1,12] - veh_len, Dat[t-1,13], v_0, (Dat[t-1,10] - Dat[t-1,13], Dat[t-1,11] - Dat[t-1,14]))
                elif Dat[t-1,25] <= lane_wid:  # B crossed the line, B follows E
                    act_0 = Model_B(Dat[t-1,0] - Dat[t-1,12] - veh_len, Dat[t-1,13], v_0, (Dat[t-1,1] - Dat[t-1,13], Dat[t-1,2] - Dat[t-1,14]))

                # Lateral: no steering outside the lane-change phase
                act_1 = 0                   # yaw rate
                action_B = [act_0, act_1*10]

                Env.run(action_B)
            else:
                # During the lane change the learned policy drives vehicle B
                state, _ = Env.observe()

                action = ppo_agent.select_action(state)
                Env.run(action) # run human behavior
                # NOTE(review): state_next is not used below; the call is kept in
                # case Env.observe() has side effects - confirm and remove if not.
                state_next, _ = Env.observe()

                # GAIL reward: the discriminator's score for this (state, action) pair
                reward = Discriminator.reward(state, action)

                Dat_tmp = Env.read()
                lc_done = _lc_finished(Dat_tmp[t,25], Dat_tmp[t,26])
                # Also close the trajectory when the rollout hits the time limit
                # mid-lane-change, so its rewards are kept apart from the next
                # driver type's in the shared buffer (assumes PPO.update() uses
                # is_terminals as rollout boundaries - TODO confirm in Utils.PPO).
                timed_out = (t == Time_len - 1)
                done = bool(lc_done or timed_out)

                ppo_agent.buffer.rewards.append(reward)
                ppo_agent.buffer.is_terminals.append(done)

                states.append(state)
                actions.append(action)

    ############# Policy updates ##########
    if states:
        ppo_agent.update()

        # Discriminator
        # reshape(len, -1) flattens each state while keeping the batch axis even
        # for a single sample (squeeze() would drop it).
        states    = torch.FloatTensor(np.array(states)).reshape(len(states), -1).to(device)
        actions   = torch.FloatTensor(np.array(actions)).to(device)
        Discriminator.update(ppo_agent, states, actions)
    else:
        # Nothing was collected, so there is nothing to learn from in this episode.
        print('warning: no lane-change samples collected; skipping policy/discriminator update')

    ############# Record episode results ##########
    # Extend this section with the conditions under which the trained model
    # should be saved during training.

    if episode % save_model_freq == 0:
        path_idx   += 1
        max_score = -np.inf
        PATH       = "Trained_model/GAIL_"+str(path_idx)+".pth"
        ppo_agent.save(PATH)


0
1
2
3


KeyboardInterrupt: 