In [None]:
import gym
import gym_mod_cartpole
from random import random, randint, uniform
from env.decom_lunar_lander import LunarLander as LunarLander_decom_reward
from copy import deepcopy
import os
import torch
import numpy as np
from tqdm import tqdm

from GVF_learner import GVF_learner
from memory.memory import ReplayBuffer_decom
from models.dqn_model import DQNModel

# Tensor constructors used throughout the notebook. Prefer the CUDA types when
# a GPU is present; otherwise fall back to the CPU equivalents so the notebook
# can still build tensors on machines without CUDA.
FloatTensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

# Writer = SummaryWriter(log_dir="CartPole_summary")

In [None]:
# Environment selection. ENV_NAME also names the folder where results are saved.
# The previously used CartPole settings are kept below for reference.
# ENV_NAME = 'CartPoleMod-v0'
ENV_NAME = 'LunarLander_decom'
# LunarLander variant imported from env.decom_lunar_lander; its step() is
# unpacked into five values by the agent (the fifth being decomposed features).
env = LunarLander_decom_reward()
# env._max_episode_steps = 500
# Action name -> discrete action index; the position in the tuple is the index.
ACTION_DICT = {
    name: index
    for index, name in enumerate(("NOOP", "LEFT", "MAIN", "RIGHT"))
}

In [3]:
# Set the result-saving folder (named after the environment).
result_floder = ENV_NAME
result_file = ENV_NAME + "/results.txt"
# exist_ok=True replaces the check-then-create pattern, so re-running the cell
# is safe and there is no window between the check and the creation.
os.makedirs(result_floder, exist_ok=True)

In [4]:
# Hyper-parameters consumed by DQN_agent for the LunarLander run.
hyperparams_Lunarlander = dict(
    # Exploration schedule: epsilon goes linearly from 1 to final_epsilon.
    epsilon_decay_steps=200000,
    final_epsilon=0.01,
    # Replay / optimisation settings.
    batch_size=128,
    update_steps=3,          # train once every `update_steps` environment steps
    memory_size=100000,
    beta=0.99,               # discount factor of the Q-value function
    # A frequency of 1 makes the agent use the soft (tau-weighted) target update.
    model_replace_freq=1,
    learning_rate=0.0001,
    decom_reward_len=8,
    soft_tau=5e-4,
)

In [5]:
class DQN_agent(object):
    """DQN agent whose network scores (state, one-hot action) pairs.

    The network input is the state vector concatenated with a one-hot action
    encoding and its output is a single Q-value, so ranking the actions of a
    state means scoring ``action_space`` rows (see ``concat_state_action``).
    Every transition is stored in a ``ReplayBuffer_decom`` together with the
    decomposed features returned by the environment's ``step``.
    """

    def __init__(self, env, hyper_params, action_space = len(ACTION_DICT)):
        """Create the models, the replay memory and the bookkeeping counters.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` (returning
                five values) and ``_max_episode_steps``.
            hyper_params: Dict with the keys 'beta', 'final_epsilon',
                'epsilon_decay_steps', 'soft_tau', 'decom_reward_len',
                'learning_rate', 'memory_size', 'batch_size', 'update_steps'
                and 'model_replace_freq'.
            action_space: Number of discrete actions.
        """
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        # beta: discount factor of the Q-value function.
        # initial_epsilon: exploration rate when `steps` is 0.
        # final_epsilon: exploration rate once `steps` reaches
        #     `epsilon_decay_steps`; epsilon decreases linearly in between.
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.soft_tau = hyper_params['soft_tau']
        # Most recently used epsilon; set here so the attribute exists before
        # the first call to explore_or_exploit_policy.
        self.epsilon = self.initial_epsilon

        # episode: training episode counter.
        # steps: incremented once per environment step taken in `learn`.
        # action_space: number of actions of the current environment.
        self.episode = 0
        self.steps = 0
        self.best_reward = -float("inf")
        self.action_space = action_space

        # The network takes state + one-hot action and emits one Q-value.
        # eval_model: predicts actions and is the model being trained.
        # target_model: evaluates next states when building the TD target.
        state = env.reset()
        self.state_len = len(state)
        input_len = self.state_len + action_space
        output_len = 1
        self.decom_reward_len = hyper_params["decom_reward_len"]

        self.action_vector = self.get_action_vector()
        self.eval_model = DQNModel(input_len, output_len, learning_rate = hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)

        # memory: stores and samples experience for replay.
        self.memory = ReplayBuffer_decom(hyper_params['memory_size'])

        # batch_size: mini-batch size for a training update.
        # update_steps: a training update happens every `update_steps` steps.
        # model_replace_freq: how often `target_model` is synced to `eval_model`.
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps):
        """Linearly interpolate from ``initial_value`` to ``final_value``.

        Returns ``final_value`` once ``curr_steps`` reaches
        ``final_decay_steps`` (or immediately when the schedule length is not
        positive, which would otherwise divide by zero).
        """
        if final_decay_steps <= 0:
            return final_value
        decay_rate = min(curr_steps / final_decay_steps, 1)
        return initial_value - (initial_value - final_value) * decay_rate

    def get_action_vector(self):
        """Return the identity matrix whose row ``i`` is the one-hot code of action ``i``."""
        return FloatTensor(np.eye(self.action_space))

    def concat_state_action(self, states, actions = None, is_full_action = False):
        """Build network inputs by concatenating states with one-hot actions.

        Args:
            states: Batch of states. With ``is_full_action`` it is converted
                with ``FloatTensor``; otherwise it must be a tensor.
            actions: One-hot action tensor, required when ``is_full_action``
                is False.
            is_full_action: When True, every state is paired with every
                action, giving ``len(states) * action_space`` rows in which
                the rows of one state are consecutive.

        Returns:
            Tensor of states and actions concatenated along dimension 1.

        Raises:
            ValueError: If ``is_full_action`` is False and ``actions`` is None.
        """
        if is_full_action:
            # repeat + view turns each state row into `action_space` consecutive copies.
            com_state = FloatTensor(states).repeat((1, self.action_space)).view((-1, self.state_len))
            # Tile the identity matrix so each copy gets a different action.
            actions = self.action_vector.repeat((len(states), 1))
        else:
            if actions is None:
                raise ValueError("actions must be provided when is_full_action is False")
            com_state = states.clone()
            actions = actions.clone()
        return torch.cat((com_state, actions), 1)

    def explore_or_exploit_policy(self, state):
        """Epsilon-greedy action selection using the current decayed epsilon."""
        p = uniform(0, 1)
        # Get decreased epsilon
        self.epsilon = self.linear_decrease(self.initial_epsilon,
                                            self.final_epsilon,
                                            self.steps,
                                            self.epsilon_decay_steps)

        if p < self.epsilon:
            return randint(0, self.action_space - 1)
        return self.greedy_policy(state)[0]

    def greedy_policy(self, state):
        """Pick the action with the highest predicted Q-value for one state.

        Returns:
            Tuple ``(action_index, q_value, feature_vector)`` where the last
            two come from the network row belonging to the chosen action.
        """
        state_ft = FloatTensor(state).view(-1, self.state_len)
        state_action = self.concat_state_action(state_ft, is_full_action = True)
        feature_vectors, q_values = self.eval_model.predict_batch(state_action)
        # The maximum is taken over all rows, i.e. over the actions of the state.
        q_v, best_action = q_values.max(0)
        action_index = best_action.item()
        return action_index, q_v, feature_vectors[action_index]

    def update_batch(self):
        """Run one TD update on a sampled mini-batch.

        Does nothing until the memory holds at least one batch, and only
        trains on steps that are a multiple of ``update_steps``.
        """
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return

        batch = self.memory.sample(self.batch_size)

        (states_actions, _, reward, next_states,
         is_terminal, _) = batch

        next_states = FloatTensor(next_states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states_actions)

        # The target is a constant w.r.t. the optimised parameters, so skip
        # building an autograd graph for the target network's forward pass.
        next_state_actions = self.concat_state_action(next_states, is_full_action = True)
        with torch.no_grad():
            _, q_next = self.target_model.predict_batch(next_state_actions)
        # One row per next state, one column per action.
        q_max, _ = q_next.view((-1, self.action_space)).max(1)

        # No bootstrapping from terminal states.
        q_max = (1 - terminal) * q_max
        q_target = (reward + self.beta * q_max).unsqueeze(1)

        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        """Alternate ``test_interval`` training episodes with one evaluation.

        Returns:
            List with the average evaluation reward after each interval.
        """
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)
            # evaluate
            avg_reward = self.evaluate((i + 1) * test_interval)
            all_results.append(avg_reward)

        return all_results

    def get_features_decom(self, state, next_state, done):
        """Threshold-based decomposed features for a 4-dimensional state.

        Unused by ``learn`` (its call there is disabled; the environment
        supplies the features instead). ``next_state`` is unpacked into
        exactly four values named after cart-pole quantities, so this helper
        cannot be applied to states of any other length. ``state`` and
        ``done`` are not used.

        Returns:
            Array of ``decom_reward_len`` ones in which entry ``2k`` is -1
            when value ``k`` exceeds its threshold and entry ``2k + 1`` is -1
            when it is below the negated threshold.
        """
        threshold_x = 1
        threshold_c_v = 1
        threshold_angle = 0.07
        threshold_p_v = 0.7

        features_decom = np.ones(self.decom_reward_len)
        next_cart_position, next_cart_velocity, next_pole_angle, next_pole_velocity = next_state

        checks = (
            (next_cart_position, threshold_x),
            (next_cart_velocity, threshold_c_v),
            (next_pole_angle, threshold_angle),
            (next_pole_velocity, threshold_p_v),
        )
        for k, (value, threshold) in enumerate(checks):
            if threshold < value:
                features_decom[2 * k] = -1
            if -threshold > value:
                features_decom[2 * k + 1] = -1
        return features_decom

    def learn(self, test_interval):
        """Train for ``test_interval`` episodes with epsilon-greedy exploration."""
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                self.steps += 1

                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, _, features_decom = self.env.step(action)

                action_vector = np.zeros(self.action_space)
                action_vector[action] = 1

                # An episode that ends exactly on the step limit is not stored
                # as terminal.
                terminal = steps < self.max_episode_steps and done
                state_action = np.concatenate((state.copy(), action_vector.copy()), axis=0)
                self.memory.add(state_action, -1, reward, next_state.copy(), terminal, features_decom)
                self.update_batch()

                if self.steps % self.model_replace_freq == 0:
                    # A frequency of 1 selects the soft (tau-weighted) update;
                    # any other frequency copies the model outright.
                    if self.model_replace_freq == 1:
                        self.target_model.replace_soft(self.eval_model, tau = self.soft_tau)
                    else:
                        self.target_model.replace(self.eval_model)
                state = next_state

    def evaluate(self, episode_num, trials = 10):
        """Run ``trials`` greedy episodes and return their average total reward.

        Saves the model and the replay memory whenever the average is at
        least as good as the best seen so far. ``episode_num`` is currently
        not used.
        """
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)[0]
                state, reward, done, _, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        print("storage len:", len(self.memory))
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
            print("save")

        return avg_reward

    def save_model(self):
        """Save ``eval_model`` and the replay memory under ``result_floder``."""
        self.eval_model.save(result_floder + '/best_model.pt')
        self.memory.save(result_floder)

    def load_model(self):
        """Load ``eval_model`` and the replay memory from ``result_floder``."""
        self.eval_model.load(result_floder + '/best_model.pt')
        self.memory.load(result_floder)

## Train Lunar Lander DQN agent
Generate the policy and the feature dataset

In [6]:
# Total number of training episodes, and how many of them run between two
# evaluations of the greedy policy.
training_episodes = 10000
test_interval = 100

agent = DQN_agent(env, hyperparams_Lunarlander)
result = agent.learn_and_evaluate(training_episodes, test_interval)

Training:   0%|          | 0/100 [00:00<?, ?it/s]

Using GPU
Using GPU


Training:   2%|▏         | 2/100 [00:00<00:14,  6.66it/s]


KeyboardInterrupt: 

## GVF learner
Train the GVF model based on the dataset and policy above

In [6]:
# Settings handed to GVF_learner.
CP_GVF_PARAMETERS = {
    # Mini-batch size of one update.
    "batch size": 128,
    "learning rate": 0.0001,
    # Number (length) of features.
    "feature num": 8,
    "state length": 8,
    # One discount factor per feature.
    "discount factor": [0.99] * 8,
    "action space": 4,
    # Earlier setting: 'model_replace_freq' = 300, 'soft_tau' = 0.001
    'model_replace_freq': 100,
    'soft_tau': 5e-4,
}
# Build a fresh agent, then restore the best saved model weights and the
# replay memory that save_model() wrote into the result folder.
dqn = DQN_agent(env, hyperparams_Lunarlander)
dqn.load_model()
def policy(state_actions):
    """Return the greedy action index of the loaded DQN for each state.

    Args:
        state_actions: Batch of network inputs accepted by
            ``dqn.eval_model.predict_batch``. The Q-values are reshaped to
            ``(-1, dqn.action_space)``, so the rows are presumably grouped
            per state with one row per action — confirm against the caller.

    Returns:
        Tensor with the index of the highest Q-value in each reshaped row.
    """
    _, q_values = dqn.eval_model.predict_batch(state_actions)
    # One row per state, one column per action; detached from autograd.
    q_per_state = q_values.view((-1, dqn.action_space)).detach()
    _, best_actions = q_per_state.max(1)
    return best_actions

def get_dataset():
    """Return the DQN agent's stored transitions as a list of lists.

    Prints the first ten transitions as a quick sanity check. Each stored
    transition is converted to a plain list in pure Python: the entries mix
    arrays of different lengths with scalars, and building a NumPy array from
    such ragged data raises ``ValueError`` on recent NumPy versions.
    """
    dataset = dqn.memory._storage
    print(dataset[:10])
    return [list(transition) for transition in dataset]


Using GPU
Using GPU


In [7]:
# Export the transitions held in the loaded agent's replay memory.
dataset = get_dataset()
# Train the GVF model on that dataset, following the greedy DQN `policy`.
gvf_learner = GVF_learner(CP_GVF_PARAMETERS, dataset, policy)
# NOTE(review): the arguments presumably mean (training iterations,
# evaluation/logging interval) — confirm against GVF_learner.learn_and_eval.
gvf_learner.learn_and_eval(1000, 10)

[(array([ 0.10382108,  0.60593271,  0.14626195, -0.67588508,  0.06564762,
       -0.0192917 ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ]), -1, 1.4214518519328976, array([ 0.10537729,  0.5908259 ,  0.15596962, -0.67139524,  0.06530218,
       -0.00690881,  0.        ,  0.        ], dtype=float32), False, [-0.014613074054139696, -0.00225600574639373, -0.00034543871879577637, 0.0, 0.0, 0.01, 0.0, 0]), (array([ 0.10537729,  0.59082592,  0.15596962, -0.67139524,  0.06530218,
       -0.00690881,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ]), -1, 4.328498174842946, array([ 0.10685253,  0.5764145 ,  0.14798655, -0.6404877 ,  0.06484399,
       -0.00916355,  0.        ,  0.        ], dtype=float32), False, [-0.01391495120266295, -0.031911842190797124, -0.00045818835496902466, 0.0, 0.0, 0.01, 0.0, 0]), (array([ 0.10685253,  0.57641453,  0.14798655, -0.64048767,  0.06484399,
       -0.00916355,  0.        ,  0.     

  0%|          | 1/1000 [00:03<58:34,  3.52s/it]

train loss : 0.003196578075243023


  1%|          | 11/1000 [00:36<54:42,  3.32s/it]

train loss : 0.0004636180208837542


  2%|▏         | 21/1000 [01:09<55:03,  3.37s/it]

train loss : 0.00037199559958253886


  3%|▎         | 31/1000 [01:43<54:15,  3.36s/it]

train loss : 0.0003664212837133317


  4%|▍         | 41/1000 [02:17<53:58,  3.38s/it]

train loss : 0.0003526521075381915


  5%|▌         | 51/1000 [02:50<53:13,  3.37s/it]

train loss : 0.0003497677548542706


  6%|▌         | 61/1000 [03:24<52:26,  3.35s/it]

train loss : 0.00035120272828197396


  7%|▋         | 71/1000 [03:57<51:39,  3.34s/it]

train loss : 0.0003643697391662662


  8%|▊         | 81/1000 [04:31<50:59,  3.33s/it]

train loss : 0.000363384985345695


  9%|▉         | 91/1000 [05:04<50:33,  3.34s/it]

train loss : 0.0003498347004285425


 10%|█         | 101/1000 [05:37<49:58,  3.34s/it]

train loss : 0.00034310379487578757


 11%|█         | 111/1000 [06:11<49:43,  3.36s/it]

train loss : 0.0003572095902432697


 12%|█▏        | 121/1000 [06:45<49:25,  3.37s/it]

train loss : 0.00035589936851000394


 13%|█▎        | 131/1000 [07:18<48:52,  3.37s/it]

train loss : 0.0003597012577269374


 14%|█▍        | 141/1000 [07:52<48:08,  3.36s/it]

train loss : 0.0003550283631401868


 15%|█▌        | 151/1000 [08:26<48:01,  3.39s/it]

train loss : 0.0003536543958118338


 16%|█▌        | 161/1000 [09:00<47:43,  3.41s/it]

train loss : 0.00035802731049106257


 17%|█▋        | 171/1000 [09:33<46:34,  3.37s/it]

train loss : 0.00036100297322375595


 18%|█▊        | 181/1000 [10:08<48:29,  3.55s/it]

train loss : 0.00035819332916079663


 19%|█▉        | 191/1000 [10:43<46:15,  3.43s/it]

train loss : 0.00036647299716338134


 20%|██        | 201/1000 [11:17<44:36,  3.35s/it]

train loss : 0.0003508519522021459


 21%|██        | 211/1000 [11:50<44:09,  3.36s/it]

train loss : 0.0003873337318276977


 22%|██▏       | 221/1000 [12:24<43:26,  3.35s/it]

train loss : 0.00036987531978032275


 23%|██▎       | 231/1000 [12:57<42:41,  3.33s/it]

train loss : 0.0003675684486963915


 24%|██▍       | 241/1000 [13:31<43:06,  3.41s/it]

train loss : 0.0003709778020748972


 25%|██▌       | 251/1000 [14:06<43:19,  3.47s/it]

train loss : 0.00038518775140324306


 26%|██▌       | 261/1000 [14:40<41:44,  3.39s/it]

train loss : 0.0003844504587954003


 27%|██▋       | 271/1000 [15:14<41:05,  3.38s/it]

train loss : 0.00039342637697104937


 28%|██▊       | 281/1000 [15:48<40:39,  3.39s/it]

train loss : 0.00037602953957584315


 29%|██▉       | 291/1000 [16:21<39:45,  3.37s/it]

train loss : 0.00038580651468861745


 30%|███       | 301/1000 [16:55<39:24,  3.38s/it]

train loss : 0.0004248920443055132


 31%|███       | 311/1000 [17:29<38:27,  3.35s/it]

train loss : 0.0004056327092434343


 32%|███▏      | 321/1000 [18:02<37:56,  3.35s/it]

train loss : 0.00039802700009420596


 33%|███▎      | 331/1000 [18:36<37:15,  3.34s/it]

train loss : 0.0004469767594194907


 34%|███▍      | 341/1000 [19:09<36:53,  3.36s/it]

train loss : 0.0004284293128150281


 35%|███▌      | 351/1000 [19:43<36:42,  3.39s/it]

train loss : 0.0004055026495099619


 36%|███▌      | 361/1000 [20:17<36:44,  3.45s/it]

train loss : 0.0003913175769923928


 37%|███▋      | 371/1000 [20:51<35:17,  3.37s/it]

train loss : 0.000407024529222396


 38%|███▊      | 381/1000 [21:25<34:57,  3.39s/it]

train loss : 0.0003798374208602179


 39%|███▉      | 391/1000 [21:58<34:07,  3.36s/it]

train loss : 0.0004144185302888348


 40%|████      | 401/1000 [22:32<33:19,  3.34s/it]

train loss : 0.00041388103841258215


 41%|████      | 411/1000 [23:05<33:17,  3.39s/it]

train loss : 0.0004252979950372712


 42%|████▏     | 421/1000 [23:39<32:18,  3.35s/it]

train loss : 0.0004048300931020878


 43%|████▎     | 431/1000 [24:13<31:52,  3.36s/it]

train loss : 0.00040890725002732916


 44%|████▍     | 441/1000 [24:47<31:21,  3.37s/it]

train loss : 0.00048215793622078556


 45%|████▌     | 451/1000 [25:21<31:30,  3.44s/it]

train loss : 0.0004322719939067648


 46%|████▌     | 461/1000 [25:54<30:18,  3.37s/it]

train loss : 0.0004144295462282271


 47%|████▋     | 471/1000 [26:28<29:59,  3.40s/it]

train loss : 0.0004117388008588888


 48%|████▊     | 481/1000 [27:02<29:14,  3.38s/it]

train loss : 0.0004516238655933403


 49%|████▉     | 491/1000 [27:36<28:33,  3.37s/it]

train loss : 0.0004906936722193949


 50%|█████     | 501/1000 [28:10<27:54,  3.36s/it]

train loss : 0.0004827405004400362


 51%|█████     | 511/1000 [28:43<27:36,  3.39s/it]

train loss : 0.0004720130416950511


 52%|█████▏    | 521/1000 [29:17<26:40,  3.34s/it]

train loss : 0.00044805536241195574


 53%|█████▎    | 531/1000 [29:50<26:02,  3.33s/it]

train loss : 0.0004552314112562409


 54%|█████▍    | 541/1000 [30:23<25:24,  3.32s/it]

train loss : 0.0004845858985272175


 55%|█████▌    | 551/1000 [30:57<24:51,  3.32s/it]

train loss : 0.0004974324048789013


 56%|█████▌    | 561/1000 [31:30<24:44,  3.38s/it]

train loss : 0.00048524845778793834


 57%|█████▋    | 571/1000 [32:04<23:44,  3.32s/it]

train loss : 0.0005126559171680352


 58%|█████▊    | 581/1000 [32:37<23:19,  3.34s/it]

train loss : 0.0005297528853675093


KeyboardInterrupt: 

In [8]:

# Spot check of the trained GVF model on one hand-written input row of 12 values.
# NOTE(review): presumably 8 state values followed by a 4-way one-hot action
# (index 2 set) — confirm against the dataset layout.
test_data = torch.tensor([[0.015588092617690563, -0.0004975938936695457, 4.084892424316422e-08, 6.387576578781307e-10, 0.0024009563494473696, -9.596109151743804e-08, 1, 1, 0, 0, 1, 0]]).cuda()
print(test_data.size())
# Forward pass through the learner's eval model.
r = gvf_learner.eval_model(test_data)
print(r)

torch.Size([1, 12])
tensor([[ 0.0022,  0.0017, -0.0011, -0.0634, -0.1049,  0.0058, -0.0025,  0.7100]],
       device='cuda:0', grad_fn=<AddmmBackward>)
