In [1]:
import gym
import gym_mod_cartpole
from random import random, randint, uniform
from copy import deepcopy
import os
import torch
import numpy as np
from tqdm import tqdm

from GVF_learner import GVF_learner
from memory.memory import ReplayBuffer_decom
from models.dqn_model import DQNModel

# Tensor constructors used throughout the notebook. Fall back to the CPU
# tensor types when CUDA is unavailable so tensors can still be built on a
# machine without a GPU.
# NOTE(review): the imported DQNModel prints "Using GPU" -- confirm it picks
# its device the same way so models and tensors end up on the same device.
FloatTensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

# Writer = SummaryWriter(log_dir="CartPole_summary")

In [2]:
# Environment under study. Commented-out alternative: 'CartPoleMod-v0'.
ENV_NAME = 'CartPole-v1'
env = gym.make(ENV_NAME)
# env._max_episode_steps = 500

# Named discrete actions; its length is the default `action_space` of DQN_agent.
ACTION_DICT = dict(LEFT=0, RIGHT=1)

  result = entry_point.load(False)


In [3]:
# Set up the folder where results (best model, replay memory) are saved.
result_floder = ENV_NAME
result_file = ENV_NAME + "/results.txt"
# exist_ok=True replaces the isdir() check followed by mkdir(), which could
# fail if the directory appeared between the check and the creation.
os.makedirs(result_floder, exist_ok=True)

In [4]:
# Hyper-parameters consumed by DQN_agent.__init__ (looked up by key).
hyperparams_CarPole = dict(
    epsilon_decay_steps=200_000,  # steps over which epsilon decays linearly
    final_epsilon=0.05,           # epsilon once the decay has finished
    batch_size=128,               # replay mini-batch size per update
    update_steps=5,               # train once every this many agent steps
    memory_size=200_000,          # size passed to the replay buffer
    beta=0.99,                    # discount factor of the Q-value target
    model_replace_freq=500,       # steps between target-network replacements
    learning_rate=1e-5,           # passed to the evaluation DQNModel
    decom_reward_len=8,           # length of the per-transition feature vector
    soft_tau=5e-4,                # tau for replace_soft (used only when model_replace_freq == 1)
)

In [5]:
class DQN_agent(object):
    """DQN agent whose Q-network scores (state, one-hot action) rows.

    The network input is the state vector concatenated with a one-hot action
    vector and its output is a single Q-value, so evaluating every action of a
    state takes one input row per action (see ``concat_state_action``).
    Every transition stored in the replay memory also carries a vector of
    threshold-based features computed by ``get_features_decom``.
    """

    def __init__(self, env, hyper_params, action_space = len(ACTION_DICT)):
        """Create the networks, the replay memory and the exploration schedule.

        Args:
            env: Environment exposing ``_max_episode_steps``, ``reset()`` and
                ``step(action)``; ``reset()`` is called once here to read the
                state length.
            hyper_params: Mapping with the keys ``beta``, ``final_epsilon``,
                ``epsilon_decay_steps``, ``soft_tau``, ``decom_reward_len``,
                ``learning_rate``, ``memory_size``, ``batch_size``,
                ``update_steps`` and ``model_replace_freq``.
            action_space: Number of discrete actions.
        """
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        # beta: discount factor of the Q-value target.
        # epsilon: exploration probability, decreased linearly from
        #   initial_epsilon (at step 0) to final_epsilon (reached after
        #   epsilon_decay_steps steps, then held constant).
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.soft_tau = hyper_params['soft_tau']
        # Defined up front so the attribute exists before the first
        # exploratory action is requested.
        self.epsilon = self.initial_epsilon

        # episode: number of completed training episodes.
        # steps: number of environment steps taken during training.
        # best_reward: best average evaluation reward seen so far.
        # action_space: number of discrete actions, e.g. 2.
        self.episode = 0
        self.steps = 0
        self.best_reward = -float("inf")
        self.action_space = action_space

        # The network input is the state followed by a one-hot action, and the
        # output is the scalar Q-value of that pair.
        state = env.reset()
        self.state_len = len(state)
        input_len = self.state_len + action_space
        output_len = 1
        self.decom_reward_len = hyper_params["decom_reward_len"]

        self.action_vector = self.get_action_vector()
        # eval_model selects actions and is trained; target_model provides the
        # next-state Q-values used to build the training target.
        self.eval_model = DQNModel(input_len, output_len, learning_rate = hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)

        # memory: stores transitions and samples mini-batches for replay.
        self.memory = ReplayBuffer_decom(hyper_params['memory_size'])

        # batch_size: mini-batch size for training.
        # update_steps: train the model once every this many steps.
        # model_replace_freq: steps between target_model replacements.
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps):
        """Linearly interpolate from ``initial_value`` to ``final_value``.

        Returns ``final_value`` once ``curr_steps`` reaches
        ``final_decay_steps``. A non-positive ``final_decay_steps`` means
        there is no decay window, so ``final_value`` is returned directly
        instead of dividing by zero.
        """
        if final_decay_steps <= 0:
            return final_value
        decay_rate = min(curr_steps / final_decay_steps, 1)
        return initial_value - (initial_value - final_value) * decay_rate

    def get_action_vector(self):
        """Return the ``action_space`` x ``action_space`` identity as a FloatTensor.

        Row ``i`` is the one-hot encoding of action ``i``.
        """
        return FloatTensor(np.eye(self.action_space))

    def concat_state_action(self, states, actions = None, is_full_action = False):
        """Build network input rows by appending action vectors to states.

        Args:
            states: Batch of states with ``state_len`` columns.
            actions: Action vectors, one per state; required unless
                ``is_full_action`` is set.
            is_full_action: When true, every state is paired with every
                one-hot action, giving ``action_space`` consecutive rows per
                state.

        Returns:
            Tensor of states and actions concatenated along dimension 1.

        Raises:
            ValueError: If ``actions`` is missing and ``is_full_action`` is false.
        """
        if is_full_action:
            # Repeat each state action_space times in a row, then tile the
            # one-hot matrix so row k of a state lines up with action k.
            com_state = FloatTensor(states).repeat((1, self.action_space)).view((-1, self.state_len))
            actions = self.action_vector.repeat((len(states), 1))
        else:
            if actions is None:
                raise ValueError("actions must be provided when is_full_action is False")
            com_state = states.clone()
            actions = actions.clone()
        return torch.cat((com_state, actions), 1)

    def explore_or_exploit_policy(self, state):
        """Pick an epsilon-greedy action for ``state`` and record the epsilon used."""
        p = uniform(0, 1)
        # Epsilon for the current training step.
        self.epsilon = self.linear_decrease(self.initial_epsilon,
                                            self.final_epsilon,
                                            self.steps,
                                            self.epsilon_decay_steps)

        if p < self.epsilon:
            return randint(0, self.action_space - 1)
        return self.greedy_policy(state)[0]

    def greedy_policy(self, state):
        """Return ``(best_action, q_value, feature_vector)`` for a single state.

        All actions are scored by ``eval_model`` in one batch; the feature
        vector is the model's first output for the chosen action's row.
        """
        state_ft = FloatTensor(state).view(-1, self.state_len)
        state_action = self.concat_state_action(state_ft, is_full_action = True)
        feature_vectors, q_values = self.eval_model.predict_batch(state_action)
        q_v, best_action = q_values.max(0)
        best_index = best_action.item()
        return best_index, q_v, feature_vectors[best_index]

    def update_batch(self):
        """Run one training update of ``eval_model`` from replayed transitions.

        Does nothing until the memory holds at least ``batch_size``
        transitions, and otherwise only on every ``update_steps``-th step.
        """
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return

        batch = self.memory.sample(self.batch_size)

        (states_actions, _, reward, next_states,
         is_terminal, _) = batch

        next_states = FloatTensor(next_states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)

        # Q-values of the (state, action) pairs that were actually taken.
        _, q_values = self.eval_model.predict_batch(states_actions)

        # Score every action of each next state with the target network and
        # regroup the flat output into one row of Q-values per next state.
        next_state_actions = self.concat_state_action(next_states, is_full_action = True)
        _, q_next = self.target_model.predict_batch(next_state_actions)
        q_next = q_next.view((-1, self.action_space))
        q_max, _ = q_next.detach().max(1)

        # Terminal transitions do not bootstrap from the next state.
        q_max = (1 - terminal) * q_max
        q_target = reward + self.beta * q_max
        q_target = q_target.unsqueeze(1)

        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        """Alternate training and evaluation.

        Trains ``test_interval`` episodes at a time and evaluates after each
        chunk, for ``training_episodes // test_interval`` rounds.

        Returns:
            List with the average evaluation reward of each round.
        """
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)
            # evaluate
            avg_reward = self.evaluate((i + 1) * test_interval)
            all_results.append(avg_reward)

        return all_results

    def get_features_decom(self, state, next_state, done):
        """Compute the threshold features of a transition from ``next_state``.

        The result has ``decom_reward_len`` entries, all 1 by default. For
        each of the four next-state components (cart position, cart velocity,
        pole angle, pole velocity) entry ``2k`` becomes -1 when the value
        exceeds its threshold and entry ``2k + 1`` becomes -1 when it falls
        below the negated threshold. ``state`` and ``done`` are accepted but
        not used.
        """
        threshold_x = 1
        threshold_c_v = 1
        threshold_angle = 0.07
        threshold_p_v = 0.7

        features_decom = np.ones(self.decom_reward_len)
        next_cart_position, next_cart_velocity, next_pole_angle, next_pole_velocity = next_state

        checks = (
            (next_cart_position, threshold_x),
            (next_cart_velocity, threshold_c_v),
            (next_pole_angle, threshold_angle),
            (next_pole_velocity, threshold_p_v),
        )
        for k, (value, threshold) in enumerate(checks):
            if threshold < value:
                features_decom[2 * k] = -1
            if -threshold > value:
                features_decom[2 * k + 1] = -1
        return features_decom

    def learn(self, test_interval):
        """Train for ``test_interval`` episodes with the epsilon-greedy policy.

        Each step stores the transition in memory, attempts a training update
        and, every ``model_replace_freq`` steps, refreshes the target network
        (softly when ``model_replace_freq`` is 1, by full replacement
        otherwise).
        """
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                self.steps += 1

                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, _ = self.env.step(action)

                # An episode that ends exactly at the step limit is not
                # recorded as terminal.
                terminal = steps < self.max_episode_steps and done
                features_decom = self.get_features_decom(state, next_state, terminal)
                action_vector = np.zeros(self.action_space)
                action_vector[action] = 1

                # np.concatenate allocates a new array, so the stored row does
                # not alias `state`.
                # NOTE(review): the second argument (-1) looks like a
                # placeholder slot of the buffer's record layout -- confirm.
                state_action = np.concatenate((state, action_vector), axis=0)
                self.memory.add(state_action, -1, reward, next_state, terminal, features_decom)
                self.update_batch()

                if self.steps % self.model_replace_freq == 0:
                    if self.model_replace_freq == 1:
                        self.target_model.replace_soft(self.eval_model, tau = self.soft_tau)
                    else:
                        self.target_model.replace(self.eval_model)
                state = next_state

            # Keep the episode counter in step with the training actually done.
            self.episode += 1

    def evaluate(self, episode_num, trials = 10):
        """Run ``trials`` greedy episodes and return their average total reward.

        Prints the average; when it is at least the best seen so far, the
        model and memory are saved and "save" is printed. ``episode_num`` is
        accepted but not used.
        """
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)[0]
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
            print("save")

        return avg_reward

    def save_model(self):
        """Save ``eval_model`` and the replay memory under ``result_floder``."""
        self.eval_model.save(result_floder + '/best_model.pt')
        self.memory.save(result_floder)

    def load_model(self):
        """Load ``eval_model`` and the replay memory from ``result_floder``.

        ``target_model`` is not touched here; it keeps its current weights
        until the next replacement performed in ``learn``.
        """
        self.eval_model.load(result_floder + '/best_model.pt')
        self.memory.load(result_floder)

## Train Cart Pole DQN agent
Train the agent to generate the policy and the feature dataset used in the next section.

In [6]:
# Train in chunks of `test_interval` episodes, running a greedy evaluation
# after each chunk, until `training_episodes` episodes have been trained.
training_episodes = 10000
test_interval = 100
agent = DQN_agent(env=env, hyper_params=hyperparams_CarPole)
result = agent.learn_and_evaluate(training_episodes=training_episodes,
                                  test_interval=test_interval)

Training:   7%|▋         | 7/100 [00:00<00:01, 58.97it/s]

Using GPU
Using GPU


Training: 100%|██████████| 100/100 [00:01<00:00, 77.63it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 232.72it/s]
Training:   9%|▉         | 9/100 [00:00<00:01, 83.02it/s]

9.5
save


Training: 100%|██████████| 100/100 [00:01<00:00, 80.26it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 220.29it/s]
Training:  10%|█         | 10/100 [00:00<00:00, 92.84it/s]

9.9
save


Training: 100%|██████████| 100/100 [00:01<00:00, 83.37it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 218.38it/s]
Training:   8%|▊         | 8/100 [00:00<00:01, 79.62it/s]

10.0
save


Training: 100%|██████████| 100/100 [00:01<00:00, 70.79it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 22.79it/s]
Training:   0%|          | 0/100 [00:00<?, ?it/s]

96.7
save


Training: 100%|██████████| 100/100 [00:01<00:00, 69.24it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 32.11it/s]
Training:   7%|▋         | 7/100 [00:00<00:01, 59.98it/s]

69.7


Training: 100%|██████████| 100/100 [00:01<00:00, 68.80it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 28.50it/s]
Training:   9%|▉         | 9/100 [00:00<00:01, 64.85it/s]

79.4


Training: 100%|██████████| 100/100 [00:01<00:00, 67.89it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 21.86it/s]


104.4


Training:   5%|▌         | 5/100 [00:00<00:02, 43.58it/s]

save


Training: 100%|██████████| 100/100 [00:01<00:00, 67.91it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 15.45it/s]


147.6


Training:   4%|▍         | 4/100 [00:00<00:02, 39.58it/s]

save


Training: 100%|██████████| 100/100 [00:01<00:00, 61.69it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 13.87it/s]


163.8


Training:   5%|▌         | 5/100 [00:00<00:02, 44.94it/s]

save


Training: 100%|██████████| 100/100 [00:01<00:00, 59.18it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  7.60it/s]


303.1


Training:   7%|▋         | 7/100 [00:00<00:01, 67.38it/s]

save


Training: 100%|██████████| 100/100 [00:01<00:00, 51.70it/s]
Evaluating: 100%|██████████| 10/10 [00:00<00:00, 10.37it/s]
Training:   3%|▎         | 3/100 [00:00<00:04, 21.72it/s]

220.7


Training: 100%|██████████| 100/100 [00:02<00:00, 48.71it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  6.26it/s]


366.4


Training:   4%|▍         | 4/100 [00:00<00:02, 37.60it/s]

save


Training: 100%|██████████| 100/100 [00:02<00:00, 47.66it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.75it/s]
Training:   5%|▌         | 5/100 [00:00<00:02, 45.58it/s]

263.0


Training: 100%|██████████| 100/100 [00:02<00:00, 42.94it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  6.46it/s]
Training:   4%|▍         | 4/100 [00:00<00:02, 34.27it/s]

355.0


Training: 100%|██████████| 100/100 [00:02<00:00, 42.96it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  6.50it/s]
Training:   6%|▌         | 6/100 [00:00<00:01, 57.24it/s]

352.9


Training: 100%|██████████| 100/100 [00:02<00:00, 40.00it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  6.33it/s]
Training:   4%|▍         | 4/100 [00:00<00:03, 31.93it/s]

359.1


Training: 100%|██████████| 100/100 [00:03<00:00, 33.21it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  6.13it/s]
Training:   4%|▍         | 4/100 [00:00<00:02, 33.53it/s]

361.6


Training: 100%|██████████| 100/100 [00:03<00:00, 32.34it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  6.08it/s]


368.6


Training:   4%|▍         | 4/100 [00:00<00:02, 37.59it/s]

save


Training:  58%|█████▊    | 58/100 [00:01<00:01, 31.81it/s]


KeyboardInterrupt: 

## GVF learner
Train the GVF model based on the dataset and policy produced above.

In [7]:
# Settings handed to GVF_learner.
CP_GVF_PARAMETERS = {
    # update batch size
    "batch size": 64,
    "learning rate": 0.0001,
    # number/length of features
    "feature num": 8,
    "state length": 4,
    # one discount factor per feature
    "discount factor": [0.99] * 8,
    "action space": 2,
}
# Build a fresh agent, then restore the best saved evaluation network and the
# replay memory from the result folder (see DQN_agent.load_model).
dqn = DQN_agent(env, hyperparams_CarPole)
dqn.load_model()
def policy(state_actions):
    """Return the greedy action index of the loaded DQN for each state.

    ``state_actions`` holds one row per (state, action) pair, with the
    ``dqn.action_space`` rows of a state stored consecutively. The flat
    Q-value output is regrouped into one row per state and the index of the
    largest value in each row is returned.
    """
    _features, q_flat = dqn.eval_model.predict_batch(state_actions)
    q_per_state = q_flat.view((-1, dqn.action_space))
    best_actions = q_per_state.detach().max(1)[1]
    return best_actions

def get_dataset():
    """Return the transitions stored in the DQN replay memory as a list of lists.

    Prints the first ten stored transitions, then converts every transition
    to a plain list while leaving its elements untouched.

    The records mix arrays of different lengths with scalars, so they are
    converted one by one instead of through ``np.array(...).tolist()``:
    building an array from such ragged sequences raises ``ValueError`` on
    NumPy >= 1.24.
    """
    dataset = dqn.memory._storage
    print(dataset[:10])
    return [list(transition) for transition in dataset]


Using GPU
Using GPU


In [8]:
# Fit the GVF model on the replay transitions, using the greedy DQN policy.
dataset = get_dataset()
gvf_learner = GVF_learner(CP_GVF_PARAMETERS, dataset, policy)
# NOTE(review): judging by the recorded output (progress bar out of 1000,
# a train loss every 10 iterations) the arguments look like total iterations
# and report interval -- confirm against GVF_learner.learn_and_eval.
gvf_learner.learn_and_eval(1000, 10)

[(array([ 0.01070925, -0.04095484,  0.02713253,  0.04840564,  1.        ,
        0.        ]), -1, 1.0, array([ 0.00989016, -0.23645513,  0.02810065,  0.34952413]), False, array([1., 1., 1., 1., 1., 1., 1., 1.])), (array([ 0.00989016, -0.23645513,  0.02810065,  0.34952413,  1.        ,
        0.        ]), -1, 1.0, array([ 0.00516105, -0.43196524,  0.03509113,  0.650934  ]), False, array([1., 1., 1., 1., 1., 1., 1., 1.])), (array([ 0.00516105, -0.43196524,  0.03509113,  0.650934  ,  1.        ,
        0.        ]), -1, 1.0, array([-0.00347825, -0.62755791,  0.04810981,  0.95445708]), False, array([ 1.,  1.,  1.,  1.,  1.,  1., -1.,  1.])), (array([-0.00347825, -0.62755791,  0.04810981,  0.95445708,  0.        ,
        1.        ]), -1, 1.0, array([-0.01602941, -0.43311503,  0.06719895,  0.67726904]), False, array([1., 1., 1., 1., 1., 1., 1., 1.])), (array([-0.01602941, -0.43311503,  0.06719895,  0.67726904,  0.        ,
        1.        ]), -1, 1.0, array([-0.02469171, -0.2389879 

  0%|          | 1/1000 [00:02<45:03,  2.71s/it]

train loss : 0.16801116568269478


  1%|          | 11/1000 [00:29<44:33,  2.70s/it]

train loss : 0.026220173393445994


  2%|▏         | 21/1000 [00:56<44:15,  2.71s/it]

train loss : 0.013907237748605776


  3%|▎         | 31/1000 [01:23<43:37,  2.70s/it]

train loss : 0.009850291251403633


  4%|▍         | 41/1000 [01:50<43:14,  2.71s/it]

train loss : 0.007926718303004565


  5%|▌         | 51/1000 [02:18<43:02,  2.72s/it]

train loss : 0.006812760228427576


  6%|▌         | 61/1000 [02:45<42:29,  2.72s/it]

train loss : 0.0059693687004756905


  7%|▋         | 71/1000 [03:12<41:56,  2.71s/it]

train loss : 0.005369925528489195


  8%|▊         | 81/1000 [03:39<41:32,  2.71s/it]

train loss : 0.004878321036359675


  9%|▉         | 91/1000 [04:06<41:10,  2.72s/it]

train loss : 0.0045395614753540115


 10%|█         | 101/1000 [04:33<40:04,  2.67s/it]

train loss : 0.004157194351447658


 11%|█         | 111/1000 [05:00<39:43,  2.68s/it]

train loss : 0.003966465432833581


 12%|█▏        | 121/1000 [05:27<39:39,  2.71s/it]

train loss : 0.00373618696556983


 13%|█▎        | 131/1000 [05:54<39:21,  2.72s/it]

train loss : 0.0035337545165648166


 14%|█▍        | 141/1000 [06:21<38:54,  2.72s/it]

train loss : 0.0033796570704422244


 15%|█▌        | 151/1000 [06:48<38:11,  2.70s/it]

train loss : 0.0032901710998403896


 16%|█▌        | 161/1000 [07:15<37:23,  2.67s/it]

train loss : 0.0030956453264885902


 17%|█▋        | 171/1000 [07:42<37:09,  2.69s/it]

train loss : 0.0030141202674175893


 18%|█▊        | 181/1000 [08:09<37:00,  2.71s/it]

train loss : 0.0029343197924770296


 18%|█▊        | 183/1000 [08:14<36:46,  2.70s/it]

KeyboardInterrupt: 