# 1

In [6]:
import tensorflow as tf
import numpy as np
import tensorflow.keras.layers as k1
# policy & value via Keras Model API
class ProbabilityDistribution(tf.keras.Model):
    def call(self, logits): # logits: [batch_size, num_classes]
        # sample a random categorical action from given logits
        return tf.squeeze(tf.random.categorical(logits=logits, num_samples=1), axis=1)
        # return of tf.random.categorical(): [batch_size, num_samples]
        # return of tf.squeeze(logits, num_samples): Removes dimensions of size 1 from the shape of a tensor. 
            # logits: 2-D Tensor with shape [batch_size, num_classes]. Each slice [i, :] represents the unnormalized log-probabilities for all classes.
                # log probabilities: https://ratsgo.github.io/deep%20learning/2017/09/24/loss/
                    # MNIST를 예로 들면, 입력값 X는 (mini_batch=256, 24*24=784)의 모양
                    # theta 네트워크에 대하여, 각각의 입력 xi(i(0~253), :(784))에 대한
                    # yi(0~9)의 로그 확률을 나타낸다.
        # 즉, tf.squeeze(tf.random.categorical())는 batch 속 하나의 입력 데이터가 어떤 클래스에 속할지를 확률로 나타낸다.

class Model(tf.keras.Model):
    def __init__(self, num_actions):
        super().__init__('mlp_policy')
        # no tf.get_variable(), just simple Keras API
        self.hidden1 = k1.Dense(128, activation='relu')
        self.hidden2 = k1.Dense(128, activation='relu')
        self.value = k1.Dense(1, name='value')
        # logits are unnormalized log probabilites
        self.logits = k1.Dense(num_actions, name='policy_logits')
        self.dist = ProbabilityDistribution()
        
    def call(self, inputs):
        # inputs is a numpy array, convert to Tensor
        x = tf.convert_to_tensor(inputs, dtype=tf.float32)
        # separate hidden layers from the same input tensor
        hidden_logs = self.hidden1(x)
        hidden_vals = self.hidden2(x)
        return self.logits(hidden_logs), self.value(hidden_vals)
    
    def action_value(self, obs):
        # executes call() under the hood
        logits, value = self.predict(obs)
        action = self.dist.predict(logits)
        # a simpler option, will become clear later wht we don't use it
        # action = tf.random.categorical(logits, 1)
        return np.squeeze(action, axis=-1), np.squeeze(value, axis=-1)
import gym
env = gym.make('CartPole-v0')
model = Model(num_actions=env.action_space.n)
obs = env.reset()
# no feed_dict or tf.Session() needed at all
action, value = model.action_value(obs[None, :])
print(action, value) # [1]
# Random Agent
class A2CAgent:
    def __init__(self, model):
        self.model = model
    def test(self, env, render=True):
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward
    
agent = A2CAgent(model)
reward_sum = agent.test(env)
print("%d out of 200" % reward_sum)

In [12]:
# Loss / Objective Function
# In actor-critic we train on three objectives: improving policy with advantage weighted gradients plus entropy maximization, and minizing value estimate errors.
import tensorflow.keras.losses as kls
import tensorflow.keras.optimizers as ko

class A2CAgent:
    def __init__(self, model):
        # hyperparameters for loss terms
        self.params = {'value': 0.5, 'entropy': 0.0001}
        self.model = model
        self.model.compile(optimizer = ko.RMSprop(lr=0.0007),
                          # define separate losses for policy logits and value estimate
                          loss = [self._logits_loss, self._value_loss]
                          )
        
    def test(self, env, render=True):
        # unchanged from previous section
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward
    
    def _value_loss(self, returns, value):
        # value loss is typically MSE between value estimates and returns
        return self.params['value']*kls.mean_squared_error(returns, value)
    
    def _logits_loss(self, acts_and_advs, logits):
        # a trick to input actions and advantages through same API
        actions, advantages = tf.split(acts_and_advs, 2, axis=-1)
        # sparse categorical CE loss obj that supports sample_weight arg on call()
        # from_logits argument ensures transformation into normalized probabilities
        weighted_sparse_ce = kls.SparseCategoricalCrossentropy(from_logits=True)
        # policy loss is defined by policy gradients, weighted by advantages
        # note: we only calculate the loss on the actions we've actually taken
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)
        # entropy loss can be calculated via CE over itself
        entropy_loss = kls.categorical_crossentropy(logits, logits, from_logits=True)
        # here signs are flipped because optimizer minimizes
        return policy_loss - self.params['entropy']*entropy_loss


In [13]:
class A2CAgent:
    def __init__(self, model):
        # hyperparameters for loss terms
        self.params = {'value': 0.5, 'entropy': 0.0001, 'gamma': 0.99}
        # unchanged from previous section
        self.model = model
        self.model.compile(optimizer = ko.RMSprop(lr=0.0007),
                          # define separate losses for policy logits and value estimate
                          loss = [self._logits_loss, self._value_loss]
                          )
    def train(self, env, batch_sz=32, updates=1000):
        # storage helpers for a single batch of data
        actions = np.empty((batch_sz,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_sz))
        observations = np.empty((batch_sz,) + env.observation_space.shape)
        # training loop: collect samples, send to optimizer, repeat updates times
        ep_rews = [0.0]
        next_obs = env.reset()
        for update in range(updates):
            for step in range(batch_sz):
                observations[step] = next_obs.copy()
                actions[step], values[step] = self.model.action_value(next_obs[None, :])
                next_obs, rewards[step], dones[step], _ = env.step(actions[step])

                ep_rews[-1] += rewards[step]
                if dones[step]:
                    ep_rews.append(0.0)
                    next_obs = env.reset()

            _, next_value = self.model.action_value(next_obs[None, :])
            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # a trick to input actions and advantages through same API
            acts_and_advs = np.concatenate([actions[:, None], advs[:, None]], axis=-1)
            # performs a full training step on the collected batch
            # note: no need to mess around with gradients, Keras API handles it
            losses = self.model.train_on_batch(observations, [acts_and_advs, returns])
        return ep_rews

    def _returns_advantages(self, rewards, dones, values, next_value):
        # next_value is the bootstrap value estimate of a future state (the critic)
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)
        # returns are calculated as discounted sum of future rewards
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.params['gamma'] * returns[t+1] * (1-dones[t])
        returns = returns[:-1]
        # advantages are returns - baseline, value estimates in our case
        advantages = returns - values
        return returns, advantages

    def test(self, env, render=True):
        # unchanged from previous section
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward
    
    def _value_loss(self, returns, value):
        # value loss is typically MSE between value estimates and returns
        return self.params['value']*kls.mean_squared_error(returns, value)
    
    def _logits_loss(self, acts_and_advs, logits):
        # a trick to input actions and advantages through same API
        actions, advantages = tf.split(acts_and_advs, 2, axis=-1)
        # sparse categorical CE loss obj that supports sample_weight arg on call()
        # from_logits argument ensures transformation into normalized probabilities
        weighted_sparse_ce = kls.SparseCategoricalCrossentropy(from_logits=True)
        # policy loss is defined by policy gradients, weighted by advantages
        # note: we only calculate the loss on the actions we've actually taken
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)
        # entropy loss can be calculated via CE over itself
        entropy_loss = kls.categorical_crossentropy(logits, logits, from_logits=True)
        # here signs are flipped because optimizer minimizes
        return policy_loss - self.params['entropy']*entropy_loss


In [14]:
env = gym.make('CartPole-v0')
model = Model(num_actions=env.action_space.n)
obs = env.reset()

agent = A2CAgent(model)
reward_sum = agent.test(env)
print("%d out of 200" % reward_sum)

rewards_history = agent.train(env)
print("Finished training, testing...")
print("%d out of 200" % agent.test(env)) # 200 out of 200

20 out of 200
Finished training, testing...
200 out of 200


# 2

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD, Adadelta, Adam
from time import sleep
import numpy as np
import random
import gym

# Environment
env = gym.make('CartPole-v1').env
inputCount = env.observation_space.shape[0]
actionsCount = env.action_space.n

# Neural Network
model = Sequential()
model.add(Dense(24, input_dim=inputCount, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(actionsCount, activation='linear'))
model.compile(loss='mse', optimizer=Adam(), metrics=['mae'])

# Load weights
# model.load_weights("weights.h5")

# Hyperparameters
gamma = 1.0
epsilon = 1.0
epsilonMin = 0.01
epsilonDecay = 0.999
episodes = 5000

# Memory (Remember & Replay)
memory = []
batch_size = 64
memoryMax = 50000

# Training
for e in range(episodes):
    s = env.reset()
    s = np.array([s])

    for time in range(500):
        # Act greedy sometimes
        if np.random.rand() <= epsilon:
            a = random.randrange(actionsCount)
        else:
            a = np.argmax(model.predict(s))

        newS, r, done, _ = env.step(a)
        newS = np.array([newS])
        target = r + gamma * np.max(model.predict(newS))
        target_f = model.predict(s)[0]
        target_f[a] = target
        model.fit(s, target_f.reshape(-1, actionsCount), epochs=1, verbose=0)
        memory.append((s, a, r, newS, done))
        s = newS

        # free first items in memory
        if len(memory)==memoryMax:
            del memory[:5000]

        if done:
            print("episode: {}/{}, score: {}".format(e, episodes, time))
            break

    if epsilon > epsilonMin:
        epsilon *= epsilonDecay

    # Replay memory
    if len(memory) > batch_size:
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
              target = reward + gamma * np.max(model.predict(next_state))

            target_f = model.predict(state)[0]
            target_f[action] = target
            model.fit(state, target_f.reshape(-1, actionsCount), epochs=1, verbose=0)


# Save weights
model.save_weights("weights.h5")

# Play game
print("\nPlaying Game...")
sleep(1)

s = env.reset()
done = False
while not done:
    env.render()
    a = np.argmax(model.predict(np.array([s])))
    newS, r, done, _ = env.step(a)
    s = newS
    
# https://raw.githubusercontent.com/OmarAflak/CartPole-DQN/master/dqn.py

Instructions for updating:
Use tf.cast instead.
episode: 0/5000, score: 44
episode: 1/5000, score: 11
episode: 2/5000, score: 16
episode: 3/5000, score: 9
episode: 4/5000, score: 10
episode: 5/5000, score: 25
episode: 6/5000, score: 16
episode: 7/5000, score: 11
episode: 8/5000, score: 20
episode: 9/5000, score: 17
episode: 10/5000, score: 23
episode: 11/5000, score: 23
episode: 12/5000, score: 26
episode: 13/5000, score: 9
episode: 14/5000, score: 19
episode: 15/5000, score: 16
episode: 16/5000, score: 15
episode: 17/5000, score: 28
episode: 18/5000, score: 9
episode: 19/5000, score: 20
episode: 20/5000, score: 20
episode: 21/5000, score: 17
episode: 22/5000, score: 24
episode: 23/5000, score: 24
episode: 24/5000, score: 12
episode: 25/5000, score: 33
episode: 26/5000, score: 19
episode: 27/5000, score: 22
episode: 28/5000, score: 24
episode: 29/5000, score: 9
episode: 30/5000, score: 34
episode: 31/5000, score: 11
episode: 32/5000, score: 22
episode: 33/5000, score: 47
episode: 34/50

episode: 278/5000, score: 11
episode: 279/5000, score: 22
episode: 280/5000, score: 11
episode: 281/5000, score: 32
episode: 282/5000, score: 16
episode: 283/5000, score: 12
episode: 284/5000, score: 15
episode: 285/5000, score: 10
episode: 286/5000, score: 36
episode: 287/5000, score: 36
episode: 288/5000, score: 11
episode: 289/5000, score: 9
episode: 290/5000, score: 25
episode: 291/5000, score: 11
episode: 292/5000, score: 37
episode: 293/5000, score: 14
episode: 294/5000, score: 10
episode: 295/5000, score: 12
episode: 296/5000, score: 14
episode: 297/5000, score: 25
episode: 298/5000, score: 21
episode: 299/5000, score: 19
episode: 300/5000, score: 13
episode: 301/5000, score: 8
episode: 302/5000, score: 14
episode: 303/5000, score: 14
episode: 304/5000, score: 20
episode: 305/5000, score: 18
episode: 306/5000, score: 15
episode: 307/5000, score: 14
episode: 308/5000, score: 18
episode: 309/5000, score: 11
episode: 310/5000, score: 23
episode: 311/5000, score: 13
episode: 312/500

episode: 562/5000, score: 8
episode: 563/5000, score: 11
episode: 564/5000, score: 10
episode: 565/5000, score: 10
episode: 566/5000, score: 10
episode: 567/5000, score: 10
episode: 568/5000, score: 14
episode: 569/5000, score: 15
episode: 570/5000, score: 14
episode: 571/5000, score: 13
episode: 572/5000, score: 18
episode: 573/5000, score: 29
episode: 574/5000, score: 7
episode: 575/5000, score: 9
episode: 576/5000, score: 11
episode: 577/5000, score: 10
episode: 578/5000, score: 9
episode: 579/5000, score: 9
episode: 580/5000, score: 21
episode: 581/5000, score: 12
episode: 582/5000, score: 16
episode: 583/5000, score: 8
episode: 584/5000, score: 17
episode: 585/5000, score: 21
episode: 586/5000, score: 16
episode: 587/5000, score: 9
episode: 588/5000, score: 12
episode: 589/5000, score: 7
episode: 590/5000, score: 12
episode: 591/5000, score: 16
episode: 592/5000, score: 9
episode: 593/5000, score: 17
episode: 594/5000, score: 11
episode: 595/5000, score: 8
episode: 596/5000, score

episode: 848/5000, score: 53
episode: 849/5000, score: 94
episode: 850/5000, score: 47
episode: 851/5000, score: 66
episode: 852/5000, score: 46
episode: 853/5000, score: 40
episode: 854/5000, score: 53
episode: 855/5000, score: 142
episode: 856/5000, score: 109
episode: 857/5000, score: 28
episode: 858/5000, score: 36
episode: 859/5000, score: 22
episode: 860/5000, score: 143
episode: 861/5000, score: 105
episode: 862/5000, score: 51
episode: 863/5000, score: 24
episode: 864/5000, score: 10
episode: 865/5000, score: 42
episode: 866/5000, score: 51
episode: 867/5000, score: 59
episode: 868/5000, score: 74
episode: 869/5000, score: 120
episode: 870/5000, score: 46
episode: 871/5000, score: 59
episode: 872/5000, score: 54
episode: 873/5000, score: 35
episode: 874/5000, score: 20
episode: 875/5000, score: 40
episode: 876/5000, score: 30
episode: 877/5000, score: 56
episode: 878/5000, score: 42
episode: 879/5000, score: 30
episode: 880/5000, score: 18
episode: 881/5000, score: 56
episode: 

episode: 1125/5000, score: 66
episode: 1126/5000, score: 50
episode: 1127/5000, score: 47
episode: 1128/5000, score: 34
episode: 1129/5000, score: 120
episode: 1130/5000, score: 103
episode: 1131/5000, score: 80
episode: 1132/5000, score: 55
episode: 1133/5000, score: 54
episode: 1134/5000, score: 94
episode: 1135/5000, score: 62
episode: 1136/5000, score: 77
episode: 1137/5000, score: 280
episode: 1138/5000, score: 57
episode: 1139/5000, score: 85
episode: 1140/5000, score: 47
episode: 1141/5000, score: 73
episode: 1142/5000, score: 108
episode: 1143/5000, score: 59
episode: 1144/5000, score: 131
episode: 1145/5000, score: 44
episode: 1146/5000, score: 49
episode: 1147/5000, score: 62
episode: 1148/5000, score: 37
episode: 1149/5000, score: 145
episode: 1150/5000, score: 60
episode: 1151/5000, score: 45
episode: 1152/5000, score: 91
episode: 1153/5000, score: 55
episode: 1154/5000, score: 52
episode: 1155/5000, score: 80
episode: 1156/5000, score: 58
episode: 1157/5000, score: 112
epi

episode: 1397/5000, score: 83
episode: 1398/5000, score: 172
episode: 1399/5000, score: 70
episode: 1400/5000, score: 104
episode: 1401/5000, score: 71
episode: 1402/5000, score: 85
episode: 1403/5000, score: 83
episode: 1404/5000, score: 66
episode: 1405/5000, score: 98
episode: 1406/5000, score: 63
episode: 1407/5000, score: 52
episode: 1408/5000, score: 88
episode: 1409/5000, score: 106
episode: 1410/5000, score: 294
episode: 1411/5000, score: 159
episode: 1412/5000, score: 40
episode: 1413/5000, score: 93
episode: 1414/5000, score: 67
episode: 1415/5000, score: 73
episode: 1416/5000, score: 61
episode: 1417/5000, score: 98
episode: 1418/5000, score: 74
episode: 1419/5000, score: 79
episode: 1420/5000, score: 57
episode: 1421/5000, score: 58
episode: 1422/5000, score: 57
episode: 1423/5000, score: 59
episode: 1424/5000, score: 58
episode: 1425/5000, score: 153
episode: 1426/5000, score: 58
episode: 1427/5000, score: 69
episode: 1428/5000, score: 47
episode: 1429/5000, score: 171
epi

episode: 1667/5000, score: 150
episode: 1668/5000, score: 54
episode: 1669/5000, score: 243
episode: 1670/5000, score: 83
episode: 1671/5000, score: 94
episode: 1672/5000, score: 68
episode: 1673/5000, score: 83
episode: 1674/5000, score: 112
episode: 1675/5000, score: 118
episode: 1676/5000, score: 117
episode: 1677/5000, score: 97
episode: 1678/5000, score: 162
episode: 1679/5000, score: 84
episode: 1680/5000, score: 77
episode: 1681/5000, score: 50
episode: 1682/5000, score: 148
episode: 1683/5000, score: 150
episode: 1684/5000, score: 93
episode: 1685/5000, score: 93
episode: 1686/5000, score: 90
episode: 1687/5000, score: 103
episode: 1688/5000, score: 71
episode: 1689/5000, score: 65
episode: 1690/5000, score: 80
episode: 1691/5000, score: 121
episode: 1692/5000, score: 88
episode: 1693/5000, score: 150
episode: 1694/5000, score: 204
episode: 1695/5000, score: 76
episode: 1696/5000, score: 98
episode: 1697/5000, score: 136
episode: 1698/5000, score: 167
episode: 1699/5000, score:

episode: 1937/5000, score: 33
episode: 1938/5000, score: 24
episode: 1939/5000, score: 24
episode: 1940/5000, score: 19
episode: 1941/5000, score: 32
episode: 1942/5000, score: 7
episode: 1943/5000, score: 8
episode: 1944/5000, score: 9
episode: 1945/5000, score: 8
episode: 1946/5000, score: 9
episode: 1947/5000, score: 9
episode: 1948/5000, score: 8
episode: 1949/5000, score: 8
episode: 1950/5000, score: 10
episode: 1951/5000, score: 10
episode: 1952/5000, score: 9
episode: 1953/5000, score: 7
episode: 1954/5000, score: 9
episode: 1955/5000, score: 9
episode: 1956/5000, score: 9
episode: 1957/5000, score: 8
episode: 1958/5000, score: 12
episode: 1959/5000, score: 9
episode: 1960/5000, score: 9
episode: 1961/5000, score: 11
episode: 1962/5000, score: 9
episode: 1963/5000, score: 9
episode: 1964/5000, score: 13
episode: 1965/5000, score: 9
episode: 1966/5000, score: 9
episode: 1967/5000, score: 11
episode: 1968/5000, score: 9
episode: 1969/5000, score: 9
episode: 1970/5000, score: 9
epi

episode: 2217/5000, score: 10
episode: 2218/5000, score: 7
episode: 2219/5000, score: 11
episode: 2220/5000, score: 9
episode: 2221/5000, score: 9
episode: 2222/5000, score: 8
episode: 2223/5000, score: 10
episode: 2224/5000, score: 8
episode: 2225/5000, score: 11
episode: 2226/5000, score: 7
episode: 2227/5000, score: 10
episode: 2228/5000, score: 10
episode: 2229/5000, score: 7
episode: 2230/5000, score: 8
episode: 2231/5000, score: 8
episode: 2232/5000, score: 8
episode: 2233/5000, score: 9
episode: 2234/5000, score: 9
episode: 2235/5000, score: 8
episode: 2236/5000, score: 10
episode: 2237/5000, score: 8
episode: 2238/5000, score: 11
episode: 2239/5000, score: 9
episode: 2240/5000, score: 10
episode: 2241/5000, score: 10
episode: 2242/5000, score: 10
episode: 2243/5000, score: 9
episode: 2244/5000, score: 9
episode: 2245/5000, score: 8
episode: 2246/5000, score: 9
episode: 2247/5000, score: 8
episode: 2248/5000, score: 9
episode: 2249/5000, score: 8
episode: 2250/5000, score: 10
ep

episode: 2499/5000, score: 9
episode: 2500/5000, score: 9
episode: 2501/5000, score: 9
episode: 2502/5000, score: 9
episode: 2503/5000, score: 9
episode: 2504/5000, score: 9
episode: 2505/5000, score: 10
episode: 2506/5000, score: 8
episode: 2507/5000, score: 8
episode: 2508/5000, score: 9
episode: 2509/5000, score: 11
episode: 2510/5000, score: 7
episode: 2511/5000, score: 10
episode: 2512/5000, score: 11
episode: 2513/5000, score: 9
episode: 2514/5000, score: 8
episode: 2515/5000, score: 10
episode: 2516/5000, score: 8
episode: 2517/5000, score: 8
episode: 2518/5000, score: 9
episode: 2519/5000, score: 9
episode: 2520/5000, score: 9
episode: 2521/5000, score: 11
episode: 2522/5000, score: 11
episode: 2523/5000, score: 9
episode: 2524/5000, score: 8
episode: 2525/5000, score: 8
episode: 2526/5000, score: 9
episode: 2527/5000, score: 9
episode: 2528/5000, score: 9
episode: 2529/5000, score: 9
episode: 2530/5000, score: 8
episode: 2531/5000, score: 10
episode: 2532/5000, score: 8
episod

episode: 2781/5000, score: 8
episode: 2782/5000, score: 8
episode: 2783/5000, score: 8
episode: 2784/5000, score: 8
episode: 2785/5000, score: 9
episode: 2786/5000, score: 9
episode: 2787/5000, score: 9
episode: 2788/5000, score: 8
episode: 2789/5000, score: 9
episode: 2790/5000, score: 11
episode: 2791/5000, score: 9
episode: 2792/5000, score: 8
episode: 2793/5000, score: 9
episode: 2794/5000, score: 9
episode: 2795/5000, score: 9
episode: 2796/5000, score: 7
episode: 2797/5000, score: 9
episode: 2798/5000, score: 10
episode: 2799/5000, score: 10
episode: 2800/5000, score: 9
episode: 2801/5000, score: 7
episode: 2802/5000, score: 9
episode: 2803/5000, score: 9
episode: 2804/5000, score: 7
episode: 2805/5000, score: 8
episode: 2806/5000, score: 8
episode: 2807/5000, score: 11
episode: 2808/5000, score: 7
episode: 2809/5000, score: 9
episode: 2810/5000, score: 9
episode: 2811/5000, score: 8
episode: 2812/5000, score: 10
episode: 2813/5000, score: 7
episode: 2814/5000, score: 10
episode:

episode: 3064/5000, score: 9
episode: 3065/5000, score: 8
episode: 3066/5000, score: 8
episode: 3067/5000, score: 7
episode: 3068/5000, score: 8
episode: 3069/5000, score: 8
episode: 3070/5000, score: 7
episode: 3071/5000, score: 9
episode: 3072/5000, score: 8
episode: 3073/5000, score: 8
episode: 3074/5000, score: 9
episode: 3075/5000, score: 8
episode: 3076/5000, score: 8
episode: 3077/5000, score: 8
episode: 3078/5000, score: 8
episode: 3079/5000, score: 7
episode: 3080/5000, score: 9
episode: 3081/5000, score: 8
episode: 3082/5000, score: 8
episode: 3083/5000, score: 9
episode: 3084/5000, score: 8
episode: 3085/5000, score: 9
episode: 3086/5000, score: 9
episode: 3087/5000, score: 9
episode: 3088/5000, score: 9
episode: 3089/5000, score: 8
episode: 3090/5000, score: 7
episode: 3091/5000, score: 7
episode: 3092/5000, score: 9
episode: 3093/5000, score: 9
episode: 3094/5000, score: 9
episode: 3095/5000, score: 8
episode: 3096/5000, score: 9
episode: 3097/5000, score: 7
episode: 3098/

episode: 3347/5000, score: 8
episode: 3348/5000, score: 9
episode: 3349/5000, score: 8
episode: 3350/5000, score: 9
episode: 3351/5000, score: 9
episode: 3352/5000, score: 9
episode: 3353/5000, score: 8
episode: 3354/5000, score: 10
episode: 3355/5000, score: 9
episode: 3356/5000, score: 11
episode: 3357/5000, score: 10
episode: 3358/5000, score: 8
episode: 3359/5000, score: 9
episode: 3360/5000, score: 10
episode: 3361/5000, score: 8
episode: 3362/5000, score: 9
episode: 3363/5000, score: 9
episode: 3364/5000, score: 8
episode: 3365/5000, score: 10
episode: 3366/5000, score: 10
episode: 3367/5000, score: 7
episode: 3368/5000, score: 7
episode: 3369/5000, score: 8
episode: 3370/5000, score: 8
episode: 3371/5000, score: 8
episode: 3372/5000, score: 9
episode: 3373/5000, score: 8
episode: 3374/5000, score: 9
episode: 3375/5000, score: 9
episode: 3376/5000, score: 8
episode: 3377/5000, score: 8
episode: 3378/5000, score: 9
episode: 3379/5000, score: 9
episode: 3380/5000, score: 10
episode

episode: 3629/5000, score: 9
episode: 3630/5000, score: 8
episode: 3631/5000, score: 9
episode: 3632/5000, score: 9
episode: 3633/5000, score: 9
episode: 3634/5000, score: 9
episode: 3635/5000, score: 7
episode: 3636/5000, score: 10
episode: 3637/5000, score: 8
episode: 3638/5000, score: 9
episode: 3639/5000, score: 7
episode: 3640/5000, score: 8
episode: 3641/5000, score: 9
episode: 3642/5000, score: 9
episode: 3643/5000, score: 8
episode: 3644/5000, score: 9
episode: 3645/5000, score: 8
episode: 3646/5000, score: 8
episode: 3647/5000, score: 7
episode: 3648/5000, score: 9
episode: 3649/5000, score: 9
episode: 3650/5000, score: 9
episode: 3651/5000, score: 8
episode: 3652/5000, score: 8
episode: 3653/5000, score: 9
episode: 3654/5000, score: 9
episode: 3655/5000, score: 9
episode: 3656/5000, score: 8
episode: 3657/5000, score: 7
episode: 3658/5000, score: 9
episode: 3659/5000, score: 8
episode: 3660/5000, score: 10
episode: 3661/5000, score: 9
episode: 3662/5000, score: 9
episode: 366

episode: 3911/5000, score: 17
episode: 3912/5000, score: 26
episode: 3913/5000, score: 104
episode: 3914/5000, score: 212
episode: 3915/5000, score: 41
episode: 3916/5000, score: 33
episode: 3917/5000, score: 33
episode: 3918/5000, score: 20
episode: 3919/5000, score: 52
episode: 3920/5000, score: 19
episode: 3921/5000, score: 10
episode: 3922/5000, score: 8
episode: 3923/5000, score: 8
episode: 3924/5000, score: 13
episode: 3925/5000, score: 15
episode: 3926/5000, score: 16
episode: 3927/5000, score: 95
episode: 3928/5000, score: 83
episode: 3929/5000, score: 91
episode: 3930/5000, score: 87
episode: 3931/5000, score: 188
episode: 3932/5000, score: 94
episode: 3933/5000, score: 62
episode: 3934/5000, score: 222
episode: 3935/5000, score: 62
episode: 3936/5000, score: 38
episode: 3937/5000, score: 32
episode: 3938/5000, score: 41
episode: 3939/5000, score: 43
episode: 3940/5000, score: 54
episode: 3941/5000, score: 196
episode: 3942/5000, score: 53
episode: 3943/5000, score: 58
episode

episode: 4184/5000, score: 72
episode: 4185/5000, score: 55
episode: 4186/5000, score: 48
episode: 4187/5000, score: 40
episode: 4188/5000, score: 47
episode: 4189/5000, score: 97
episode: 4190/5000, score: 76
episode: 4191/5000, score: 63
episode: 4192/5000, score: 90
episode: 4193/5000, score: 55
episode: 4195/5000, score: 47
episode: 4196/5000, score: 57
episode: 4197/5000, score: 104
episode: 4198/5000, score: 53
episode: 4199/5000, score: 78
episode: 4200/5000, score: 58
episode: 4201/5000, score: 55
episode: 4202/5000, score: 244
episode: 4203/5000, score: 38
episode: 4204/5000, score: 81
episode: 4205/5000, score: 81
episode: 4206/5000, score: 57
episode: 4207/5000, score: 71
episode: 4208/5000, score: 71
episode: 4209/5000, score: 457
episode: 4211/5000, score: 39
episode: 4212/5000, score: 279
episode: 4213/5000, score: 107
episode: 4214/5000, score: 96
episode: 4215/5000, score: 208
episode: 4216/5000, score: 50
episode: 4217/5000, score: 234
episode: 4218/5000, score: 92
epi

episode: 4472/5000, score: 97
episode: 4473/5000, score: 133
episode: 4474/5000, score: 163
episode: 4475/5000, score: 101
episode: 4476/5000, score: 69
episode: 4477/5000, score: 67
episode: 4478/5000, score: 297
episode: 4479/5000, score: 67
episode: 4480/5000, score: 71
episode: 4481/5000, score: 231
episode: 4482/5000, score: 95
episode: 4483/5000, score: 89
episode: 4484/5000, score: 269
episode: 4485/5000, score: 307
episode: 4486/5000, score: 89
episode: 4487/5000, score: 83
episode: 4488/5000, score: 91
episode: 4489/5000, score: 90
episode: 4490/5000, score: 91
episode: 4491/5000, score: 124
episode: 4492/5000, score: 69
episode: 4493/5000, score: 73
episode: 4494/5000, score: 71
episode: 4495/5000, score: 75
episode: 4496/5000, score: 212
episode: 4497/5000, score: 195
episode: 4498/5000, score: 168
episode: 4499/5000, score: 285
episode: 4501/5000, score: 77
episode: 4502/5000, score: 82
episode: 4503/5000, score: 65
episode: 4504/5000, score: 58
episode: 4505/5000, score: 6

episode: 4770/5000, score: 193
episode: 4771/5000, score: 118
episode: 4776/5000, score: 118
episode: 4778/5000, score: 138
episode: 4781/5000, score: 283
episode: 4783/5000, score: 93
episode: 4784/5000, score: 75
episode: 4785/5000, score: 84
episode: 4786/5000, score: 83
episode: 4787/5000, score: 87
episode: 4788/5000, score: 86
episode: 4789/5000, score: 90
episode: 4790/5000, score: 110
episode: 4791/5000, score: 86
episode: 4792/5000, score: 101
episode: 4793/5000, score: 111
episode: 4794/5000, score: 168
episode: 4795/5000, score: 229
episode: 4796/5000, score: 160
episode: 4798/5000, score: 86
episode: 4799/5000, score: 107
episode: 4800/5000, score: 155
episode: 4801/5000, score: 99
episode: 4802/5000, score: 113
episode: 4803/5000, score: 150
episode: 4804/5000, score: 127
episode: 4805/5000, score: 110
episode: 4806/5000, score: 99
episode: 4807/5000, score: 112
episode: 4808/5000, score: 94
episode: 4809/5000, score: 95
episode: 4810/5000, score: 134
episode: 4811/5000, s

KeyboardInterrupt: 

# 3

In [1]:
# OpenGym CartPole-v0
# -------------------
#
# This code demonstrates use a full DQN implementation
# to solve OpenGym CartPole-v0 problem.
#
# Made as part of blog series Let's make a DQN, available at: 
# https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/
# 
# author: Jaromir Janisch, 2016

import random, numpy, math, gym, sys
from keras import backend as K

import tensorflow as tf

#----------
HUBER_LOSS_DELTA = 1.0
LEARNING_RATE = 0.00025

#----------
def huber_loss(y_true, y_pred):
    err = y_true - y_pred

    cond = K.abs(err) < HUBER_LOSS_DELTA
    L2 = 0.5 * K.square(err)
    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    loss = tf.where(cond, L2, L1)   # Keras does not cover where function in tensorflow :-(

    return K.mean(loss)

#-------------------- BRAIN ---------------------------
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *

class Brain:
    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        self.model_ = self._createModel() 

    def _createModel(self):
        model = Sequential()

        model.add(Dense(units=64, activation='relu', input_dim=stateCnt))
        model.add(Dense(units=actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        self.model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)

    def predict(self, s, target=False):
        if target:
            return self.model_.predict(s)
        else:
            return self.model.predict(s)

    def predictOne(self, s, target=False):
        return self.predict(s.reshape(1, self.stateCnt), target=target).flatten()

    def updateTargetModel(self):
        self.model_.set_weights(self.model.get_weights())

#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ )
    samples = []

    def __init__(self, capacity):
        self.capacity = capacity

    def add(self, sample):
        self.samples.append(sample)        

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

    def isFull(self):
        return len(self.samples) >= self.capacity

#-------------------- AGENT ---------------------------
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64

GAMMA = 0.99

MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001      # speed of decay

UPDATE_TARGET_FREQUENCY = 1000

class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)
        
    def act(self, s):
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        else:
            return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)        

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # debug the Q function in poin S
        if self.steps % 100 == 0:
            S = numpy.array([-0.01335408, -0.04600273, -0.00677248, 0.01517507])
            pred = agent.brain.predictOne(S)
            print(pred[0])
            sys.stdout.flush()

        # slowly decrease Epsilon based on our eperience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):    
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ o[0] for o in batch ])
        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))
        
        for i in range(batchLen):
            o = batch[i]
            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]
            
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)


class RandomAgent:
    memory = Memory(MEMORY_CAPACITY)

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt

    def act(self, s):
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)

    def replay(self):
        pass

#-------------------- ENVIRONMENT ---------------------
class Environment:
    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        s = self.env.reset()
        R = 0 

        while True:            
            # self.env.render()

            a = agent.act(s)

            s_, r, done, info = self.env.step(a)

            if done: # terminal state
                s_ = None

            agent.observe( (s, a, r, s_) )
            agent.replay()            

            s = s_
            R += r

            if done:
                break

        # print("Total reward:", R)

#-------------------- MAIN ----------------------------
PROBLEM = 'CartPole-v0'
env = Environment(PROBLEM)

stateCnt  = env.env.observation_space.shape[0]
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    while randomAgent.memory.isFull() == False:
        env.run(randomAgent)

    agent.memory.samples = randomAgent.memory.samples
    randomAgent = None

    while True:
        env.run(agent)
finally:
    agent.brain.model.save("cartpole-dqn.h5")
    
# https://raw.githubusercontent.com/jaromiru/AI-blog/master/CartPole-DQN.py

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
-0.015657602
0.0783036
0.18130182
0.31887686
0.47881737
0.6558297
0.80548745
0.89265084
0.92815
0.93315613
0.93050325
1.2500848
1.5623959
1.8389462
1.9212722
1.9657929
1.9893937
1.9870706
1.9900262
1.9973521
1.9925257
2.4352868
2.8087146
2.913518
2.9719424
2.9826565
3.0041914
3.0251684
3.0042512
3.0160487
3.0173023
3.5468283
3.8865848
3.9380972
3.9836948
3.9973202
4.018398
4.0322356
4.0041914
4.01945
4.025466
4.6263885
4.9327765
4.9686217
4.9912586
4.992047
5.011396
5.025155
5.0248094
5.029554
4.9973793
5.661647
5.927794
5.9860196
5.9907165
5.990317
6.0215993
6.0080056
6.014282
5.9932747
6.018365
6.7257795
6.9457893
6.96747
6.982707
7.0028124
6.9748363
6.985699
6.9891853
7.0022964
6.9796076
7.7260094
7.8994164
7.938446
7.9407754
7.9576344
7.9750786
7.936467
7.9310284
7.9413586
7.954927
8.723637
8.865764
8.902823
8.889548
8.906164
8.873818
8.891449
8.872156
8.890643
8.883745
9.684632
9.796808
9.839751
9.830242
9.846

59.51991
59.501503
59.511963
59.469326
59.489086
59.4846
59.523525
59.517696
59.941067
59.98869
60.023117
60.02038
59.984444
59.993336
60.006943
60.020363
60.030552
60.009125
60.4671
60.4253
60.47276
60.468075
60.468857
60.45041
60.451256
60.515415
60.42937
60.49687
60.98348
60.946774
60.926327
60.949284
60.972908
60.966305
60.943066
60.934246
60.88365
60.91684
61.393723
61.404514
61.395596
61.379093
61.396492
61.362423
61.38581
61.37258
61.376595
61.368046
61.78213
61.764023
61.805775
61.779793
61.7765
61.785698
61.82384
61.81228
61.78654
61.801765
62.20765
62.238094
62.270752
62.257095
62.25036
62.23004
62.256264
62.24748
62.275364
62.21962
62.66165
62.60666
62.665855
62.650352
62.66555
62.64361
62.627907
62.636505
62.63668
62.61836
63.076473
63.07608
63.040123
63.084778
63.085743
63.05413
63.059162
63.060135
63.100086
63.076366
63.473427
63.507023
63.492584
63.51095
63.489433
63.4648
63.4966
63.483974
63.49347
63.494484
63.845627
63.858658
63.84903
63.8807
63.874577
63.92828
63.8924

86.22324
86.24481
86.26214
86.228424
86.21142
86.24149
86.261856
86.42199
86.4476
86.435844
86.4546
86.439316
86.44556
86.41787
86.43091
86.43386
86.41797
86.56291
86.55642
86.60116
86.58372
86.58216
86.59203
86.58914
86.57835
86.59575
86.5605
86.72858
86.74379
86.70907
86.72091
86.75198
86.7366
86.748184
86.74424
86.74
86.751114
86.91978
86.944695
86.93003
86.91043
86.93184
86.85529
86.92795
86.91033
86.891335
86.889336
87.07689
87.07055
87.04042
87.0663
87.038055
87.07272
87.0824
87.06071
87.073364
87.047516
87.22494
87.22683
87.23717
87.198975
87.24797
87.255104
87.218155
87.23243
87.26487
87.25332
87.42532
87.43797
87.445114
87.44491
87.42651
87.46325
87.453835
87.46558
87.45712
87.453156
87.61863
87.637505
87.6629
87.637276
87.64991
87.62779
87.64507
87.66236
87.65013
87.64099
87.83921
87.80089
87.80881
87.82325
87.82396
87.80735
87.80971
87.82191
87.83114
87.80296
87.91055
87.92029
87.94305
87.94972
87.91693
87.89372
87.92398
87.91848
87.9018
87.90673
88.07262
88.05105
88.05318
8

  % delta_t_median)


97.23662
97.22523
97.18774
97.209366
97.2344
97.33755
97.36018
97.37394
97.32065
97.33049
97.31367
97.28618
97.34599
97.33726
97.26244
97.41138
97.414085
97.36777
97.39147
97.34872
97.369125
97.40163
97.370255
97.38272
97.36951
97.49884
97.52806
97.51205
97.524025
97.500496
97.47919
97.46926
97.50816
97.47988
97.49754
97.66002
97.58205
97.55257
97.49806
97.55261
97.52968
97.56077
97.51446
97.54635
97.54748
97.6066
97.64458
97.59438
97.61711
97.58484
97.61595
97.61412
97.58209
97.59171
97.579735
97.75301
97.74754
97.68082
97.74784
97.71114
97.772224
97.726105
97.65864
97.7055
97.69413
97.832115
97.77776
97.77778
97.79534
97.766426
97.77194
97.754036
97.76744
97.767265
97.80473
97.91618
97.90576
97.944916
97.90074
97.938774
97.93769
97.89778
97.86239
97.92281
97.903366
98.07681
98.05581
98.030136
98.02804
97.99341
98.030624
98.028824
97.99537
98.00681
98.038605
98.18561
98.20317
98.15794
98.20084
98.20033
98.16484
98.17646
98.18513
98.17323
98.20067
98.358826
98.35611
98.323044
98.32445


101.668175
101.69732
101.6805
101.598816
101.663
101.63609
101.6439
101.67591
101.6084
101.64993
101.6168
101.62969
101.66015
101.64621
101.61357
101.64581
101.682785
101.68668
101.625206
101.697365
101.716484
101.62978
101.68584
101.696915
101.6749
101.64871
101.7196
101.75037
101.74071
101.720085
101.733795
101.72174
101.74134
101.70993
101.7329
101.73633
101.74627
101.80553
101.75983
101.75411
101.76573
101.753006
101.75547
101.764824
101.74761
101.77816
101.76358
101.73859
101.73699
101.81712
101.77712
101.77777
101.790184
101.77702
101.80305
101.7394
101.7726
101.779144
101.83269
101.7444
101.795944
101.78358
101.75181
101.82176
101.803215
101.82396
101.79984
101.83244
101.804474
101.82423
101.86541
101.848724
101.82041
101.853424
101.87034
101.8841
101.766624
101.80037
101.77002
101.8008
101.77251
101.84299
101.77851
101.787926
101.767715
101.8428
101.81733
101.837585
101.76393
101.841034
101.8667
101.85003
101.87716
101.85741
101.840195
101.84681
101.79878
101.79544
101.81352
10

102.74275
102.719154
102.75249
102.73069
102.72907
102.75411
102.73834
102.725525
102.72468
102.745804
102.70206
102.727936
102.70358
102.73358
102.73201
102.745575
102.717674
102.73001
102.720345
102.70641
102.72359
102.75946
102.71636
102.74685
102.73699
102.743866
102.71445
102.75574
102.75728
102.75176
102.77256
102.74454
102.78282
102.754745
102.777214
102.78545
102.74994
102.752464
102.79437
102.76564
102.75743
102.76312
102.816124
102.82078
102.780876
102.792694
102.77146
102.79975
102.78006
102.78003
102.77165
102.76597
102.80913
102.7721
102.82344
102.760765
102.739136
102.799164
102.82292
102.846344
102.884415
102.833405
102.82959
102.8066
102.805435
102.8116
102.843544
102.84188
102.831406
102.81171
102.8231
102.83596
102.84579
102.859505
102.834206
102.817696
102.8368
102.874435
102.81248
102.85177
102.79913
102.82496
102.830414
102.81177
102.82407
102.82381
102.77722
102.79942
102.81888
102.800995
102.835396
102.822845
102.855484
102.8173
102.77065
102.82278
102.815895
102

102.71016
102.69365
102.690025
102.640755
102.653885
102.615746
102.608894
102.62675
102.582306
102.61596
102.63515
102.64927
102.60606
102.63249
102.61233
102.615585
102.61358
102.62583
102.60081
102.66612
102.63942
102.67248
102.61784
102.598854
102.57618
102.58812
102.5774
102.54926
102.5686
102.60175
102.572235
102.578674
102.6021
102.60518
102.6013
102.62322
102.662994
102.55036
102.63369
102.57055
102.63356
102.60024
102.62858
102.58528
102.62901
102.609856
102.61481
102.605515
102.57949
102.63319
102.59831
102.61133
102.58107
102.571335
102.51538
102.55646
102.58253
102.52837
102.58463
102.56413
102.59516
102.572876
102.57921
102.56178
102.55932
102.555534
102.50217
102.510796
102.476715
102.54619
102.5301
102.539154
102.54786
102.55901
102.54627
102.56469
102.57092
102.54964
102.53881
102.513916
102.54442
102.57347
102.57175
102.50659
102.51204
102.53717
102.55922
102.51509
102.50718
102.5187
102.52932
102.56307
102.557915
102.52561
102.478065
102.476974
102.49733
102.47916
102

102.7341
102.78067
102.81813
102.77402
102.75956
102.76158
102.78279
102.747215
102.69374
102.74922
102.71199
102.7317
102.71663
102.72256
102.71318
102.723785
102.74969
102.75291
102.711464
102.73925
102.70318
102.736755
102.70944
102.67944
102.71852
102.707985
102.6772
102.7076
102.70863
102.67583
102.699554
102.68171
102.689255
102.6813
102.68489
102.66949
102.707565
102.68088
102.662575
102.66511
102.61214
102.6254
102.63383
102.68944
102.66681
102.6184
102.64202
102.66047
102.668304
102.63328
102.6322
102.64283
102.66317
102.64899
102.6509
102.650536
102.66768
102.64194
102.61882
102.62643
102.67514
102.644264
102.64081
102.67914
102.64509
102.66908
102.65261
102.6553
102.56693
102.64499
102.646454
102.64241
102.6474
102.61179
102.63428
102.58575
102.62197
102.66687
102.60871
102.65123
102.65206
102.61326
102.64966
102.647316
102.6377
102.64607
102.66967
102.6072
102.69852
102.65991
102.63484
102.65564
102.66813
102.604904
102.622604
102.63802
102.6437
102.64703
102.59772
102.6297

101.507515
101.583755
101.56107
101.53851
101.558975
101.55823
101.570206
101.50759
101.53211
101.49361
101.451035
101.48845
101.47721
101.489334
101.50467
101.47807
101.47189
101.5077
101.50634
101.47345
101.47334
101.475555
101.47191
101.46747
101.468506
101.48113
101.4661
101.48938
101.46199
101.44891
101.469154
101.42057
101.44101
101.47222
101.47621
101.44034
101.47622
101.46221
101.45683
101.48885
101.38236
101.47455
101.465164
101.43567
101.464355
101.427124
101.427635
101.45914
101.43133
101.42177
101.43711
101.42579
101.44139
101.436516
101.42763
101.4407
101.43691
101.46383
101.46587
101.42797
101.391
101.42705
101.40921
101.450584
101.438286
101.4315
101.48385
101.40598
101.43269
101.487625
101.40494
101.44352
101.46243
101.44175
101.43184
101.45585
101.44836
101.4664
101.45272
101.48414
101.48892
101.48381
101.42378
101.459236
101.469955
101.48142
101.48996
101.466965
101.45718
101.48247
101.43901
101.46061
101.44355
101.475555
101.43136
101.46191
101.46541
101.48452
101.44

101.15063
101.135056
101.145546
101.11968
101.129524
101.11266
101.10823
101.10232
101.131996
101.07789
101.12204
101.11829
101.09559
101.1008
101.11923
101.1313
101.130714
101.097855
101.11838
101.12395
101.09697
101.14325
101.13925
101.16455
101.144196
101.12559
101.128746
101.13184
101.1319
101.1442
101.10569
101.11127
101.10661
101.115944
101.10307
101.1188
101.118744
101.13505
101.12326
101.14227
101.11854
101.080864
101.03611
101.06358
101.12712
101.095924
101.07546
101.06774
101.06732
101.11449
101.09181
101.068085
101.04836
101.06266
101.041084
101.07681
101.04593
101.01237
101.050125
101.05577
100.979034
101.04371
101.03789
101.02672
101.03753
101.02943
101.02638
101.034775
101.06145
101.02593
101.020905
101.005035
101.008766
101.05081
101.03307
100.98252
100.97268
101.00845
101.035126
100.98162
101.01293
100.992775
100.96283
101.00175
100.96786
101.01884
101.00928
101.013336
101.016716
100.96797
101.00125
101.00458
100.96886
101.00957
101.01975
100.94159
101.062195
101.00443


100.70776
100.72697
100.75139
100.74058
100.778145
100.77237
100.73504
100.743645
100.73465
100.72076
100.753136
100.69998
100.741585
100.71234
100.728645
100.785164
100.80664
100.759094
100.765366
100.808495
100.74357
100.75825
100.798454
100.78294
100.67952
100.77203
100.73994
100.775314
100.72751
100.76386
100.72766
100.774185
100.769226
100.69893
100.705536
100.75634
100.75635
100.72138
100.713234
100.7378
100.67424
100.768616
100.74219
100.73668
100.74083
100.73845
100.755554
100.74215
100.765175
100.75305
100.7161
100.75233
100.73273
100.766594
100.76176
100.778435
100.73744
100.789825
100.78535
100.782906
100.81764
100.77317
100.78253
100.79975
100.740524
100.75851
100.7184
100.74166
100.75768
100.73741
100.72447
100.74753
100.74783
100.781685
100.79222
100.797325
100.748535
100.75596
100.77758
100.803825
100.804726
100.78179
100.844246
100.82481
100.86799
100.857834
100.88165
100.80517
100.857155
100.83905
100.83458
100.80448
100.84201
100.84929
100.81262
100.8668
100.828636
10

100.933136
100.90963
100.888985
100.9085
100.916916
100.917885
100.941475
100.94164
100.9508
100.9352
100.928566
100.94562
100.93108
100.91618
100.94015
100.89757
100.922844
100.88818
100.84413
100.91338
100.94188
100.91051
100.86547
100.872635
100.90118
100.888145
100.901
100.90643
100.85879
100.883865
100.91742
100.9266
100.90555
100.88368
100.89096
100.90927
100.892
100.893616
100.9248
100.887024
100.86988
100.906
100.85864
100.885345
100.86206
100.95613
100.94416
100.960915
100.90884
100.95089
100.932396
100.940544
100.97039
100.96657
100.94539
100.96346
100.969536
100.953186
100.91625
100.941246
100.978004
100.984375
100.90955
100.96612
100.98289
100.942406
100.947784
100.94705
100.93605
100.88502
100.92987
100.94932
100.90045
100.932495
100.90907
100.94553
100.92315
100.96662
100.909744
100.94248
100.9471
100.974365
100.937294
100.947586
100.91618
100.92135
100.9165
100.921844
100.880974
100.904366
100.9319
100.938354
100.9018
100.85636
100.88487
100.89203
100.858055
100.90651
10

101.49185
101.5565
101.54692
101.54974
101.56953
101.562065
101.51936
101.55451
101.56767
101.573235
101.59567
101.595184
101.62277
101.59367
101.57703
101.557686
101.56053
101.60881
101.60147
101.62071
101.649475
101.63703
101.5938
101.58953
101.63308
101.61677
101.62152
101.60814
101.592026
101.57985
101.659256
101.6067
101.615685
101.6244
101.663704
101.65489
101.65377
101.617
101.63405
101.62504
101.61159
101.5978
101.64604
101.6204
101.641495
101.63576
101.649
101.64782
101.654434
101.66006
101.61877
101.67144
101.667915
101.63864
101.659485
101.668724
101.65526
101.64515
101.6293
101.6563
101.66236
101.63059
101.615105
101.64221
101.650246
101.62785
101.648796
101.66902
101.61929
101.64081
101.67519
101.669876
101.70464
101.64648
101.6557
101.6653
101.600975
101.67713
101.67678
101.72209
101.697655
101.71961
101.71283
101.70238
101.70967
101.72268
101.7249
101.665306
101.66339
101.63046
101.66401
101.66775
101.63316
101.71411
101.66875
101.632904
101.64814
101.676025
101.703384
1

101.44589
101.49973
101.44083
101.49462
101.48062
101.44371
101.46099
101.461914
101.48152
101.50063
101.45554
101.47244
101.5319
101.49288
101.484856
101.46216
101.49126
101.47176
101.48458
101.51763
101.50913
101.48528
101.51426
101.52052
101.486336
101.53953
101.4509
101.51107
101.48048
101.50699
101.518776
101.497
101.53634
101.53934
101.528015
101.52532
101.493744
101.49521
101.53295
101.532394
101.53296
101.51838
101.563156
101.57207
101.58273
101.54106
101.547226
101.57236
101.51503
101.552536
101.540344
101.51181
101.52379
101.53287
101.54919
101.537254
101.514915
101.541855
101.523834
101.53653
101.54764
101.5323
101.548035
101.56187
101.551506
101.54142
101.49683
101.50675
101.51295
101.54525
101.528496
101.541725
101.54581
101.51652
101.51607
101.52954
101.5153
101.49815
101.54591
101.509186
101.52605
101.49196
101.50611
101.47349
101.47761
101.507034
101.54816
101.53033
101.51498
101.49835
101.478645
101.51396
101.53635
101.55032
101.50394
101.51584
101.52427
101.49527
101.

101.504974
101.43024
101.50196
101.48065
101.52023
101.47939
101.44665
101.528946
101.50384
101.490685
101.47212
101.45371
101.458885
101.43806
101.49416
101.475266
101.49709
101.44278
101.48348
101.48717
101.444954
101.48658
101.47931
101.482124
101.463295
101.51894
101.47745
101.49362
101.47488
101.49055
101.51922
101.49545
101.53278
101.54519
101.51784
101.504074
101.49994
101.4739
101.54888
101.55019
101.492775
101.506004
101.49214
101.50264
101.50608
101.49476
101.4874
101.437454
101.485664
101.518875
101.4858
101.475815
101.47629
101.49357
101.43882
101.505646
101.5011
101.49808
101.47373
101.48533
101.46697
101.45162
101.46802
101.438576
101.45147
101.438354
101.42927
101.45318
101.47832
101.49731
101.4533
101.431244
101.42813
101.45526
101.41762
101.47966
101.44743
101.43736
101.42485
101.41952
101.41653
101.42629
101.41169
101.44734
101.435104
101.39202
101.43351
101.45841
101.45958
101.47993
101.43674
101.4624
101.41255
101.475136
101.46502
101.470024
101.466
101.472275
101.4

102.392624
102.42033
102.445984
102.39691
102.44399
102.43406
102.44049
102.466606
102.38442
102.436195
102.43759
102.45365
102.47789
102.47451
102.42984
102.49198
102.47905
102.44723
102.50538
102.47171
102.47368
102.44095
102.462006
102.43234
102.472305
102.48944
102.512886
102.45122
102.42706
102.47843
102.48848
102.48776
102.44826
102.49209
102.51038
102.50224
102.535645
102.50619
102.47756
102.5115
102.5177
102.521034
102.53478
102.51134
102.46731
102.51253
102.55109
102.51013
102.520515
102.52694
102.5221
102.53363
102.511314
102.54165
102.53283
102.57969
102.54244
102.53903
102.54678
102.52541
102.49047
102.52586
102.54971
102.486534
102.5203
102.44762
102.541504
102.524574
102.531746
102.52961
102.547134
102.50783
102.50064
102.52379
102.51694
102.5239
102.49535
102.53057
102.516014
102.50501
102.508896
102.548195
102.54096
102.54782
102.5146
102.55807
102.57006
102.5341
102.54363
102.58347
102.55515
102.5437
102.5018
102.55312
102.51716
102.53817
102.53081
102.52768
102.527306

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-a579e3a62a05>", line 233, in <module>
    env.run(agent)
  File "<ipython-input-1-a579e3a62a05>", line 197, in run
    a = agent.act(s)
  File "<ipython-input-1-a579e3a62a05>", line 119, in act
    return numpy.argmax(self.brain.predictOne(s))
  File "<ipython-input-1-a579e3a62a05>", line 67, in predictOne
    return self.predict(s.reshape(1, self.stateCnt), target=target).flatten()
  File "<ipython-input-1-a579e3a62a05>", line 64, in predict
    return self.model.predict(s)
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1169, in predict
    steps=steps)
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/keras/engine/training_arrays.py", line 294, in predict_loop
    batch_outs = f(ins_batc

KeyboardInterrupt: 

# 4

In [4]:
# -*- coding: utf-8 -*-
"""
Reinforcement Learning (DQN) Tutorial
=====================================
**Author**: `Adam Paszke <https://github.com/apaszke>`_


This tutorial shows how to use PyTorch to train a Deep Q Learning (DQN) agent
on the CartPole-v0 task from the `OpenAI Gym <https://gym.openai.com/>`__.

**Task**

The agent has to decide between two actions - moving the cart left or
right - so that the pole attached to it stays upright. You can find an
official leaderboard with various algorithms and visualizations at the
`Gym website <https://gym.openai.com/envs/CartPole-v0>`__.

.. figure:: /_static/img/cartpole.gif
   :alt: cartpole

   cartpole

As the agent observes the current state of the environment and chooses
an action, the environment *transitions* to a new state, and also
returns a reward that indicates the consequences of the action. In this
task, rewards are +1 for every incremental timestep and the environment
terminates if the pole falls over too far or the cart moves more then 2.4
units away from center. This means better performing scenarios will run
for longer duration, accumulating larger return.

The CartPole task is designed so that the inputs to the agent are 4 real
values representing the environment state (position, velocity, etc.).
However, neural networks can solve the task purely by looking at the
scene, so we'll use a patch of the screen centered on the cart as an
input. Because of this, our results aren't directly comparable to the
ones from the official leaderboard - our task is much harder.
Unfortunately this does slow down the training, because we have to
render all the frames.

Strictly speaking, we will present the state as the difference between
the current screen patch and the previous one. This will allow the agent
to take the velocity of the pole into account from one image.

**Packages**


First, let's import needed packages. Firstly, we need
`gym <https://gym.openai.com/docs>`__ for the environment
(Install using `pip install gym`).
We'll also use the following from PyTorch:

-  neural networks (``torch.nn``)
-  optimization (``torch.optim``)
-  automatic differentiation (``torch.autograd``)
-  utilities for vision tasks (``torchvision`` - `a separate
   package <https://github.com/pytorch/vision>`__).

"""

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


env = gym.make('CartPole-v0').unwrapped

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


######################################################################
# Replay Memory
# -------------
#
# We'll be using experience replay memory for training our DQN. It stores
# the transitions that the agent observes, allowing us to reuse this data
# later. By sampling from it randomly, the transitions that build up a
# batch are decorrelated. It has been shown that this greatly stabilizes
# and improves the DQN training procedure.
#
# For this, we're going to need two classses:
#
# -  ``Transition`` - a named tuple representing a single transition in
#    our environment. It maps essentially maps (state, action) pairs
#    to their (next_state, reward) result, with the state being the
#    screen difference image as described later on.
# -  ``ReplayMemory`` - a cyclic buffer of bounded size that holds the
#    transitions observed recently. It also implements a ``.sample()``
#    method for selecting a random batch of transitions for training.
#

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


######################################################################
# Now, let's define our model. But first, let quickly recap what a DQN is.
#
# DQN algorithm
# -------------
#
# Our environment is deterministic, so all equations presented here are
# also formulated deterministically for the sake of simplicity. In the
# reinforcement learning literature, they would also contain expectations
# over stochastic transitions in the environment.
#
# Our aim will be to train a policy that tries to maximize the discounted,
# cumulative reward
# :math:`R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t`, where
# :math:`R_{t_0}` is also known as the *return*. The discount,
# :math:`\gamma`, should be a constant between :math:`0` and :math:`1`
# that ensures the sum converges. It makes rewards from the uncertain far
# future less important for our agent than the ones in the near future
# that it can be fairly confident about.
#
# The main idea behind Q-learning is that if we had a function
# :math:`Q^*: State \times Action \rightarrow \mathbb{R}`, that could tell
# us what our return would be, if we were to take an action in a given
# state, then we could easily construct a policy that maximizes our
# rewards:
#
# .. math:: \pi^*(s) = \arg\!\max_a \ Q^*(s, a)
#
# However, we don't know everything about the world, so we don't have
# access to :math:`Q^*`. But, since neural networks are universal function
# approximators, we can simply create one and train it to resemble
# :math:`Q^*`.
#
# For our training update rule, we'll use a fact that every :math:`Q`
# function for some policy obeys the Bellman equation:
#
# .. math:: Q^{\pi}(s, a) = r + \gamma Q^{\pi}(s', \pi(s'))
#
# The difference between the two sides of the equality is known as the
# temporal difference error, :math:`\delta`:
#
# .. math:: \delta = Q(s, a) - (r + \gamma \max_a Q(s', a))
#
# To minimise this error, we will use the `Huber
# loss <https://en.wikipedia.org/wiki/Huber_loss>`__. The Huber loss acts
# like the mean squared error when the error is small, but like the mean
# absolute error when the error is large - this makes it more robust to
# outliers when the estimates of :math:`Q` are very noisy. We calculate
# this over a batch of transitions, :math:`B`, sampled from the replay
# memory:
#
# .. math::
#
#    \mathcal{L} = \frac{1}{|B|}\sum_{(s, a, s', r) \ \in \ B} \mathcal{L}(\delta)
#
# .. math::
#
#    \text{where} \quad \mathcal{L}(\delta) = \begin{cases}
#      \frac{1}{2}{\delta^2}  & \text{for } |\delta| \le 1, \\
#      |\delta| - \frac{1}{2} & \text{otherwise.}
#    \end{cases}
#
# Q-network
# ^^^^^^^^^
#
# Our model will be a convolutional neural network that takes in the
# difference between the current and previous screen patches. It has two
# outputs, representing :math:`Q(s, \mathrm{left})` and
# :math:`Q(s, \mathrm{right})` (where :math:`s` is the input to the
# network). In effect, the network is trying to predict the *expected return* of
# taking each action given the current input.
#

class DQN(nn.Module):

    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))


######################################################################
# Input extraction
# ^^^^^^^^^^^^^^^^
#
# The code below are utilities for extracting and processing rendered
# images from the environment. It uses the ``torchvision`` package, which
# makes it easy to compose image transforms. Once you run the cell it will
# display an example patch that it extracted.
#

resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.CUBIC),
                    T.ToTensor()])


def get_cart_location(screen_width):
    world_width = env.x_threshold * 2
    scale = screen_width / world_width
    return int(env.state[0] * scale + screen_width / 2.0)  # MIDDLE OF CART

def get_screen():
    # Returned screen requested by gym is 400x600x3, but is sometimes larger
    # such as 800x1200x3. Transpose it into torch order (CHW).
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    # Cart is in the lower half, so strip off the top and bottom of the screen
    _, screen_height, screen_width = screen.shape
    screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)]
    view_width = int(screen_width * 0.6)
    cart_location = get_cart_location(screen_width)
    if cart_location < view_width // 2:
        slice_range = slice(view_width)
    elif cart_location > (screen_width - view_width // 2):
        slice_range = slice(-view_width, None)
    else:
        slice_range = slice(cart_location - view_width // 2,
                            cart_location + view_width // 2)
    # Strip off the edges, so that we have a square image centered on a cart
    screen = screen[:, :, slice_range]
    # Convert to float, rescale, convert to torch tensor
    # (this doesn't require a copy)
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0).to(device)


env.reset()
plt.figure()
plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
           interpolation='none')
plt.title('Example extracted screen')
plt.show()


######################################################################
# Training
# --------
#
# Hyperparameters and utilities
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# This cell instantiates our model and its optimizer, and defines some
# utilities:
#
# -  ``select_action`` - will select an action accordingly to an epsilon
#    greedy policy. Simply put, we'll sometimes use our model for choosing
#    the action, and sometimes we'll just sample one uniformly. The
#    probability of choosing a random action will start at ``EPS_START``
#    and will decay exponentially towards ``EPS_END``. ``EPS_DECAY``
#    controls the rate of the decay.
# -  ``plot_durations`` - a helper for plotting the durations of episodes,
#    along with an average over the last 100 episodes (the measure used in
#    the official evaluations). The plot will be underneath the cell
#    containing the main training loop, and will update after every
#    episode.
#

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)


steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


episode_durations = []


def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())


######################################################################
# Training loop
# ^^^^^^^^^^^^^
#
# Finally, the code for training our model.
#
# Here, you can find an ``optimize_model`` function that performs a
# single step of the optimization. It first samples a batch, concatenates
# all the tensors into a single one, computes :math:`Q(s_t, a_t)` and
# :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our
# loss. By defition we set :math:`V(s) = 0` if :math:`s` is a terminal
# state. We also use a target network to compute :math:`V(s_{t+1})` for
# added stability. The target network has its weights kept frozen most of
# the time, but is updated with the policy network's weights every so often.
# This is usually a set number of steps but we shall use episodes for
# simplicity.
#

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.uint8)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()


######################################################################
#
# Below, you can find the main training loop. At the beginning we reset
# the environment and initialize the ``state`` Tensor. Then, we sample
# an action, execute it, observe the next screen and the reward (always
# 1), and optimize our model once. When the episode ends (our model
# fails), we restart the loop.
#
# Below, `num_episodes` is set small. You should download
# the notebook and run lot more epsiodes, such as 300+ for meaningful
# duration improvements.
#

num_episodes = 50
for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    last_screen = get_screen()
    current_screen = get_screen()
    state = current_screen - last_screen
    for t in count():
        # Select and perform an action
        action = select_action(state)
        _, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)

        # Observe new state
        last_screen = current_screen
        current_screen = get_screen()
        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the target network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()

######################################################################
# Here is the diagram that illustrates the overall resulting data flow.
#
# .. figure:: /_static/img/reinforcement_learning_diagram.jpg
#
# Actions are chosen either randomly or based on a policy, getting the next
# step sample from the gym environment. We record the results in the
# replay memory and also run optimization step on every iteration.
# Optimization picks a random batch from the replay memory to do training of the
# new policy. "Older" target_net is also used in optimization to compute the
# expected Q values; it is updated occasionally to keep it current.
#

# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
# https://raw.githubusercontent.com/pytorch/tutorials/master/intermediate_source/reinforcement_q_learning.py

<Figure size 432x288 with 0 Axes>

Complete


<Figure size 432x288 with 0 Axes>

# 5

In [3]:
# double DQN


import gym
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import numpy as np
import random
from matplotlib import pyplot as plt


# CARTPOLE GAME SETTINGS
OBSERVATION_SPACE_DIMS = 4
ACTION_SPACE = [0,1]

# AGENT/NETWORK HYPERPARAMETERS
EPSILON_INITIAL = 0.5 # exploration rate
EPSILON_DECAY = 0.99
EPSILON_MIN = 0.01
ALPHA = 0.001 # learning rate
GAMMA = 0.99 # discount factor
TAU = 0.1 # target network soft update hyperparameter
EXPERIENCE_REPLAY_BATCH_SIZE = 32
AGENT_MEMORY_LIMIT = 2000
MIN_MEMORY_FOR_EXPERIENCE_REPLAY = 500



def create_dqn():
    # not actually that deep
    nn = Sequential()
    nn.add(Dense(64, input_dim=OBSERVATION_SPACE_DIMS, activation='relu'))
    nn.add(Dense(64, activation='relu'))
    nn.add(Dense(len(ACTION_SPACE), activation='linear'))
    nn.compile(loss='mse', optimizer=Adam(lr=ALPHA))
    return nn
                  
                  
class DoubleDQNAgent(object):

       
    def __init__(self):
        self.memory = []
        self.online_network = create_dqn()
        self.target_network = create_dqn()
        self.epsilon = EPSILON_INITIAL
        self.has_talked = False
    
    
    def act(self, state):
        if self.epsilon > np.random.rand():
            # explore
            return np.random.choice(ACTION_SPACE)
        else:
            # exploit
            state = self._reshape_state_for_net(state)
            q_values = self.online_network.predict(state)[0]
            return np.argmax(q_values)


    def experience_replay(self):

        minibatch = random.sample(self.memory, EXPERIENCE_REPLAY_BATCH_SIZE)
        minibatch_new_q_values = []

        for experience in minibatch:
            state, action, reward, next_state, done = experience
            state = self._reshape_state_for_net(state)
            experience_new_q_values = self.online_network.predict(state)[0]
            if done:
                q_update = reward
            else:
                next_state = self._reshape_state_for_net(next_state)
                # using online network to SELECT action
                online_net_selected_action = np.argmax(self.online_network.predict(next_state))
                # using target network to EVALUATE action
                target_net_evaluated_q_value = self.target_network.predict(next_state)[0][online_net_selected_action]
                q_update = reward + GAMMA * target_net_evaluated_q_value
            experience_new_q_values[action] = q_update
            minibatch_new_q_values.append(experience_new_q_values)
        minibatch_states = np.array([e[0] for e in minibatch])
        minibatch_new_q_values = np.array(minibatch_new_q_values)
        self.online_network.fit(minibatch_states, minibatch_new_q_values, verbose=False, epochs=1)
        
        
    def update_target_network(self):
        q_network_theta = self.online_network.get_weights()
        target_network_theta = self.target_network.get_weights()
        counter = 0
        for q_weight, target_weight in zip(q_network_theta,target_network_theta):
            target_weight = target_weight * (1-TAU) + q_weight * TAU
            target_network_theta[counter] = target_weight
            counter += 1
        self.target_network.set_weights(target_network_theta)


    def remember(self, state, action, reward, next_state, done):
        if len(self.memory) <= AGENT_MEMORY_LIMIT:
            experience = (state, action, reward, next_state, done)
            self.memory.append(experience)
                  
                  
    def update_epsilon(self):
        self.epsilon = max(self.epsilon * EPSILON_DECAY, EPSILON_MIN)


    def _reshape_state_for_net(self, state):
        return np.reshape(state,(1, OBSERVATION_SPACE_DIMS))  


def test_agent():
    env = gym.make('CartPole-v0')
    env.seed(1)
    trials = []
    NUMBER_OF_TRIALS=10
    MAX_TRAINING_EPISODES = 2000
    MAX_STEPS_PER_EPISODE = 200

    for trial_index in range(NUMBER_OF_TRIALS):
        agent = DoubleDQNAgent()
        trial_episode_scores = []

        for episode_index in range(1, MAX_TRAINING_EPISODES+1):
            state = env.reset()
            episode_score = 0

            for _ in range(MAX_STEPS_PER_EPISODE):
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)
                episode_score += reward
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                if len(agent.memory) > MIN_MEMORY_FOR_EXPERIENCE_REPLAY:
                    agent.experience_replay()
                    agent.update_target_network()
                if done:
                    break
            
            trial_episode_scores.append(episode_score)
            agent.update_epsilon()
            last_100_avg = np.mean(trial_episode_scores[-100:])
            print('E %d scored %d, avg %.2f' % (episode_index, episode_score, last_100_avg))
            if len(trial_episode_scores) >= 100 and last_100_avg >= 195.0:
                print('Trial %d solved in %d episodes!' % (trial_index, (episode_index - 100)))
                break
        trials.append(np.array(trial_episode_scores))
    return np.array(trials)



def plot_trials(trials):
    _, axis = plt.subplots()    

    for i, trial in enumerate(trials):
        steps_till_solve = trial.shape[0]-100
        # stop trials at 2000 steps
        if steps_till_solve < 1900:
            bar_color = 'b'
            bar_label = steps_till_solve
        else:
            bar_color = 'r'
            bar_label = 'Stopped at 2000'
        plt.bar(np.arange(i,i+1), steps_till_solve, 0.5, color=bar_color, align='center', alpha=0.5)
        axis.text(i-.25, steps_till_solve + 20, bar_label, color=bar_color)

    plt.ylabel('Episodes Till Solve')
    plt.xlabel('Trial')
    trial_labels = [str(i+1) for i in range(len(trials))]
    plt.xticks(np.arange(len(trials)), trial_labels)
    # remove y axis labels and ticks
    axis.yaxis.set_major_formatter(plt.NullFormatter())
    plt.tick_params(axis='both', left='off')

    plt.title('Double DQN CartPole v-0 Trials')
    plt.show()


def plot_individual_trial(trial):
    plt.plot(trial)
    plt.ylabel('Steps in Episode')
    plt.xlabel('Episode')
    plt.title('Double DQN CartPole v-0 Steps in Select Trial')
    plt.show()


if __name__ == '__main__':
    trials = test_agent()
    # print 'Saving', file_name
    # np.save('double_dqn_cartpole_trials.npy', trials)
    # trials = np.load('double_dqn_cartpole_trials.npy')
    plot_trials(trials)
    plot_individual_trial(trials[1])
    
# https://raw.githubusercontent.com/lsimmons2/double-dqn-cartpole-solution/master/double_dqn.py
# https://medium.com/@leosimmons/double-dqn-implementation-to-solve-openai-gyms-cartpole-v-0-df554cd0614d

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
E 1 scored 19, avg 19.00
E 2 scored 11, avg 15.00
E 3 scored 13, avg 14.33
E 4 scored 12, avg 13.75
E 5 scored 19, avg 14.80
E 6 scored 14, avg 14.67
E 7 scored 11, avg 14.14
E 8 scored 11, avg 13.75
E 9 scored 13, avg 13.67
E 10 scored 15, avg 13.80
E 11 scored 10, avg 13.45
E 12 scored 9, avg 13.08
E 13 scored 14, avg 13.15
E 14 scored 9, avg 12.86
E 15 scored 15, avg 13.00
E 16 scored 13, avg 13.00
E 17 scored 13, avg 13.00
E 18 scored 9, avg 12.78
E 19 scored 11, avg 12.68
E 20 scored 10, avg 12.55
E 21 scored 13, avg 12.57
E 22 scored 16, avg 12.73
E 23 scored 8, avg 12.52
E 24 scored 15, avg 12.62
E 25 scored 16, avg 12.76
E 26 scored 12, avg 12.73
E 27 scored 9, avg 12.59
E 28 scored 10, avg 12.50
E 29 scored 10, avg 12.41
E 30 scored 12, avg 12.40
E 31 scored 10, avg 12.32
E 32 scored 12, avg 12.31
E 33 scored 9, avg 12.21
E 34 scored 8, avg 12.09
E 35 scored 10, avg 12.03
E 36 scored 10, avg 11.97
E 37 sco

E 1 scored 44, avg 44.00
E 2 scored 48, avg 46.00
E 3 scored 92, avg 61.33
E 4 scored 79, avg 65.75
E 5 scored 49, avg 62.40
E 6 scored 57, avg 61.50
E 7 scored 125, avg 70.57
E 8 scored 44, avg 67.25
E 9 scored 44, avg 64.67
E 10 scored 9, avg 59.10
E 11 scored 14, avg 55.00
E 12 scored 18, avg 51.92
E 13 scored 13, avg 48.92
E 14 scored 10, avg 46.14
E 15 scored 8, avg 43.60
E 16 scored 12, avg 41.62
E 17 scored 9, avg 39.71
E 18 scored 13, avg 38.22
E 19 scored 12, avg 36.84
E 20 scored 9, avg 35.45
E 21 scored 11, avg 34.29
E 22 scored 47, avg 34.86
E 23 scored 18, avg 34.13
E 24 scored 28, avg 33.88
E 25 scored 18, avg 33.24
E 26 scored 25, avg 32.92
E 27 scored 42, avg 33.26
E 28 scored 15, avg 32.61
E 29 scored 33, avg 32.62
E 30 scored 31, avg 32.57
E 31 scored 35, avg 32.65
E 32 scored 26, avg 32.44
E 33 scored 62, avg 33.33
E 34 scored 29, avg 33.21
E 35 scored 64, avg 34.09
E 36 scored 16, avg 33.58
E 37 scored 29, avg 33.46
E 38 scored 53, avg 33.97
E 39 scored 23, avg 33.6

E 137 scored 200, avg 110.48
E 138 scored 200, avg 112.36
E 139 scored 200, avg 114.23
E 140 scored 200, avg 116.12
E 141 scored 200, avg 118.01
E 142 scored 200, avg 119.91
E 143 scored 200, avg 121.82
E 144 scored 200, avg 123.71
E 145 scored 200, avg 125.59
E 146 scored 200, avg 127.50
E 147 scored 200, avg 129.40
E 148 scored 200, avg 131.28
E 149 scored 200, avg 133.18
E 150 scored 200, avg 135.09
E 151 scored 200, avg 137.00
E 152 scored 200, avg 138.92
E 153 scored 200, avg 140.80
E 154 scored 200, avg 142.63
E 155 scored 200, avg 144.49
E 156 scored 200, avg 146.35
E 157 scored 200, avg 148.18
E 158 scored 200, avg 150.00
E 159 scored 200, avg 151.90
E 160 scored 200, avg 153.81
E 161 scored 200, avg 155.71
E 162 scored 200, avg 157.61
E 163 scored 200, avg 159.49
E 164 scored 200, avg 161.38
E 165 scored 200, avg 163.22
E 166 scored 200, avg 165.13
E 167 scored 200, avg 166.81
E 168 scored 200, avg 168.55
E 169 scored 200, avg 169.53
E 170 scored 200, avg 171.05
E 171 scored 2

E 84 scored 60, avg 30.24
E 85 scored 155, avg 31.71
E 86 scored 56, avg 31.99
E 87 scored 136, avg 33.18
E 88 scored 56, avg 33.44
E 89 scored 121, avg 34.43
E 90 scored 37, avg 34.46
E 91 scored 46, avg 34.58
E 92 scored 46, avg 34.71
E 93 scored 48, avg 34.85
E 94 scored 103, avg 35.57
E 95 scored 64, avg 35.87
E 96 scored 95, avg 36.49
E 97 scored 69, avg 36.82
E 98 scored 182, avg 38.31
E 99 scored 184, avg 39.78
E 100 scored 200, avg 41.38
E 101 scored 200, avg 43.19
E 102 scored 200, avg 45.07
E 103 scored 200, avg 46.73
E 104 scored 200, avg 48.59
E 105 scored 200, avg 50.46
E 106 scored 200, avg 52.36
E 107 scored 63, avg 52.91
E 108 scored 200, avg 54.62
E 109 scored 200, avg 56.47
E 110 scored 200, avg 58.30
E 111 scored 200, avg 60.20
E 112 scored 200, avg 62.09
E 113 scored 200, avg 63.95
E 114 scored 200, avg 65.79
E 115 scored 200, avg 67.63
E 116 scored 200, avg 69.51
E 117 scored 200, avg 71.40
E 118 scored 200, avg 73.31
E 119 scored 200, avg 75.16
E 120 scored 200, a

E 185 scored 101, avg 143.85
E 186 scored 139, avg 143.70
E 187 scored 200, avg 143.70
E 188 scored 120, avg 142.94
E 189 scored 200, avg 143.20
E 190 scored 200, avg 143.44
E 191 scored 108, avg 142.75
E 192 scored 147, avg 143.14
E 193 scored 106, avg 142.70
E 194 scored 129, avg 141.99
E 195 scored 129, avg 141.73
E 196 scored 85, avg 141.19
E 197 scored 157, avg 140.87
E 198 scored 94, avg 140.30
E 199 scored 200, avg 141.00
E 200 scored 182, avg 141.54
E 201 scored 200, avg 141.95
E 202 scored 107, avg 141.45
E 203 scored 123, avg 140.73
E 204 scored 200, avg 140.85
E 205 scored 200, avg 140.85
E 206 scored 200, avg 140.85
E 207 scored 113, avg 140.78
E 208 scored 99, avg 140.14
E 209 scored 200, avg 140.91
E 210 scored 138, avg 140.78
E 211 scored 91, avg 140.36
E 212 scored 200, avg 140.66
E 213 scored 200, avg 141.56
E 214 scored 83, avg 141.07
E 215 scored 200, avg 141.07
E 216 scored 86, avg 140.72
E 217 scored 200, avg 140.95
E 218 scored 97, avg 140.45
E 219 scored 200, avg

E 167 scored 123, avg 157.50
E 168 scored 199, avg 159.05
E 169 scored 153, avg 159.04
E 170 scored 173, avg 160.13
E 171 scored 200, avg 161.83
E 172 scored 153, avg 162.55
E 173 scored 160, avg 163.59
E 174 scored 175, avg 165.01
E 175 scored 200, avg 165.86
E 176 scored 151, avg 166.65
E 177 scored 163, avg 167.66
E 178 scored 200, avg 168.44
E 179 scored 159, avg 169.09
E 180 scored 134, avg 169.84
E 181 scored 161, avg 171.10
E 182 scored 177, avg 172.11
E 183 scored 141, avg 172.59
E 184 scored 176, avg 172.97
E 185 scored 200, avg 172.97
E 186 scored 139, avg 172.36
E 187 scored 143, avg 172.47
E 188 scored 125, avg 172.52
E 189 scored 184, avg 173.06
E 190 scored 126, avg 172.79
E 191 scored 175, avg 172.54
E 192 scored 136, avg 172.13
E 193 scored 131, avg 172.08
E 194 scored 186, avg 172.26
E 195 scored 200, avg 172.26


KeyboardInterrupt: 

# 6

In [1]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import gym
import numpy as np
import random as ran

env = gym.make('CartPole-v1')

# 꺼내서 사용할 리플레이 갯수
REPLAY = 10
# 리플레이를 저장할 리스트
REPLAY_MEMORY = []
# 미니배치
MINIBATCH = 50

INPUT = env.observation_space.shape[0]
OUTPUT = env.action_space.n

# 하이퍼파라미터
LEARNING_LATE = 0.01
NUM_EPISODE = 2000

DISCOUNT = 0.99


# 네트워크 구성
x=tf.placeholder(dtype=tf.float32, shape=(1,4))

W1 = tf.get_variable('W1',shape=[INPUT,10],initializer=tf.contrib.layers.xavier_initializer())
W2 = tf.get_variable('W4',shape=[10, OUTPUT],initializer=tf.contrib.layers.xavier_initializer())

L1=tf.nn.tanh(tf.matmul(x,W1))
Q_pre = tf.matmul(L1,W2)

y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))

# 손실 함수
loss = tf.reduce_sum(tf.square(y-Q_pre))
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_LATE)
train = optimizer.minimize(loss)

init = tf.global_variables_initializer()

rList=[]

with tf.Session() as sess:
    sess.run(init)
    for episode in range(5000):

        s = env.reset()

        e = 1. / ((episode/25)+1)
        rall = 0
        d = False
        count=0

        while not d:
            # env.render()
            count+=1

            # 현재 상태(s)로 Q값을 예측
            s_t = np.reshape(s,[1,INPUT])
            Q = sess.run(Q_pre, feed_dict={x:s_t})

            # e-greedy 를 사용하여 action값 구함
            if e > np.random.rand(1):
                a = env.action_space.sample()
            else:
                a = np.argmax(Q)

            # action을 취함
            s1, r, d, _ = env.step(a)

            # state, action, reward, next_state, done 을 메모리에 저장
            REPLAY_MEMORY.append([s_t,a,r,s1,d])

            # 메모리에 50000개 이상의 값이 들어가면 가장 먼저 들어간 것부터 삭제
            if len(REPLAY_MEMORY) > 50000:
                del REPLAY_MEMORY[0]

            rall += r
            s = s1

        # 10 번의 스탭마다 미니배치로 학습
        if episode % 10 == 1 :

            for i in range(MINIBATCH):

                # 메모리에서 사용할 리플레이를 랜덤하게 가져옴
                for sample in ran.sample(REPLAY_MEMORY, REPLAY):

                    s_t_r, a_r, r_r, s1_r ,d_r = sample

                    # DQN 알고리즘으로 학습
                    if d_r:
                        Q[0, a_r] = -100
                    else:
                        s1_t_r= np.reshape(s1_r,[1,INPUT])

                        Q1 = sess.run(Q_pre, feed_dict={x: s1_t_r})

                        Q[0, a_r] = r_r + DISCOUNT * np.max(Q1)

                    sess.run(train, feed_dict={x: s_t_r, y: Q})



        rList.append(rall)
        print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(episode, count, rall, np.mean(rList)))


    for episode in range(500):
        # state 초기화
        s = env.reset()

        rall = 0
        d = False
        count = 0
        # 에피소드가 끝나기 전까지 반복
        while not d :
            env.render()
            count += 1
            # state 값의 전처리
            s_t = np.reshape(s, [1, INPUT])

            # 현재 상태의 Q값을 에측
            Q = sess.run(Q_pre, feed_dict={x: s_t})
            a = np.argmax(Q)

            # 결정된 action으로 Environment에 입력
            s, r, d, _ = env.step(a)

            # 총 reward 합
            rall += r


        rList.append(rall)

        print("Episode : {} steps : {} r={}. averge reward : {}".format(episode, count, rall,
                                                                        np.mean(rList)))
# https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/master/CartPole/CartPole_DQN_NIPS2013.py


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Episode 0 finished after 18 timesteps with r=18.0. Running score: 18.0
Episode 1 finished after 13 timesteps with r=13.0. Running score: 15.5
Episode 2 finished after 24 timesteps with r=24.0. Running score: 18.333333333333332
Episode 3 finished after 13 timesteps with r=13.0. Running score: 17.0
Episode 4 finished after 13 timesteps with r=13.0. Running score: 16.2
Episode 5 finished after 11 timesteps with r=11.0. Running score: 15.333333333333334
Episode 6 finished after 10 timesteps with r=10.0. Running score: 14.571428571428571
Episode 7 finished after 14 timesteps with r=14.0. Running score: 14.5
Episode 8 finished after 35 timesteps with r=35.0. Running score: 16.7777777777777

Episode 101 finished after 11 timesteps with r=11.0. Running score: 13.382352941176471
Episode 102 finished after 10 timesteps with r=10.0. Running score: 13.349514563106796
Episode 103 finished after 11 timesteps with r=11.0. Running score: 13.326923076923077
Episode 104 finished after 10 timesteps with r=10.0. Running score: 13.295238095238096
Episode 105 finished after 10 timesteps with r=10.0. Running score: 13.264150943396226
Episode 106 finished after 10 timesteps with r=10.0. Running score: 13.233644859813085
Episode 107 finished after 9 timesteps with r=9.0. Running score: 13.194444444444445
Episode 108 finished after 9 timesteps with r=9.0. Running score: 13.155963302752294
Episode 109 finished after 10 timesteps with r=10.0. Running score: 13.127272727272727
Episode 110 finished after 11 timesteps with r=11.0. Running score: 13.108108108108109
Episode 111 finished after 9 timesteps with r=9.0. Running score: 13.071428571428571
Episode 112 finished after 10 timesteps with r=10

KeyboardInterrupt: 

# 7

In [1]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import gym
from gym import wrappers
import numpy as np
import random as ran

env = gym.make('CartPole-v0')

# 꺼내서 사용할 리플레이 갯수
REPLAY = 50
# 리플레이를 저장할 리스트
REPLAY_MEMORY = []
# 미니배치
MINIBATCH = 50

INPUT = env.observation_space.shape[0]
OUTPUT = env.action_space.n

# 하이퍼파라미터
LEARNING_LATE = 0.001
DISCOUNT = 0.99
model_path = "save/model.ckpt"


# 두개의 네트워크 구성

x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT))

y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT))
dropout = tf.placeholder(dtype=tf.float32)

# Main 네트워크
W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer())
W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer())
# W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer())
W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer())

b1 = tf.Variable(tf.zeros([1],dtype=tf.float32))
b2 = tf.Variable(tf.zeros([1],dtype=tf.float32))

_L1=tf.nn.relu(tf.matmul(x,W1)+b1)
L1=tf.nn.dropout(_L1,dropout)
_L2=tf.nn.relu(tf.matmul(L1,W2)+b2)
L2=tf.nn.dropout(_L2,dropout)
# L3=tf.nn.relu(tf.matmul(L2,W3))
Q_pre = tf.matmul(L2,W4)

# Target 네트워크
W1_r = tf.get_variable('W1_r',shape=[INPUT, 200])
W2_r = tf.get_variable('W2_r',shape=[200,200])
# W3_r = tf.get_variable('W3_r',shape=[200,150])
W4_r = tf.get_variable('W4_r',shape=[200, OUTPUT])

b1_r = tf.Variable(tf.zeros([1],dtype=tf.float32))
b2_r = tf.Variable(tf.zeros([1],dtype=tf.float32))


L1_r=tf.nn.relu(tf.matmul(x ,W1_r)+b1_r)
L2_r=tf.nn.relu(tf.matmul(L1_r,W2_r)+b2_r)
# L3_r=tf.nn.relu(tf.matmul(L2_r,W3_r))
Q_pre_r = tf.matmul(L2_r,W4_r)

# 총 Reward를 저장해놓을 리스트
rlist=[0]
recent_rlist=[0]

episode = 0

# Loss function 정의
cost = tf.reduce_sum(tf.square(y-Q_pre))
optimizer = tf.train.AdamOptimizer(LEARNING_LATE, epsilon=0.01)
train = optimizer.minimize(cost)


saver = tf.train.Saver()
loss = None

# 세션 정의
with tf.Session() as sess:
    # 변수 초기화
    sess.run(tf.global_variables_initializer())
    # Target 네트워크에 main 네트워크 값을 카피해줌
    sess.run(W1_r.assign(W1))
    sess.run(W2_r.assign(W2))
    sess.run(W4_r.assign(W4))
    sess.run(b1_r.assign(b1))
    sess.run(b2_r.assign(b2))

    # 에피소드 시작
    while np.mean(recent_rlist) < 195 :
        episode += 1

        # state 초기화
        s = env.reset()
        if len(recent_rlist) > 200:
            del recent_rlist[0]
        # e-greedy
        e = 1. / ((episode/25)+1)

        rall = 0
        d = False
        count = 0

        # 에피소드가 끝나기 전까지 반복
        while not d and count < 10000 :

            #env.render()
            count += 1

            # state 값의 전처리
            s_t = np.reshape(s,[1,INPUT])

            # 현재 상태의 Q값을 에측
            Q = sess.run(Q_pre, feed_dict={x:s_t, dropout: 1})

            # e-greedy 정책으로 랜덤하게 action 결정
            if e > np.random.rand(1):
                a = env.action_space.sample()
            else:
                a = np.argmax(Q)

            # 결정된 action으로 Environment에 입력
            s1, r, d, _ = env.step(a)

            # Environment에서 반환한 Next_state, action, reward, done 값들을
            # Replay_memory에 저장
            REPLAY_MEMORY.append([s_t,a,r,s1,d,count])

            # 저장된 값들이 50000개 이상 넘어가면 맨 앞 Replay부터 삭제
            if len(REPLAY_MEMORY) > 50000:
                del REPLAY_MEMORY[0]

            # 총 reward 합
            rall += r
            # state를 Next_state로 바꿈
            s = s1


        # 10번의 episode마다 학습
        if episode % 10 == 1 and len(REPLAY_MEMORY) > 50:

            # 50번의 미니배치로 학습
                # 저장된 리플레이 중에 학습에 사용할 랜덤한 리플레이 샘플들을 가져옴
            for sample in ran.sample(REPLAY_MEMORY, REPLAY):

                s_t_r, a_r, r_r, s1_r, d_r ,count_r= sample

                # 꺼내온 리플레이의 state의 Q값을 예측
                Q = sess.run(Q_pre, feed_dict={x: s_t_r, dropout: 1})

                if d_r:
                    # 꺼내온 리플레이의 상태가 끝난 상황이라면 Negative Reward를 부여
                    if count_r < env.spec.timestep_limit :
                        Q[0, a_r] = -100
                else:
                    # 끝나지 않았다면 Q값을 업데이트
                    s1_t_r= np.reshape(s1_r,[1,INPUT])
                    Q1 = sess.run(Q_pre_r, feed_dict={x: s1_t_r})
                    Q[0, a_r] = r_r + DISCOUNT * np.max(Q1)

                # 업데이트 된 Q값으로 main네트워크를 학습
                _, loss = sess.run([train, cost], feed_dict={x: s_t_r, y: Q, dropout:1})

            # 10번 마다 target 네트워크에 main 네트워크 값을 copy
            sess.run(W1_r.assign(W1))
            sess.run(W2_r.assign(W2))
            sess.run(W4_r.assign(W4))
            sess.run(b1_r.assign(b1))
            sess.run(b2_r.assign(b2))
            print(loss)

        # 총 reward의 합을 list에 저장
        recent_rlist.append(rall)
        rlist.append(rall)
        print("Episode:{} steps:{} reward:{} average reward:{} recent reward:{}".format(episode, count, rall,
                                                                                        np.mean(rlist),
                                                                                        np.mean(recent_rlist)))
        print("loss:",loss)
        

    save_path = saver.save(sess, model_path)
    print("Model saved in file: ",save_path)


    rlist=[]
    recent_rlist=[]


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, model_path)

    print("Model restored form file: ", save_path)
    for episode in range(500):
        # state 초기화
        s = env.reset()

        rall = 0
        d = False
        count = 0
        # 에피소드가 끝나기 전까지 반복
        while not d :
            env.render()
            count += 1
            # state 값의 전처리
            s_t = np.reshape(s, [1, INPUT])

            # 현재 상태의 Q값을 에측
            Q = sess.run(Q_pre, feed_dict={x: s_t,dropout: 1})
            a = np.argmax(Q)

            # 결정된 action으로 Environment에 입력
            s, r, d, _ = env.step(a)

            # 총 reward 합
            rall += r


        rlist.append(rall)

        print("Episode : {} steps : {} r={}. averge reward : {}".format(episode, count, rall,
                                                                        np.mean(rlist)))
# https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/master/CartPole/CartPole_DQN_Nature2015.py


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Episode:1 steps:20 reward:20.0 average reward:10.0 recent reward:10.0
loss: None
Episode:2 steps:30 reward:30.0 average reward:16.666666666666668 recent reward:16.666666666666668
loss: None
Episode:3 steps:16 reward:16.0 average reward:16.5 recent reward:16.5
loss: None
Episode:4 steps:14 reward:14.0 average reward:16.0 recent reward:16.0
loss: None
Episode:5 steps:29 reward:29.0 average reward:18.166666666666668 recent reward:18.166666666666668
loss: None
Episode:6 steps:37 reward:37.0 average reward:20.857142857142858 recent reward:20.857142857142858
loss: None
Episode

6.6924105
Episode:71 steps:100 reward:100.0 average reward:62.59722222222222 recent reward:62.59722222222222
loss: 6.6924105
Episode:72 steps:27 reward:27.0 average reward:62.10958904109589 recent reward:62.10958904109589
loss: 6.6924105
Episode:73 steps:22 reward:22.0 average reward:61.567567567567565 recent reward:61.567567567567565
loss: 6.6924105
Episode:74 steps:21 reward:21.0 average reward:61.026666666666664 recent reward:61.026666666666664
loss: 6.6924105
Episode:75 steps:21 reward:21.0 average reward:60.5 recent reward:60.5
loss: 6.6924105
Episode:76 steps:28 reward:28.0 average reward:60.077922077922075 recent reward:60.077922077922075
loss: 6.6924105
Episode:77 steps:21 reward:21.0 average reward:59.57692307692308 recent reward:59.57692307692308
loss: 6.6924105
Episode:78 steps:28 reward:28.0 average reward:59.177215189873415 recent reward:59.177215189873415
loss: 6.6924105
Episode:79 steps:41 reward:41.0 average reward:58.95 recent reward:58.95
loss: 6.6924105
Episode:80 st

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-7613b1dbfe4a>", line 115, in <module>
    Q = sess.run(Q_pre, feed_dict={x:s_t, dropout: 1})
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1152, in _run
    feed_dict_tensor, options, run_metadata)
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
    run_metadata)
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
    return fn(*args)
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/site-packages/tensor

KeyboardInterrupt: 

# 8

In [4]:
import sys
from pylab import *
# import seaborn as sns
import math
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import tensorflow as tf
import gym
import numpy as np
import tensorflow.contrib.layers as layers
from gym import wrappers


class Agent(object):
    def __init__(self, input_size=4, hidden_size=2, gamma=0.95,
                 action_size=2, lr=0.1, dir='tmp/trial/'):
        # call the cartpole simulator from OpenAI gym package
        self.env = gym.make('CartPole-v0')
        # If you wish to save the simulation video, simply uncomment the line below
        # self.env = wrappers.Monitor(self.env, dir, force=True, video_callable=self.video_callable)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.gamma = gamma
        self.action_size = action_size
        self.lr = lr
        # save the hyper parameters
        self.params = self.__dict__.copy()

        # inputs to the controller
        self.input_pl = tf.placeholder(tf.float32, [None, input_size])
        self.action_pl = tf.placeholder(tf.int32, [None])
        self.reward_pl = tf.placeholder(tf.float32, [None])

        # Here we use a single layered neural network as controller, which proved to be sufficient enough.
        # More complicated ones can be plugged in as well.
        # hidden_layer = layers.fully_connected(self.input_pl,
        #                                      hidden_size,
        #                                      biases_initializer=None,
        #                                      activation_fn=tf.nn.relu)
        # hidden_layer = layers.fully_connected(hidden_layer,
        #                                       hidden_size,
        #                                       biases_initializer=None,
        #                                       activation_fn=tf.nn.relu)
        self.output = layers.fully_connected(self.input_pl,
                                             action_size,
                                             biases_initializer=None,
                                             activation_fn=tf.nn.softmax)

        # responsible output
        self.one_hot = tf.one_hot(self.action_pl, action_size)
        self.responsible_output = tf.reduce_sum(self.output * self.one_hot, axis=1)

        # loss value of the network
        self.loss = -tf.reduce_mean(tf.log(self.responsible_output) * self.reward_pl)

        # get all network variables
        variables = tf.trainable_variables()
        self.variable_pls = []
        for i, var in enumerate(variables):
            self.variable_pls.append(tf.placeholder(tf.float32))

        # compute the gradient values
        self.gradients = tf.gradients(self.loss, variables)

        # update network variables
        solver = tf.train.AdamOptimizer(learning_rate=self.lr)
        # solver = tf.train.MomentumOptimizer(learning_rate=self.lr,momentum=0.95)
        self.update = solver.apply_gradients(zip(self.variable_pls, variables))


    def video_callable(self, episode_id):
        # display the simulation trajectory every 50 epoch
        return episode_id % 50 == 0

    def next_action(self, sess, feed_dict, greedy=False):
        """Pick an action based on the current state.
        Args:
        - sess: a tensorflow session
        - feed_dict: parameter for sess.run()
        - greedy: boolean, whether to take action greedily
        Return:
            Integer, action to be taken.
        """
        ans = sess.run(self.output, feed_dict=feed_dict)[0]
        if greedy:
            return ans.argmax()
        else:
            return np.random.choice(range(self.action_size), p=ans)

    def show_parameters(self):
        """Helper function to show the hyper parameters."""
        for key, value in self.params.items():
            print(key, '=', value)

def discounted_reward(rewards, gamma):
    """Compute the discounted reward."""
    ans = np.zeros_like(rewards)
    running_sum = 0
    # compute the result backward
    for i in reversed(range(len(rewards))):
        running_sum = running_sum * gamma + rewards[i]
        ans[i] = running_sum
    return ans

def one_trial(agent, sess, grad_buffer, reward_itr, i, render = True):
    '''
    this function does follow things before a trial is done:
    1. get a sequence of actions based on the current state and a given control policy
    2. get the system response of a given action
    3. get the instantaneous reward of this action
    once a trial is done:
    1. get the "long term" value of the controller
    2. get the gradient of the controller
    3. update the controller variables
    4. output the state history
    '''

    # reset the environment
    s = agent.env.reset()
    for idx in range(len(grad_buffer)):
        grad_buffer[idx] *= 0
    state_history = []
    reward_history = []
    action_history = []
    current_reward = 0

    while True:

        feed_dict = {agent.input_pl: [s]}
        # update the controller deterministically
        greedy = False
        # get the controller output under a given state
        action = agent.next_action(sess, feed_dict, greedy=greedy)
        # get the next states after taking an action
        snext, r, done, _ = agent.env.step(action)
        if render and i % 50 == 0:
            agent.env.render()
        current_reward += r
        state_history.append(s)
        reward_history.append(r)
        action_history.append(action)
        s = snext

        if done:

            # record how long it has been balancing when the simulation is done
            reward_itr += [current_reward]

            # get the "long term" rewards by taking decay parameter gamma into consideration
            rewards = discounted_reward(reward_history, agent.gamma)

            # normalizing the reward makes training faster
            rewards = (rewards - np.mean(rewards)) / np.std(rewards)

            # compute network gradients
            feed_dict = {
                agent.reward_pl: rewards,
                agent.action_pl: action_history,
                agent.input_pl: np.array(state_history)
            }
            episode_gradients = sess.run(agent.gradients,feed_dict=feed_dict)
            for idx, grad in enumerate(episode_gradients):
                grad_buffer[idx] += grad

            # apply gradients to the network variables
            feed_dict = dict(zip(agent.variable_pls, grad_buffer))
            sess.run(agent.update, feed_dict=feed_dict)

            # reset the buffer to zero
            for idx in range(len(grad_buffer)):
                grad_buffer[idx] *= 0
            break

    return state_history

def animate_itr(i,*args):
    '''animantion of each training epoch'''
    agent, sess, grad_buffer, reward_itr, sess, grad_buffer, agent, obt_itr, render = args
    #
    state_history = one_trial(agent, sess, grad_buffer, reward_itr, i, render)
    xlist = [range(len(reward_itr))]
    ylist = [reward_itr]
    for lnum, line in enumerate(lines_itr):
        line.set_data(xlist[lnum], ylist[lnum])  # set data for each line separately.

    if len(reward_itr) % obt_itr == 0:
        x_mag = 2.4
        y_mag = 30 * 2 * math.pi / 360
        # normalize to (-1,1)
        xlist = [np.asarray(state_history)[:,0] / x_mag]
        ylist = [np.asarray(state_history)[:,2] / y_mag]
        lines_obt.set_data(xlist, ylist)
        tau = 0.02
        time_text_obt.set_text('physical time = %6.2fs' % (len(xlist[0])*tau))

    return (lines_itr,) + (lines_obt,) + (time_text_obt,)


def get_fig(max_epoch):
    fig = plt.figure()
    ax_itr = axes([0.1, 0.1, 0.8, 0.8])
    ax_obt = axes([0.5, 0.2, .3, .3])

    # able to display multiple lines if needed
    global lines_obt, lines_itr, time_text_obt
    lines_itr = []
    lobj = ax_itr.plot([], [], lw=1, color="blue")[0]
    lines_itr.append(lobj)
    lines_obt = []

    ax_itr.set_xlim([0, max_epoch])
    ax_itr.set_ylim([0, 220])#([0, max_reward])
    ax_itr.grid(False)
    ax_itr.set_xlabel('trainig epoch')
    ax_itr.set_ylabel('reward')

    time_text_obt = []
    ax_obt.set_xlim([-1, 1])
    ax_obt.set_ylim([-1, 1])
    ax_obt.set_xlabel('cart position')
    ax_obt.set_ylabel('pole angle')
    lines_obt = ax_obt.plot([], [], lw=1, color="red")[0]
    time_text_obt = ax_obt.text(0.05, 0.9, '', fontsize=13, transform=ax_obt.transAxes)
    return fig, ax_itr, ax_obt, time_text_obt


def main():
    obt_itr = 10
    max_epoch = 3000
    # whether to show the pole balancing animation
    render = True
    dir = 'tmp/trial/'

    # set up figure for animation
    fig, ax_itr, ax_obt, time_text_obt = get_fig(max_epoch)
    agent = Agent(hidden_size=24, lr=0.2, gamma=0.95, dir=dir)
    agent.show_parameters()

    # tensorflow initialization for neural network controller
    tfconfig = tf.ConfigProto()
    tfconfig.gpu_options.allow_growth=True
    sess = tf.Session(config=tfconfig)
    tf.global_variables_initializer().run(session=sess)
    grad_buffer = sess.run(tf.trainable_variables())
    tf.reset_default_graph()

    global reward_itr
    reward_itr = []
    args = [agent, sess, grad_buffer, reward_itr, sess, grad_buffer, agent, obt_itr, render]
    # run the optimization and output animation
    ani = animation.FuncAnimation(fig, animate_itr,fargs=args)
    plt.show()

if __name__ == "__main__":
    main()

# Set up formatting for the movie files
# print('saving animation...')
# Writer = animation.writers['ffmpeg']
# writer = Writer(fps=100, metadata=dict(artist='Me'), bitrate=1800)

# https://raw.githubusercontent.com/hope-yao/cartpole/master/main.py

Instructions for updating:
Use tf.cast instead.
env = <TimeLimit<CartPoleEnv<CartPole-v0>>>
input_size = 4
hidden_size = 24
gamma = 0.95
action_size = 2
lr = 0.2


<Figure size 640x480 with 2 Axes>

# 9

In [1]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import gym
import numpy as np
import random as ran

env = gym.make('CartPole-v0')

# 꺼내서 사용할 리플레이 갯수
REPLAY = 10
# 리플레이를 저장할 리스트
REPLAY_MEMORY = []
# 미니배치
MINIBATCH = 50

INPUT = env.observation_space.shape[0]
OUTPUT = env.action_space.n

# 하이퍼파라미터
LEARNING_LATE = 0.1
NUM_EPISODE = 4000
e = 0.1
DISCOUNT = 0.9
rList = []

# 네트워크 클래스 구성
class DQN:
    def __init__(self, session, input_size, output_size, name="main"):
        
        # 네트워크 정보 입력
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        
        # 네트워크 생성
        self.build_network()

    def build_network(self, width = 10, L_rate = 1e-1):
        
        # 네트워크 구조
        self.x=tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])

        W1 = tf.get_variable('W1',shape=[self.input_size, width],initializer=tf.contrib.layers.xavier_initializer())
        W2 = tf.get_variable('W2',shape=[width, self.output_size],initializer=tf.contrib.layers.xavier_initializer())

        L1=tf.nn.tanh(tf.matmul(self.x,W1))

        self.Q_pre = tf.matmul(L1,W2)

        self.y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))

        # 손실 함수
        self.loss = tf.reduce_sum(tf.square(self.y - self.Q_pre))
        self.train = tf.train.AdamOptimizer(learning_rate=L_rate).minimize(self.loss)
    
    # 예측한 Q값 구하기
    def predict(self, state):
        s_t = np.reshape(state, [1,self.input_size])
        return self.session.run(self.Q_pre, feed_dict={self.x : s_t})
    # 네트워크 학습
    def update(self, x, y):
        self.session.run(self.train, feed_dict={self.x : x, self.y : y})

    # 미니배치를 이용한 학습
    @staticmethod
    def replay_train(DQN, replay_memory, replay):
        for sample in ran.sample(replay_memory, replay):
            s_r, a_r, r_r, s1_r, d_r = sample
            Q = DQN.predict(s_r)
            # DQN 알고리즘으로 학습
            if d_r:
                Q[0, a_r] = -100
            else:
                Q[0, a_r] = r_r + DISCOUNT * np.max(DQN.predict(s1_r))

            DQN.update(np.reshape(s_r, [1, DQN.input_size]), Q)
# 메인
def main():
    with tf.Session() as sess:
        # mainDQN 이라는 DQN 클래스 생성
        mainDQN = DQN(sess, INPUT, OUTPUT)
        
        # 변수 초기화
        sess.run(tf.global_variables_initializer())
        for step in range(NUM_EPISODE):

            s = env.reset()
            e = 1. / ((step/10)+1)
            rall = 0
            d = False
            count=0

            while not d and count < 5000:
                env.render()
                count+=1
                # e-greedy 를 사용하여 action값 구함
                if e > np.random.rand(1):
                    a = env.action_space.sample()
                else:
                    a = np.argmax(mainDQN.predict(s))

                # action을 취함
                s1, r, d, _ = env.step(a)

                # state, action, reward, next_state, done 을 메모리에 저장
                REPLAY_MEMORY.append([s,a,r,s1,d])

                # 메모리에 50000개 이상의 값이 들어가면 가장 먼저 들어간 것부터 삭제
                if len(REPLAY_MEMORY) > 50000:
                    del REPLAY_MEMORY[0]
                    
                rall += r
                s = s1
                
            # 10 번의 스탭마다 미니배치로 학습
            if step % 10 == 1 :
                for _ in range(MINIBATCH):
                    replay_train(mainDQN,REPLAY_MEMORY,REPLAY)

                

            rList.append(rall)
            print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(step, count, rall, np.mean(rList)))

if __name__ == '__main__':
    main()
    
    
# https://gist.githubusercontent.com/jcwleo/3e117a8413e271335277186a9ce449a8/raw/9478b2ab6acbbe9af45d2f3ef0d6b0458fca8a8a/CartPole_DQN_NIPS2013_Class.py


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Episode 0 finished after 11 timesteps with r=11.0. Running score: 11.0
Episode 1 finished after 40 timesteps with r=40.0. Running score: 25.5
Episode 2 finished after 23 timesteps with r=23.0. Running score: 24.666666666666668
Episode 3 finished after 24 timesteps with r=24.0. Running score: 24.5
Episode 4 finished after 32 timesteps with r=32.0. Running score: 26.0
Episode 5 finished after 29 timesteps with r=29.0. Running score: 26.5
Episode 6 finished after 57 timesteps with r=57.0. Running score: 30.857142857142858
Episode 7 finished after 20 timesteps with r=20.0. Running score: 29.5
Episode 8 finished after 42 timesteps with r=42.0. Running score: 30.88888888888889
Episode 9 fi

KeyboardInterrupt: 

# ADSFASDF

In [2]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import gym
import numpy as np
import random as ran
import threading

env = gym.make('CartPole-v0')

# 꺼내서 사용할 리플레이 갯수
REPLAY = 10
# 리플레이를 저장할 리스트
# REPLAY_MEMORY = []
# 미니배치
MINIBATCH = 50

INPUT = env.observation_space.shape[0]
OUTPUT = env.action_space.n

# 하이퍼파라미터
LEARNING_LATE = 0.1
NUM_EPISODE = 4000
e = 0.1
DISCOUNT = 0.9
# rList = []

THREAD_NUM = 2

class DQN_:
    def __init__(self):
        pass
    
    def train(self):
        Solver = [DQN_solver(idx) for idx in range(THREAD_NUM)]
        for dqn_agent in Solver:
            dqn_agent.start()

class DQN:
    def __init__(self, session, input_size, output_size, name="main"):
        
        # 네트워크 정보 입력
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self.loss_val = None
        
        # 네트워크 생성
        self.build_network()

    def build_network(self, width = 10, L_rate = 1e-1):
        
        # 네트워크 구조
        self.x=tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])

        W1 = tf.get_variable('W1',shape=[self.input_size, width],initializer=tf.contrib.layers.xavier_initializer())
        W2 = tf.get_variable('W2',shape=[width, self.output_size],initializer=tf.contrib.layers.xavier_initializer())

        L1=tf.nn.tanh(tf.matmul(self.x,W1))

        self.Q_pre = tf.matmul(L1,W2)

        self.y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))

        # 손실 함수
        self.loss = tf.reduce_sum(tf.square(self.y - self.Q_pre))
        self.train = tf.train.AdamOptimizer(learning_rate=L_rate).minimize(self.loss)
    
    # 예측한 Q값 구하기
    def predict(self, state):
        s_t = np.reshape(state, [1,self.input_size])
        return self.session.run(self.Q_pre, feed_dict={self.x : s_t})
    # 네트워크 학습
    def update(self, x, y):
        _, self.loss_val = self.session.run(self.train, feed_dict={self.x : x, self.y : y})


# 미니배치를 이용한 학습
def replay_train(DQN, replay_memory, replay):
    for sample in ran.sample(replay_memory, replay):
        s_r, a_r, r_r, s1_r, d_r = sample
        Q = DQN.predict(s_r)
        # DQN 알고리즘으로 학습
        if d_r:
            Q[0, a_r] = -100
        else:
            Q[0, a_r] = r_r + DISCOUNT * np.max(DQN.predict(s1_r))

        DQN.update(np.reshape(s_r, [1, DQN.input_size]), Q)
            
class DQN_solver(threading.Thread):
    def __init__(self, idx):
        threading.Thread.__init__(self)
        self.thread_id = idx
        self.REPLAY_MEMORY = []
        self.rList = []
    
    def run(self):
#         print("Hello World")
        
        with tf.Session() as sess:
            # mainDQN 이라는 DQN 클래스 생성
            mainDQN = DQN(sess, INPUT, OUTPUT)

            # 변수 초기화
            sess.run(tf.global_variables_initializer())
            for step in range(NUM_EPISODE):

                s = env.reset()
                e = 1. / ((step/10)+1)
                rall = 0
                d = False
                count=0

                while not d and count < 5000:
#                     env.render()
                    count+=1
                    # e-greedy 를 사용하여 action값 구함
                    if e > np.random.rand(1):
                        a = env.action_space.sample()
                    else:
                        a = np.argmax(mainDQN.predict(s))

                    # action을 취함
                    s1, r, d, _ = env.step(a)

                    # state, action, reward, next_state, done 을 메모리에 저장
                    self.REPLAY_MEMORY.append([s,a,r,s1,d])

                    # 메모리에 50000개 이상의 값이 들어가면 가장 먼저 들어간 것부터 삭제
                    if len(self.REPLAY_MEMORY) > 50000:
                        del self.REPLAY_MEMORY[0]

                    rall += r
                    s = s1

                # 10 번의 스탭마다 미니배치로 학습
                if step % 10 == 1 :
                    for _ in range(MINIBATCH):
                        replay_train(mainDQN, self.REPLAY_MEMORY,REPLAY)

                self.rList.append(rall)
                print(self.thread_id, "th thread- Episode {} finished after {} timesteps with r={}. Running score: {}".format(step, count, rall, np.mean(self.rList)))
                print("LOSS: ", (mainDQN.loss_val))
                print()

if __name__ == "__main__":
    dqn = DQN_()
    dqn.train()

1 th thread- Episode 0 finished after 26 timesteps with r=26.0. Running score: 26.0
LOSS:  None

0 th thread- Episode 0 finished after 14 timesteps with r=14.0. Running score: 14.0
LOSS:  None



Exception in thread Thread-7:
Traceback (most recent call last):
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-5d6fcaa6ee35>", line 140, in run
    replay_train(mainDQN, self.REPLAY_MEMORY,REPLAY)
  File "<ipython-input-2-5d6fcaa6ee35>", line 89, in replay_train
    DQN.update(np.reshape(s_r, [1, DQN.input_size]), Q)
  File "<ipython-input-2-5d6fcaa6ee35>", line 75, in update
    _, self.loss_val = self.session.run(self.train, feed_dict={self.x : x, self.y : y})
TypeError: cannot unpack non-iterable NoneType object

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/dohyungkwon/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-5d6fcaa6ee35>", line 140, in run
    replay_train(mainDQN, self.REPLAY_MEMORY,REPLAY)
  File "<ipython-input-2-5d6fcaa6ee35>", line 89, in replay_train
    DQN.update(np.reshape(s_r, [1, 