In [18]:



import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from collections import deque
import random



In [19]:
def plot_running_avg(totalrewards, window=100):
    """Plot the trailing running average of per-episode rewards.

    Args:
        totalrewards: 1-D sequence (list, tuple or NumPy array) of episode
            rewards in chronological order.
        window: Number of *preceding* episodes averaged together with the
            current one. Point ``t`` is the mean of
            ``totalrewards[max(0, t - window):t + 1]``, i.e. of up to
            ``window + 1`` values.
    """
    # Coerce to an array first: a slice of a plain list has no ``.mean()``.
    rewards = np.asarray(totalrewards, dtype=float)
    N = len(rewards)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = rewards[max(0, t - window):(t + 1)].mean()
    plt.plot(running_avg)
    plt.title("Running Average")
    plt.show()


In [20]:

class DQG_model:
    """Deep Q-learning agent with a replay buffer and a target network.

    Two identically shaped dense networks are kept. ``q_network`` is the one
    updated by ``retrain``; ``target_network`` supplies the bootstrap values
    and only changes when ``align_target_model`` copies the online weights
    into it.
    """

    def __init__(self, env, gamma=0.99, epsilon=0.1, learning_rate=0.005,
                 buffer_size=200000):
        """Build the online and target networks for ``env``.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Probability with which ``act`` returns a random action.
            learning_rate: Step size of the Adam optimizer.
            buffer_size: Maximum number of transitions kept in the replay buffer.
        """
        self.env = env
        self._state_size = env.observation_space.shape[0]
        self._action_size = env.action_space.n

        self.experience_replay = deque(maxlen=buffer_size)

        self._optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        # Discount and exploration rate
        self.gamma = gamma
        self.epsilon = epsilon

        # Build networks and start them from identical weights
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model()

        # Only the greedy forward pass is graph-compiled. The epsilon test
        # stays in eager Python (see ``act``) so that assigning a new value to
        # ``self.epsilon`` takes effect immediately instead of being frozen
        # into the traced graph as a constant.
        self._greedy_action = tf.function(self._greedy_action_eager)

    def store(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.experience_replay.append((state, action, reward, next_state, done))

    def _build_compile_model(self):
        """Return a compiled MLP mapping a state to one Q-value per action."""
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(20, input_dim=self._state_size, activation='relu'))
        # Input size is only meaningful on the first layer; later layers infer it.
        model.add(tf.keras.layers.Dense(50, activation='relu'))

        model.add(tf.keras.layers.Dense(self._action_size, activation='linear'))

        model.compile(loss='mse', optimizer=self._optimizer)

        return model

    def align_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    # Original (misspelled) name, kept so existing callers continue to work.
    alighn_target_model = align_target_model

    def _greedy_action_eager(self, state):
        """Return the index of the largest Q-value for the first row of ``state``."""
        q_values = self.q_network(state)
        return tf.argmax(q_values[0])

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Batch holding a single state; only row 0 is used for the
                greedy choice.

        Returns:
            A scalar ``int64`` tensor (call ``.numpy()`` to get the integer).
        """
        if tf.random.uniform([]) <= self.epsilon:
            return tf.random.uniform([], minval=0, maxval=self._action_size, dtype=tf.int64)

        return self._greedy_action(tf.cast(state, tf.float32))

    def retrain(self, batch_size):
        """Run one gradient step on a random minibatch from the replay buffer.

        Args:
            batch_size: Number of transitions to sample (without replacement).

        Raises:
            ValueError: If ``batch_size`` is not positive or exceeds the number
                of stored transitions.
        """
        available = len(self.experience_replay)
        if batch_size < 1 or batch_size > available:
            raise ValueError(
                "batch_size must be between 1 and the number of stored "
                "transitions (%d), got %r" % (available, batch_size))

        minibatch = random.sample(self.experience_replay, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Explicit dtypes: the buffer holds Python/NumPy scalars of mixed
        # types (e.g. float step rewards and an int terminal reward).
        states = tf.cast(tf.concat(states, axis=0), tf.float32)
        next_states = tf.cast(tf.concat(next_states, axis=0), tf.float32)
        actions = tf.constant(np.asarray(actions, dtype=np.int32))
        rewards = tf.constant(np.asarray(rewards, dtype=np.float32))
        not_done = 1.0 - tf.constant(np.asarray(dones, dtype=np.float32))

        # The TD target depends only on the target network, so it is computed
        # outside the tape.
        next_q = tf.reduce_max(self.target_network(next_states), axis=-1)
        updates = rewards + not_done * self.gamma * next_q
        indices = tf.stack([tf.range(batch_size), actions], axis=-1)

        with tf.GradientTape() as tape:
            q_values = self.q_network(states)

            # Target equals the current prediction everywhere except at the
            # action actually taken, so only that entry contributes error.
            target = tf.tensor_scatter_nd_update(
                tf.stop_gradient(q_values), indices, updates)

            # Per-sample losses; tape.gradient sums them over the batch.
            loss = tf.keras.losses.mean_squared_error(target, q_values)

        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self._optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))


In [21]:

def run_episode(env, model, max_iter=2000, batch_size=2000, done_penalty=-200,
                debug=None):
    """Play one episode, training ``model`` after each step.

    Args:
        env: Gym-style environment. Both the 4-tuple ``step`` / bare ``reset``
            API and the 5-tuple ``step`` / ``(obs, info)`` ``reset`` API are
            accepted.
        model: Agent providing ``act``, ``store``, ``retrain``,
            ``experience_replay`` and ``alighn_target_model``.
        max_iter: Upper bound on the number of steps in the episode.
        batch_size: Minibatch size passed to ``model.retrain``; training only
            starts once the buffer holds more than this many transitions.
        done_penalty: Reward substituted for the environment's reward on the
            step that ends the episode.
        debug: If true, print the step counter every step. ``None`` falls back
            to a module-level ``debug`` variable when one exists, else False.

    Returns:
        Sum of the rewards stored during the episode (this includes
        ``done_penalty`` when the environment signalled ``done``).
    """
    if debug is None:
        # Keep honouring a notebook-level ``debug`` flag, but do not fail with
        # a NameError when the cell defining it has not been run.
        debug = globals().get("debug", False)

    reset_out = env.reset()
    # Newer gym returns (observation, info); older versions return only the
    # observation.
    if isinstance(reset_out, tuple) and len(reset_out) == 2:
        reset_out = reset_out[0]
    state = np.asarray(reset_out).reshape((1, -1))

    count = 0
    totalreward = 0

    done = False
    while not done and count < max_iter:
        if debug:
            print("count =", count)
        action = model.act(state).numpy()  # get action

        step_out = env.step(action)  # get new state
        if len(step_out) == 5:
            next_state, reward, terminated, truncated, _ = step_out
            done = bool(terminated or truncated)
        else:
            next_state, reward, done, _ = step_out
        next_state = np.asarray(next_state).reshape((1, -1))

        # NOTE(review): the penalty is applied on any ``done``, which also
        # covers an episode cut off by a time limit - confirm that is intended.
        if done:
            reward = done_penalty

        model.store(state, action, reward, next_state, done)

        totalreward += reward

        if len(model.experience_replay) > batch_size:
            model.retrain(batch_size)

        count += 1
        state = next_state

    # Sync the target network once per episode.
    model.alighn_target_model()

    return totalreward


In [None]:
# Per-step logging switch; run_episode reads this module-level flag.
debug = 0

# Seed the RNGs used by the agent so a Restart & Run All starts from the same
# network weights and draws the same exploration / replay samples.
# The environment keeps its own RNG, which is not seeded here.
SEED = 0
random.seed(SEED)         # replay-buffer sampling (random.sample)
np.random.seed(SEED)      # in case a dependency draws from NumPy's global RNG
tf.random.set_seed(SEED)  # weight initialisation and tf.random.uniform in act()

env = gym.make('CartPole-v1')

model = DQG_model(env)
model.q_network.summary()


In [None]:

n_episodes = 50
total_rewards = np.empty(n_episodes)
for i in range(n_episodes):

    total_count = run_episode(env, model)
    total_rewards[i] = total_count

    # Plot only the episodes finished so far: total_rewards comes from
    # np.empty, so every slot after index i still holds uninitialised values.
    plt.plot(total_rewards[:i + 1])
    plt.title("Rewards")
    plt.xlabel("episode")
    plt.ylabel("total reward")
    plt.show()

    print("episode:", i, "total reward:", total_count)
# The slice takes at most the last 100 entries (all of them when n_episodes < 100).
print("avg reward for last 100 episodes:", total_rewards[-100:].mean())
