In [None]:
!pip install highway_env

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
%matplotlib inline
import gymnasium as gym
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [None]:
class Memory:
    """Prioritized experience replay buffer backed by a ``SumTree``.

    Samples are opaque to this class; ``DQNAgent`` stores
    ``(state, action, reward, next_state, done)`` tuples.  Every sample is
    kept together with a priority ``(|error| + e) ** a``.
    """

    e = 0.01  # added to |error| so that every stored priority is > 0
    a = 0.8  # exponent applied to (|error| + e) when computing a priority
    beta = 0.3  # importance-sampling exponent, annealed towards 1 by sample()
    beta_increment_per_sampling = 0.0005  # amount added to beta per sample() call
    _max_resample = 8  # draws attempted per segment before giving up on p > 0

    def __init__(self, capacity):
        """Create a buffer that holds at most ``capacity`` samples."""
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        """Map a TD error to a strictly positive priority."""
        return (np.abs(error) + self.e) ** self.a

    def add(self, error, sample):
        """Store ``sample`` with the priority derived from ``error``."""
        self.tree.add(self._get_priority(error), sample)

    def sample(self, n):
        """Draw ``n`` samples, one from each of ``n`` equal priority segments.

        Returns:
            ``(batch, idxs, is_weight)`` where ``batch`` is the list of stored
            samples, ``idxs`` the matching tree indices (to be passed back to
            ``update``) and ``is_weight`` a NumPy array of importance-sampling
            weights normalised so that the largest weight is 1.

        Raises:
            ValueError: if ``n`` is not positive or the memory is empty.
        """
        if n <= 0:
            raise ValueError("n must be a positive integer")
        total = self.tree.total()
        if self.tree.n_entries == 0 or total <= 0:
            raise ValueError("cannot sample from an empty memory")

        # Anneal beta towards 1 (rebinding on the instance, not the class).
        self.beta = min(1.0, self.beta + self.beta_increment_per_sampling)

        segment = total / n
        batch, idxs, priorities = [], [], []
        for i in range(n):
            low, high = segment * i, segment * (i + 1)
            # A zero priority means the draw landed on a leaf that was never
            # written (possible through floating-point drift in the tree sums).
            # Such a leaf carries no sample and would give an infinite weight,
            # so draw again inside the same segment.
            for _ in range(self._max_resample):
                idx, p, data = self.tree.get(random.uniform(low, high))
                if p > 0:
                    break
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # Convert explicitly: dividing a plain list only works by accident
        # when the divisor happens to be a NumPy scalar.
        sampling_probabilities = np.asarray(priorities, dtype=float) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Replace the priority of tree node ``idx`` using a new ``error``."""
        self.tree.update(idx, self._get_priority(error))


class SumTree:
    """Binary sum tree stored in a flat array, used as a ring buffer of samples.

    ``tree`` has ``2 * capacity - 1`` nodes: the leaves occupy indices
    ``capacity - 1`` .. ``2 * capacity - 2`` and hold the priorities, every
    internal node holds the sum of its two children, and ``tree[0]`` is the
    total.  ``data[i]`` is the sample belonging to leaf ``i + capacity - 1``.
    """

    def __init__(self, capacity):
        """Allocate an empty tree for ``capacity`` samples.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError("capacity must be at least 1")
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.n_entries = 0
        # Next data slot to (over)write; kept per instance rather than on the class.
        self.write = 0

    # update to the root node
    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx``.

        Iterative so that ``idx == 0`` (a tree with a single node) is a no-op
        instead of walking to a bogus parent index of -1.
        """
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    # find sample on leaf node
    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf whose priority range contains ``s``."""
        size = len(self.tree)
        while True:
            left = 2 * idx + 1
            if left >= size:  # no children: idx is a leaf
                return idx
            right = left + 1
            # Never step into a right subtree with zero mass: floating-point
            # drift can make ``s`` marginally larger than the stored sums,
            # which would otherwise end on a leaf that was never written.
            if s <= self.tree[left] or self.tree[right] == 0:
                idx = left
            else:
                s -= self.tree[left]
                idx = right

    def total(self):
        """Return the sum of all priorities (the root node)."""
        return self.tree[0]

    # store priority and sample
    def add(self, p, data):
        """Store ``data`` with priority ``p``, overwriting the oldest slot when full."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        # Advance the ring-buffer cursor.
        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

        if self.n_entries < self.capacity:
            self.n_entries += 1

    # update priority
    def update(self, idx, p):
        """Set the priority of tree node ``idx`` to ``p`` and fix the sums above it."""
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    # get priority and sample
    def get(self, s):
        """Return ``(tree_index, priority, sample)`` for cumulative priority ``s``."""
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[data_idx])


class DQNAgent:
    """Epsilon-greedy DQN agent trained from a prioritized replay memory.

    States passed to the methods are expected to be shaped ``(1, state_size)``,
    which is how the training loop in this notebook reshapes observations.
    """

    def __init__(self, state_size, action_size):
        """Build the Q-network and an empty replay memory.

        Args:
            state_size: number of input features of the Q-network.
            action_size: number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.999    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.dqn_learning_rate = 0.001
        self.model = self._build_model()
        self.memory = Memory(1000000)  # PER Memory
        self.batch_size = 32

    def _build_model(self):
        """Return a compiled two-hidden-layer MLP mapping a state to Q-values."""
        model = Sequential()
        # Input width follows state_size instead of a hard-coded 25.
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.dqn_learning_rate))
        return model

    def _q_values(self, state):
        """Return the vector of Q-values predicted for a single ``state``."""
        return self.model.predict(state, verbose=0)[0]

    def _td_target(self, reward, next_state, done):
        """Return the one-step TD target; no bootstrapping once ``done``."""
        if done:
            return reward
        return reward + self.gamma * np.amax(self._q_values(next_state))

    def memorize(self, state, action, reward, next_state, done):
        """Store a transition, prioritized by its current TD error.

        The error is ``target - Q(state, action)``: it is built from Q-values
        of the action actually taken, not from arg-max action indices.
        """
        td_error = self._td_target(reward, next_state, done) - self._q_values(state)[action]
        self.memory.add(td_error, (state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily for ``state``."""
        if np.random.rand() <= self.epsilon:  # Exploration
            return random.randrange(self.action_size)
        return np.argmax(self._q_values(state))  # Exploitation

    def replay(self):
        """Fit the network on one prioritized batch and refresh its priorities.

        Each sampled transition is fitted separately, weighted by its
        importance-sampling weight; afterwards epsilon is decayed once.
        """
        batch, idxs, is_weight = self.memory.sample(self.batch_size)
        for idx, weight, transition in zip(idxs, is_weight, batch):
            state, action, reward, next_state, done = transition
            target = self._td_target(reward, next_state, done)
            target_f = self.model.predict(state, verbose=0)
            # Write the fresh TD error back so that sampling priorities track
            # the network instead of staying frozen at insertion time.
            self.memory.update(idx, target - target_f[0][action])
            target_f[0][action] = target
            # Gradient update, weighted by the importance-sampling weight.
            self.model.fit(state, target_f, epochs=1, verbose=0,
                           sample_weight=np.array([weight]))
        if self.epsilon > self.epsilon_min:  # Epsilon Update
            self.epsilon *= self.epsilon_decay

env = gym.make('highway-v0')
# Length of the flattened observation fed to the network (25 in the recorded
# run of this notebook); derived from the environment instead of hard-coded.
state_size = int(np.prod(env.observation_space.shape))
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
scores = []  # number of steps survived in each finished episode
EPISODES = 10000
MIN_REPLAY_SIZE = 1000  # transitions required in memory before training starts
TERMINAL_REWARD = -10   # reward substituted when an episode terminates

for e in range(EPISODES):
    state, _ = env.reset()
    print("The episode is", e)
    state = np.reshape(state, [1, -1])
    done = False
    steps = 0
    while not done:
        steps += 1
        action = agent.act(state)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Penalise only genuine termination (in highway-env presumably a crash
        # - TODO confirm); truncation is merely the time limit and must not be
        # punished like a failure.
        if terminated:
            reward = TERMINAL_REWARD
        next_state = np.reshape(next_state, [1, -1])
        # Store `terminated`, not `done`: a truncated episode's last state
        # still has future value, so the agent should bootstrap from it.
        agent.memorize(state, action, reward, next_state, terminated)
        state = next_state

    # Progress report every 100 episodes, based on the episodes finished so far.
    if e > 0 and e % 100 == 0:
        print("episode: {}/{}, Score Mean: {} / Median: {} ".format(e, EPISODES, int(np.mean(scores)), int(np.median(scores))))
        print("Beta {:.5f} / Eps: {:.5f}".format(agent.memory.beta, agent.epsilon))
    scores.append(steps)

    # One replay (training) pass per episode once enough experience is stored.
    if agent.memory.tree.n_entries > MIN_REPLAY_SIZE:
        agent.replay()

  logger.warn(
  super().__init__(name, **kwargs)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 136
(1, 25)
(1, 25)
(1, 25)
The episode is 137
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 138
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 139
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 140
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 141
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 142
(1, 25)
(1, 25)
(1, 25)
(1, 25)
(1, 25)
The episode is 143
(1, 25)
(1, 25)
(1, 25)