In [None]:
!pip install gym
!pip install highway_env

In [15]:
import highway_env
import gym
import numpy as np
import random
import keras
from keras import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from collections import deque

Dummy RL algorithm
class Model:
  """ Dummy code for an RL algorithm, which predicts an action from an observation,
  and update its model from observed transitions."""

  def predict(self, obs):
    return 0

  def update(self, obs, action, next_obs, reward, info, done):
    pass
model = Model()

In [16]:
class DQN:
    def __init__(self, env):
        self.env     = env
        self.memory  = deque(maxlen=2000)
        
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.01
        self.tau = .05
        self.model = self.create_model()
        self.target_model = self.create_model()
        print(self.env.observation_space.shape)
        
    def create_model(self):
        model   = Sequential()
        state_shape  = self.env.observation_space.shape
        model.add(Dense(128, input_dim=state_shape[1], activation="relu"))
        model.add(Dense(128, input_dim=state_shape[1], activation="relu"))
        model.add(Dense(64, input_dim=state_shape[1], activation="relu"))
        model.add(Dense(32, input_dim=state_shape[1], activation="relu"))
        model.add(Dense(self.env.action_space.n, activation="linear"))
        model.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])
    
    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size: 
            return
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(
                    self.target_model.predict(new_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)
    
    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i]
        self.target_model.set_weights(target_weights)
    
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])
    def save_model(self, fn):
        self.model.save(fn)
    def load_model(self, fn):
        self.model = keras.models.load_model(fn)

In [17]:
env = gym.make("intersection-v0")
# To increase collision reward
# env.configure({
#     "collision_reward": -3,
# })
s = env.reset()

# Code for Training the agent

In [None]:
model = DQN(env=env)

In [None]:
state = env.reset()

In [None]:
step = 0
while True and step < 100000:
    step += 1
    action = model.act(state)
    next_state, reward, done, info = env.step(action)
    env.render()
    print(step)
    
    model.remember(state, action, reward, next_state, done)
    model.replay()
    model.target_train()
    
    state = next_state
    if done:
        obs = env.reset()

In [None]:
model.save_model("Single_Agent_DQN_Scratch")

# Code for Training the agent

In [24]:
model = DQN(env = env)
model.load_model("Single_Agent_DQN_Scratch_5000_Increased_Collision_Reward")

(15, 7)


In [25]:
state = env.reset()

In [26]:
episode = 0
episode_reward = 0
episode_rewards = []
while True:
    action = model.act(state)
    next_state, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    state = next_state
    if done:
        episode += 1
        episode_rewards.append(episode_reward)
        print(("episode", episode, "reward", episode_reward))
        episode_reward = 0
        obs = env.reset()
    if episode == 100:
        break
avg_reward = sum(episode_rewards) / len(episode_rewards)
print(("average_reward", avg_reward))

('episode', 1, 'reward', 1.5498611712131654)
('episode', 2, 'reward', -2.4370451147124683)
('episode', 3, 'reward', 5.525487108524997)
('episode', 4, 'reward', 4.538831303257055)
('episode', 5, 'reward', 6.869101635648566)
('episode', 6, 'reward', -2.5116602437484765)
('episode', 7, 'reward', 9.616017393305233)
('episode', 8, 'reward', 8.24232670459421)
('episode', 9, 'reward', -1.379585246521927)
('episode', 10, 'reward', -1.3187193862696738)
('episode', 11, 'reward', 0.038538913300359745)
('episode', 12, 'reward', 10.0)
('episode', 13, 'reward', -2.3703484906885626)
('episode', 14, 'reward', 0.6175077255670267)
('episode', 15, 'reward', 5.6163432219928815)
('episode', 16, 'reward', 6.087579111093259)
('episode', 17, 'reward', -1.3727708302065347)
('episode', 18, 'reward', 0.70001307471657)
('episode', 19, 'reward', -2.3650567191435607)
('episode', 20, 'reward', 6.525826820326285)
('episode', 21, 'reward', -0.38116369941996364)
('episode', 22, 'reward', 7.540869946963048)
('episode', 

In [22]:
env.close()