<a href="https://colab.research.google.com/github/Purushotham-Mani/CS238/blob/main/HighwayDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install environment and visualization dependencies
!pip install highway-env
!pip install tensorboardx gym pyvirtualdisplay
!apt-get install -y xvfb ffmpeg
# Environment
import gymnasium as gym
import highway_env

gym.register_envs(highway_env)

# Models and computation
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import namedtuple

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.notebook import trange
import base64
from pathlib import Path

from gymnasium.wrappers import RecordVideo
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

Collecting highway-env
  Downloading highway_env-1.10.1-py3-none-any.whl.metadata (16 kB)
Collecting gymnasium>=1.0.0a2 (from highway-env)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from highway-env)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading highway_env-1.10.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: farama-notifications, gymnasium, highway-env
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0 highway-env-1.10.1
Collecting tensorboardx
  Downloading tensorboardX-2.6.2.2-py2.py3-no

In [None]:
# Start a headless virtual display (pyvirtualdisplay) of 1400x900 so the
# environment can be rendered without a physical screen.
# NOTE(review): the name `display` shadows the `display()` helper that IPython
# injects into notebooks; the code below goes through `ipythondisplay.display`
# instead, so nothing visible here breaks, but avoid calling bare `display(...)`
# after this cell has run.
display = Display(visible=0, size=(1400, 900))
display.start()

def show_videos(path="videos"):
    """Embed every ``*.mp4`` file found in ``path`` into the notebook output.

    Each video is base64-encoded and inlined as an HTML ``<video>`` element,
    so the output does not depend on the files remaining on disk.

    Args:
        path: Directory that is searched (non-recursively) for ``*.mp4`` files.

    Returns:
        None. The videos are rendered through ``ipythondisplay.display``.
    """
    html = []
    # glob() yields files in arbitrary, filesystem-dependent order; sort so
    # that recordings of several episodes are always shown in the same order.
    for mp4 in sorted(Path(path).glob("*.mp4")):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append(
            """<video alt="{}" autoplay
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>""".format(
                mp4, video_b64.decode("ascii")
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

In [None]:
# Sanity check: record one episode driven by uniformly random actions.
env = gym.make("highway-v0",render_mode="rgb_array")
# Default Config: Observation: Kinematics, Actions: DiscreteMetaAction
# ACTIONS_ALL = {0: 'LANE_LEFT',1: 'IDLE',2: 'LANE_RIGHT',3: 'FASTER',4: 'SLOWER'}
# Observations : Vehicle x y vx vy / first row is always ego vehicle
env = RecordVideo(env, video_folder="./videos", episode_trigger=lambda e: True)
env.reset()
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    # An episode ends either on a terminal state or when the time limit
    # truncates it; checking only the first flag lets the rollout run past
    # the time limit.
    done = terminated or truncated
env.close()
show_videos()

In [None]:
class DQN(nn.Module):
  """Fully connected Q-network: two ReLU hidden layers and a linear output.

  The module also owns its Adam optimizer, its MSE loss and the device it
  lives on, so callers reach them as ``net.optimizer``, ``net.loss`` and
  ``net.device``.
  """

  def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
    """Create the layers, optimizer and loss, then move the net to its device.

    Args:
      lr: Learning rate handed to Adam.
      input_dims: Number of input features.
      fc1_dims: Width of the first hidden layer.
      fc2_dims: Width of the second hidden layer.
      n_actions: Number of outputs (one Q-value per action).
    """
    super().__init__()

    # Keep the layer sizes around as attributes.
    self.input_dims, self.n_actions = input_dims, n_actions
    self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims

    # Layers are created in fc1 -> fc2 -> fc3 order.
    self.fc1 = nn.Linear(input_dims, fc1_dims)
    self.fc2 = nn.Linear(fc1_dims, fc2_dims)
    self.fc3 = nn.Linear(fc2_dims, n_actions)

    self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
    self.loss = nn.MSELoss()

    # Prefer the first CUDA device when one is available.
    use_cuda = torch.cuda.is_available()
    self.device = torch.device('cuda:0' if use_cuda else 'cpu')
    print(self.device)
    self.to(self.device)

  def forward(self, state):
    """Return the Q-values of every action for ``state``."""
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)

In [None]:
class Agent():
  """Epsilon-greedy DQN agent with a fixed-size circular replay memory.

  Transitions are kept in pre-allocated numpy arrays that are overwritten
  oldest-first once ``max_mem_size`` entries have been stored. A single
  Q-network (``self.Q_fn``) is used both for the current estimate and for the
  bootstrap target; there is no separate target network.
  """

  def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
    """Set up hyper-parameters, the Q-network and the replay memory.

    Args:
      gamma: Discount factor applied to the bootstrap value.
      epsilon: Initial probability of taking a random action.
      lr: Learning rate forwarded to the Q-network.
      input_dims: Number of features of a (flattened) state.
      batch_size: Number of transitions sampled per training step.
      n_actions: Number of discrete actions.
      max_mem_size: Capacity of the replay memory.
      eps_end: Lower bound for epsilon.
      eps_dec: Amount subtracted from epsilon after every training step.
    """
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_min = eps_end
    self.eps_dec = eps_dec
    self.lr = lr
    self.action_space = list(range(n_actions))
    self.mem_size = max_mem_size
    self.batch_size = batch_size
    self.mem_cntr = 0  # total transitions ever stored (not capped at mem_size)

    self.Q_fn = DQN(self.lr, n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256)
    self.state_memory = np.zeros((self.mem_size, input_dims), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, input_dims), dtype=np.float32)
    self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
    self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

  def store_transition(self, state, action, reward, state_, done):
    """Write one transition into the replay memory.

    Args:
      state: State before the action; must be assignable to one row of
        ``state_memory`` (``input_dims`` values).
      action: Index of the action taken.
      reward: Reward received.
      state_: State after the action, same layout as ``state``.
      done: True when ``state_`` is terminal, i.e. no bootstrapping from it.
    """
    # Wrap around so the oldest entry is overwritten once the memory is full.
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.new_state_memory[index] = state_
    self.reward_memory[index] = reward
    self.action_memory[index] = action
    self.terminal_memory[index] = done

    self.mem_cntr += 1

  def choose_action(self, observation):
    """Pick an action epsilon-greedily.

    Args:
      observation: State features for a single step, as an array-like.

    Returns:
      The index of the chosen action: the arg-max of the network output with
      probability ``1 - epsilon``, a uniformly random action otherwise.
    """
    if np.random.random() > self.epsilon:
      # Cast to float32 so float64 observations do not clash with the
      # float32 network weights.
      state = torch.tensor(np.asarray(observation, dtype=np.float32)).to(self.Q_fn.device)
      # Pure inference: no autograd graph is needed for action selection.
      with torch.no_grad():
        actions = self.Q_fn.forward(state)
      action = torch.argmax(actions).item()
    else:
      action = np.random.choice(self.action_space)
    return action

  def train(self):
    """Run one gradient step on a random mini-batch, then decay epsilon.

    Does nothing until at least ``batch_size`` transitions have been stored.
    """
    if self.mem_cntr < self.batch_size:
      return

    self.Q_fn.optimizer.zero_grad()

    # Sample only from the part of the memory that has been filled.
    max_mem = min(self.mem_cntr, self.mem_size)
    batch = np.random.choice(max_mem, self.batch_size, replace=False)

    device = self.Q_fn.device
    state_batch = torch.tensor(self.state_memory[batch]).to(device)
    new_state_batch = torch.tensor(self.new_state_memory[batch]).to(device)
    reward_batch = torch.tensor(self.reward_memory[batch]).to(device)
    terminal_batch = torch.tensor(self.terminal_memory[batch]).to(device)
    action_batch = torch.tensor(self.action_memory[batch], dtype=torch.int64).to(device)

    # Q(s, a) for the action that was actually taken in each transition.
    q_eval = self.Q_fn.forward(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

    # The TD target is treated as a constant: computing it under no_grad keeps
    # loss.backward() from also pushing gradients through the bootstrap term.
    with torch.no_grad():
      q_next = self.Q_fn.forward(new_state_batch)  ## can implement target network
      # Terminal next-states contribute no future value.
      max_q_next = torch.max(q_next, dim=1)[0].masked_fill(terminal_batch, 0.0)
      q_target = reward_batch + self.gamma * max_q_next

    loss = self.Q_fn.loss(q_eval, q_target)
    loss.backward()
    self.Q_fn.optimizer.step()

    # Linear decay, clamped so epsilon never drops below eps_min.
    self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

In [None]:
# Train the DQN agent on highway-v0 (no rendering needed during training).
env = gym.make("highway-v0")
# Default Config: Observation: Kinematics, Actions: DiscreteMetaAction
# ACTIONS_ALL = {0: 'LANE_LEFT',1: 'IDLE',2: 'LANE_RIGHT',3: 'FASTER',4: 'SLOWER'}
# Observations : Vehicle x y vx vy / first row is always ego vehicle
agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=env.action_space.n, eps_end = 0.01, input_dims=np.prod(env.observation_space.shape), lr = 0.0001)
scores, eps_history = [], []
n_games = 190
# Resume from previously saved weights when a checkpoint is present.
# map_location lets a checkpoint written on GPU load on a CPU-only runtime.
if Path("./weights").exists():
  agent.Q_fn.load_state_dict(torch.load("./weights", map_location=agent.Q_fn.device, weights_only=True))
for i in range(n_games):
  done = False
  score = 0
  observation, info = env.reset()
  while not done:
    # The observation matrix is flattened to one row for the MLP.
    action = agent.choose_action(observation.reshape((1,-1)))
    observation_, reward, terminated, truncated, info = env.step(action)
    # The episode is over on a terminal state OR when the time limit
    # truncates it; ignoring `truncated` lets episodes run past the limit.
    done = terminated or truncated
    score += reward
    # Only true termination is stored as the terminal flag: a truncated
    # episode did not reach a terminal state, so its next state should still
    # be bootstrapped from.
    agent.store_transition(observation.reshape((1,-1)), action, reward, observation_.reshape((1,-1)), terminated)
    agent.train()
    observation = observation_
  scores.append(score)
  eps_history.append(agent.epsilon)

  # Average over (up to) the last 100 episodes.
  avg_score = np.mean(scores[-100:])
  print('episode ', i, 'score %.2f' % score, 'average score %.2f' % avg_score,'epsilon %.2f' % agent.epsilon)
env.close()


cuda:0
episode  0 score 4.33 average score 4.33 epsilon 1.00
episode  1 score 1.75 average score 3.04 epsilon 1.00
episode  2 score 5.28 average score 3.79 epsilon 1.00
episode  3 score 16.90 average score 7.07 epsilon 1.00
episode  4 score 6.46 average score 6.95 epsilon 1.00
episode  5 score 12.92 average score 7.94 epsilon 1.00
episode  6 score 1.87 average score 7.07 epsilon 1.00
episode  7 score 17.62 average score 8.39 epsilon 0.99
episode  8 score 44.32 average score 12.38 epsilon 0.96
episode  9 score 8.62 average score 12.01 epsilon 0.95
episode  10 score 12.58 average score 12.06 epsilon 0.94
episode  11 score 17.11 average score 12.48 epsilon 0.93
episode  12 score 3.57 average score 11.80 epsilon 0.93
episode  13 score 1.62 average score 11.07 epsilon 0.93
episode  14 score 6.11 average score 10.74 epsilon 0.92
episode  15 score 6.35 average score 10.46 epsilon 0.92
episode  16 score 2.60 average score 10.00 epsilon 0.92
episode  17 score 1.99 average score 9.56 epsilon 0.9

In [None]:
# Evaluate the trained agent greedily (no exploration) for one recorded episode.
env = gym.make("highway-v0",render_mode="rgb_array")
# Default Config: Observation: Kinematics, Actions: DiscreteMetaAction
# ACTIONS_ALL = {0: 'LANE_LEFT',1: 'IDLE',2: 'LANE_RIGHT',3: 'FASTER',4: 'SLOWER'}
# Observations : Vehicle x y vx vy / first row is always ego vehicle
env = RecordVideo(env, video_folder="./videos", episode_trigger=lambda e: True)
observation, info = env.reset()
done = False
score = 0
while not done:
    state = torch.tensor(np.array(observation.reshape((1,-1)))).to(agent.Q_fn.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        actions = agent.Q_fn.forward(state)
    action = torch.argmax(actions).item()
    obs, reward, terminated, truncated, info = env.step(action)
    # Stop on a terminal state or when the time limit truncates the episode.
    done = terminated or truncated
    score += reward
    observation = obs
print('score %.2f' % score)
env.close()
show_videos()

  logger.warn(


score 7.72


In [None]:
# Persist the Q-network parameters to "./weights"; the training cell loads
# this file again when it exists, so training can resume across sessions.
torch.save(agent.Q_fn.state_dict(),"./weights" )