In [None]:
import numpy as np
import pandas as pd
from typing import Dict, List, Text

import os
import copy
import random
import time
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

# import tensorflow
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Flatten
# from tensorflow.keras.optimizers import Adam

from typing import Tuple
from highway_env import utils
from highway_env.envs.common.abstract import AbstractEnv
from highway_env.envs.common.action import Action
from highway_env.road.road import Road, RoadNetwork
from highway_env.vehicle.controller import ControlledVehicle
from highway_env.vehicle.kinematics import Vehicle
from highway_env.vehicle.behavior import IDMVehicle
from highway_env.vehicle.kinematics import Vehicle

# Type alias: observations are handled as plain NumPy arrays.
Observation = np.ndarray

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load the observation CSV into a module-level DataFrame at import time.
# NOTE(review): the path is hard-coded and starts with a double slash — confirm this is intended.
dataset = pd.read_csv("//content/DR_LaneChange_ET0_obs.csv")

class AbstractEnv:
    """Minimal base class that only supplies an empty default configuration.

    Defining this name here rebinds ``AbstractEnv`` for the rest of the module,
    replacing the class of the same name imported from ``highway_env``.
    """

    @classmethod
    def default_config(cls) -> Dict:
        """Return a new, empty configuration dictionary."""
        base_config: Dict = dict()
        return base_config


class HighwayEnv(AbstractEnv):
    def __init__(self, dataset: pd.DataFrame, config: Dict):
        """Store the dataset and build the effective configuration.

        :param dataset: DataFrame the vehicles are created from
        :param config: settings applied on top of ``default_config()``; may be None or partial
        """
        self.dataset = dataset
        self.agent_id = 0  # ID of the controlled agent (ego agent)
        self.controlled_vehicle = None  # The controlled vehicle (ego agent)
        # Start from the class defaults so that keys read later on (for example
        # "normalize_reward", "collision_reward" or "reward_speed_range") exist
        # even when the caller passes only a few settings. Building a new dict
        # also leaves the caller's dictionary untouched.
        self.config = {**self.default_config(), **(config or {})}
        self.np_random = np.random.RandomState()


    @classmethod
    def default_config(cls) -> Dict:
        """Return the parent class defaults updated with this environment's settings."""
        overrides = {
            "observation": {"type": "Kinematics"},
            "action": {"type": "DiscreteMetaAction"},
            "lanes_count": 4,
            "vehicles_count": 50,
            "controlled_vehicles": 1,
            "initial_lane_id": None,
            "duration": 40,  # [s]
            "ego_spacing": 2,
            "vehicles_density": 1,
            "collision_reward": -1,
            "right_lane_reward": 0.1,
            "high_speed_reward": 0.4,
            "lane_change_reward": 0,
            "reward_speed_range": [20, 30],
            "normalize_reward": True,
            "offroad_terminal": False,
            "show_trajectories": False,
        }
        merged = super().default_config()
        merged.update(overrides)
        return merged

    def _reset(self) -> Observation:
        """Rebuild the road and the vehicles, then return the ego vehicle's state.

        :return: array with two leading singleton axes followed by the five
            features x, y, vx, vy and heading of the controlled vehicle
        """
        self._create_road()
        self._create_vehicles()
        ego = self.controlled_vehicle
        features = [
            ego.position[0],
            ego.position[1],
            ego.velocity[0],
            ego.velocity[1],
            ego.heading,
        ]
        # Prepend the batch axis and the channel axis in one indexing step.
        return np.array(features)[np.newaxis, np.newaxis, ...]

    def _step(self, action: Action) -> Tuple[Observation, float, bool, Dict]:
        """Apply ``action`` through the controlled vehicle and return the transition.

        :param action: the action forwarded to ``controlled_vehicle.step``
        :return: tuple ``(next_state, reward, done, info)``
        """
        # Perform the given action in the environment and get the next state, reward, done flag, and info
        # NOTE(review): this unpacks four values from ``controlled_vehicle.step``; a
        # highway_env vehicle's ``step`` presumably takes a time step and returns
        # nothing — confirm, and if so this line raises at the first call.
        next_state, reward, done, info = self.controlled_vehicle.step(action)

        # Preprocess the next state
        # NOTE(review): ``_preprocess_state`` is not defined in the visible part of this class — verify it exists.
        next_state = self._preprocess_state(next_state)

        return next_state, reward, done, info

    def step(self, action: Action) -> Tuple[Observation, float, bool, Dict]:
        """Public entry point for one transition; delegates to ``_step``."""
        transition = self._step(action)
        return transition


    def _create_road(self) -> None:
        """Build ``self.road`` as a straight road with "lanes_count" adjacent lanes."""
        lane_total = self.config["lanes_count"]
        straight_network = RoadNetwork.straight_road_network(lane_total, speed_limit=30)
        self.road = Road(
            network=straight_network,
            np_random=self.np_random,
            record_history=False,
        )

    def _create_vehicles(self) -> None:
        """Create the ego vehicle and the surrounding vehicles from ``self.dataset``.

        Only rows with ``track_to_predict == 1`` are used. Among them, the first
        row with ``interesting_agent == 1`` initialises the controlled vehicle and
        every row with ``interesting_agent == 0`` becomes an ``IDMVehicle``.

        :raises ValueError: if no row describes the controlled vehicle
        """
        # The DataFrame handed to the constructor is used as is; re-reading a
        # hard-coded CSV path here would ignore it and hit the disk on every reset.
        self.agent_id = 1  # ID of the controlled agent (ego agent)
        agent_data = self.dataset[self.dataset["track_to_predict"] == self.agent_id]

        # Initialize the controlled vehicle (ego agent)
        controlled_vehicle_data = agent_data[agent_data["interesting_agent"] == 1]
        if controlled_vehicle_data.empty:
            raise ValueError(
                "dataset has no row with track_to_predict == {} and interesting_agent == 1".format(
                    self.agent_id
                )
            )
        ego_row = controlled_vehicle_data.iloc[0]
        self.controlled_vehicle = ControlledVehicle(
            road=self.road,
            position=[ego_row["x"], ego_row["y"]],
            heading=ego_row["psi_rad"],
            # Start at the recorded speed, exactly as the other vehicles below do,
            # instead of silently starting at the constructor default.
            speed=np.linalg.norm([ego_row["vx"], ego_row["vy"]]),
        )
        self.road.vehicles.append(self.controlled_vehicle)

        # Create non-controlled vehicles (other agents) from the dataset
        # NOTE(review): every matching row becomes its own vehicle; if the dataset
        # holds one row per frame of a track, this spawns one vehicle per frame —
        # confirm whether only one row per track should be used.
        other_vehicles_data = agent_data.loc[agent_data["interesting_agent"] == 0]
        state_columns = ["x", "y", "vx", "vy", "psi_rad"]
        for x, y, vx, vy, psi_rad in other_vehicles_data[state_columns].itertuples(
            index=False, name=None
        ):
            vehicle = IDMVehicle(
                road=self.road,
                position=[x, y],
                speed=np.linalg.norm([vx, vy]),
                heading=psi_rad,
            )
            self.road.vehicles.append(vehicle)


    def _reward(self, action: Action) -> float:
        """
        Combine the individual reward terms into a single scalar.

        Each term from ``_rewards`` is weighted by the config entry of the same
        name (weight 0 when absent). When "normalize_reward" is set, the sum is
        mapped linearly from [collision_reward, high_speed_reward + right_lane_reward]
        onto [0, 1]. The result is finally multiplied by the on-road term.

        :param action: the last action performed
        :return: the corresponding reward
        """
        components = self._rewards(action)
        total = sum(self.config.get(key, 0) * value for key, value in components.items())
        if self.config["normalize_reward"]:
            worst = self.config["collision_reward"]
            best = self.config["high_speed_reward"] + self.config["right_lane_reward"]
            total = utils.lmap(total, [worst, best], [0, 1])
        return total * components['on_road_reward']

    def _rewards(self, action: Action) -> Dict[Text, float]:
        """Compute the individual, unweighted reward terms for the controlled vehicle.

        :param action: the last action performed (not used by the computation)
        :return: mapping from reward name to its value
        """
        ego = self.controlled_vehicle
        side_lanes = self.road.network.all_side_lanes(ego.lane_index)
        lane_id = ego.target_lane_index[2]
        # Use forward speed rather than speed, see https://github.com/eleurent/highway-env/issues/268
        speed_along_road = ego.speed * np.cos(ego.heading)
        speed_ratio = utils.lmap(speed_along_road, self.config["reward_speed_range"], [0, 1])

        terms = {}
        terms["collision_reward"] = float(ego.crashed)
        terms["right_lane_reward"] = lane_id / max(len(side_lanes) - 1, 1)
        terms["high_speed_reward"] = np.clip(speed_ratio, 0, 1)
        terms["on_road_reward"] = float(ego.on_road)
        return terms




class Net(nn.Module):
    """Fully connected Q-network: one output value per action."""

    def __init__(self, state_dim, action_dim):
        """
        :param state_dim: int, number of features of one flattened state
        :param action_dim: int, number of actions (size of the output)
        """
        super(Net, self).__init__()
        hidden_nodes1 = 128
        hidden_nodes2 = 64
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(state_dim, hidden_nodes1)
        self.fc2 = nn.Linear(hidden_nodes1, hidden_nodes2)
        self.fc3 = nn.Linear(hidden_nodes2, action_dim)

    def forward(self, state):
        """
        :param state: tensor whose first axis is the batch; all remaining axes
            are flattened into the feature axis
        :return: tensor of shape (batch, action_dim)
        """
        # The flattened tensor must be what enters the first layer. Feeding the
        # raw ``state`` instead keeps any extra singleton axes, so the output
        # is no longer (batch, action_dim).
        x = self.flatten(state)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        out = self.fc3(x)
        return out

# def build_model(states, actions):
#     model = Sequential()
#     model.add(Dense(24, activation='relu', input_shape=states))
#     model.add(Dense(24, activation='relu'))
#     model.add(Dense(actions, activation='linear'))
#     return model

# model = build_model(states, actions)

# model.summary()


class DOUBLEDQN_CNN(nn.Module):
    def __init__(
        self,
        env,
        state_dim,
        action_dim,
        lr=0.001,
        gamma=0.99,
        batch_size=5,
        timestamp="",
    ):
        """
        :param env: object, a gym environment
        :param state_dim: int, size of state space
        :param action_dim: int, size of action space
        :param lr: float, learning rate
        :param gamma: float, discount factor
        :param batch_size: int, batch size for training
        """
        super(DOUBLEDQN_CNN, self).__init__()

        self.env = env
        # self.env = HighwayEnv(dataset, config={})  # Instantiate HighwayEnv with the dataset
        self.env._reset()
        self.timestamp = timestamp

        self.test_env = copy.deepcopy(env)  # for evaluation purpose
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.batch_size = batch_size
        self.learn_step_counter = 0

        # Pass the dataset to the HighwayEnv class
        # self.highway_env = HighwayEnv(dataset, config={})  # Adjust the config as needed

        self.target_net = Net(self.state_dim, self.action_dim).to(device)
        self.estimate_net = Net(self.state_dim, self.action_dim).to(device)
        # self.ReplayBuffer = Replay(1000, 100, self.state_dim, self.action_dim)

        self.optimizer = torch.optim.Adam(self.estimate_net.parameters(), lr=lr)

    def update_target_networks(self):
        """
        A function to update the target networks
        """
        self.target_net.load_state_dict(self.estimate_net.state_dict())
    def choose_action(self, state, epsilon=0.9):
        state = np.expand_dims(state, axis=0)
        state = np.expand_dims(state, axis=0)
        state = torch.FloatTensor(state).to(device)
        if np.random.randn() <= epsilon:
            action_value = self.estimate_net(state)
            action = torch.argmax(action_value).item()
        else:
            action = np.random.randint(0, self.action_dim)
        return action
    def preprocess_dataset(self, dataset):
        """
        Preprocess the dataset to make it compatible with the highway_env

        :param dataset: pandas DataFrame, input dataset
        :return: numpy array, preprocessed dataset
        """
        # Extract relevant columns
        PIXEL_SCALE = 4
        dataset = dataset[
            [
                "frame_id",
                "timestamp_ms",
                "x",
                "y",
                "vx",
                "vy",
                "psi_rad",
                "length",
                "width",
                "agent_type",
                "interesting_agent",
            ]
        ]

        # Convert positions from meters to pixels
        dataset["x"] = dataset["x"] * PIXEL_SCALE
        dataset["y"] = dataset["y"] * PIXEL_SCALE

        # Normalize positions, velocities, yaw angle, length, and width
        dataset["x"] = (dataset["x"] - dataset["x"].mean()) / dataset["x"].std()
        dataset["y"] = (dataset["y"] - dataset["y"].mean()) / dataset["y"].std()
        dataset["vx"] = (dataset["vx"] - dataset["vx"].mean()) / dataset["vx"].std()
        dataset["vy"] = (dataset["vy"] - dataset["vy"].mean()) / dataset["vy"].std()
        dataset["psi_rad"] = (
            dataset["psi_rad"] - dataset["psi_rad"].mean()
        ) / dataset["psi_rad"].std()
        dataset["length"] = (
            dataset["length"] - dataset["length"].mean()
        ) / dataset["length"].std()
        dataset["width"] = (dataset["width"] - dataset["width"].mean()) / dataset[
            "width"
        ].std()
        # Encode agent types using one-hot encoding
        agent_types = pd.get_dummies(dataset["agent_type"], prefix="agent_type")
        dataset = pd.concat([dataset, agent_types], axis=1)

        dataset.drop(columns=["agent_type"], inplace=True)

        return dataset.to_numpy()

    def train(self, num_epochs, dataset):
        """
        Train the policy with Double DQN, running one episode per epoch

        Every 40 epochs the averaged loss and evaluation reward are logged and
        the results are saved into the ``self.timestamp`` folder.

        :param num_epochs: int, number of epochs to train the policy for
        :param dataset: pandas DataFrame, input dataset; kept for interface
            compatibility, the transitions used for learning come from ``self.env``
        :return: tuple (loss_list, avg_reward_list), one entry per logging period
        """
        period = 40
        # An empty timestamp means "current directory" rather than the file system root.
        out_dir = self.timestamp or "."

        loss_list = []
        avg_reward_list = []
        epoch_reward = 0
        epochs_in_window = 0

        for epoch in range(int(num_epochs)):
            # Each epoch is a new episode, so it has to start from a fresh
            # environment state instead of continuing after ``done``.
            state = self.env._reset()
            done = False
            avg_loss = 0
            step = 0
            while not done:
                step += 1
                action = self.choose_action(state)
                state_next, reward, done, _ = self.env.step(action)
                self.ReplayBuffer.buffer_add(
                    {
                        "state": state,
                        "action": action,
                        "reward": reward,
                        "state_next": state_next,
                        "done": done,
                    }
                )
                state = state_next
                avg_loss += self._learn_from_replay()

            reward, count = self.eval()
            epoch_reward += reward
            epochs_in_window += 1

            # save
            if epoch % period == 0:
                # log
                avg_loss /= step
                # Average over the epochs actually accumulated; the first
                # window holds a single epoch, later ones hold ``period``.
                epoch_reward /= epochs_in_window
                avg_reward_list.append(epoch_reward)
                loss_list.append(avg_loss)

                print(
                    "\nepoch: [{}/{}], \tavg loss: {:.4f}, \tavg reward: {:.3f}, \tsteps: {}".format(
                        epoch + 1, num_epochs, avg_loss, epoch_reward, count
                    )
                )

                epoch_reward = 0
                epochs_in_window = 0
                # create the output directory if needed; real errors are not hidden
                os.makedirs(out_dir, exist_ok=True)
                np.save(os.path.join(out_dir, "double_dqn_cnn_loss.npy"), loss_list)
                np.save(
                    os.path.join(out_dir, "double_dqn_cnn_avg_reward.npy"),
                    avg_reward_list,
                )
                torch.save(
                    self.estimate_net.state_dict(),
                    os.path.join(out_dir, "double_dqn_cnn.pkl"),
                )

        # Not every environment object offers ``close``; call it only when present
        # so a finished training run still returns its results.
        close = getattr(self.env, "close", None)
        if callable(close):
            close()
        return loss_list, avg_reward_list

    def _learn_from_replay(self):
        """Run one Double DQN gradient step on a sampled batch and return the loss value."""
        # sample random batch from replay memory
        exp_batch = self.ReplayBuffer.buffer_sample(self.batch_size)

        def stack_states(key):
            # One flattened row per transition: the networks then produce a
            # (batch, action_dim) output, which ``gather(1, ...)`` below needs.
            rows = np.stack(
                [np.asarray(exp[key], dtype=np.float32).reshape(-1) for exp in exp_batch]
            )
            return torch.from_numpy(rows).to(device)

        def as_column(values, dtype):
            return torch.tensor(values, dtype=dtype, device=device).unsqueeze(1)

        state_batch = stack_states("state")
        state_next_batch = stack_states("state_next")
        action_batch = as_column([exp["action"] for exp in exp_batch], torch.long)
        reward_batch = as_column([exp["reward"] for exp in exp_batch], torch.float32)
        # 1 for transitions that continue, 0 for terminal ones
        not_done_batch = as_column(
            [float(not exp["done"]) for exp in exp_batch], torch.float32
        )

        # get estimate Q value
        estimate_Q = self.estimate_net(state_batch).gather(1, action_batch)

        # get target Q value: the estimate network picks the action, the target
        # network evaluates it. No gradient is needed for this branch.
        with torch.no_grad():
            max_action_idx = self.estimate_net(state_next_batch).argmax(1, keepdim=True)
            next_Q = self.target_net(state_next_batch).gather(1, max_action_idx)
            target_Q = reward_batch + not_done_batch * self.gamma * next_Q

        # compute mse loss
        loss = F.mse_loss(estimate_Q, target_Q)

        # update network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if self.learn_step_counter % 100 == 0:
            self.update_target_networks()
        self.learn_step_counter += 1

        return loss.item()


    def eval(self):
        """
        Evaluate the policy by running one episode on ``self.test_env``

        Actions are requested from ``choose_action`` with ``epsilon=1``.
        Because this class derives from ``nn.Module``, this method replaces the
        module's own ``eval()``; it does not switch any layer mode.

        :return: tuple (total_reward, count): summed reward and number of steps
        """
        count = 0
        total_reward = 0
        done = False
        # Reset through ``_reset`` exactly like ``__init__`` and ``train`` do: it
        # returns the bare state array this loop expects.
        state = self.test_env._reset()

        while not done:
            action = self.choose_action(state, epsilon=1)
            state, reward, done, _ = self.test_env.step(action)
            total_reward += reward
            count += 1

        return total_reward, count


if __name__ == "__main__":
    # Read the dataset from the CSV file
    dataset = pd.read_csv("/content/DR_LaneChange_ET0_obs.csv")

    # Timestamp for saving: the folder is named "<month><day>_<hour>_<minute>",
    # e.g. "1209_20_36" -> December 9th, 20:36
    named_tuple = time.localtime()  # get struct_time
    time_string = time.strftime("%m%d_%H_%M", named_tuple)
    # Everything below is written into this folder, so make sure it exists.
    os.makedirs(time_string, exist_ok=True)

    myobjec = HighwayEnv(dataset=dataset, config={"lanes_count": 4})
    double_dqn_cnn_object = DOUBLEDQN_CNN(
        myobjec,
        # HighwayEnv._reset returns five features (x, y, vx, vy, heading); the
        # network input size has to match that.
        state_dim=5,
        action_dim=4,  # Adjust action dimension according to your problem
        lr=0.001,
        gamma=0.99,
        batch_size=64,
        timestamp=time_string,
    )

    # Train the policy
    iterations = 4000
    print(dataset[0:10])
    avg_loss, avg_reward_list = double_dqn_cnn_object.train(iterations, dataset)
    np.save(time_string + "/double_dqn_cnn_loss.npy", avg_loss)
    np.save(time_string + "/double_dqn_cnn_avg_reward.npy", avg_reward_list)

    # Save the DQN network
    torch.save(
        double_dqn_cnn_object.estimate_net.state_dict(),
        time_string + "/double_dqn_cnn.pkl",
    )

    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(avg_loss)
    plt.grid()
    plt.title("Double DQN Loss")
    # train() logs one loss value every 40 epochs, same as the reward curve below.
    plt.xlabel("*40 Epochs")
    plt.ylabel("Loss")
    plt.savefig("double_dqn_loss.png", dpi=150)
    plt.show()

    plt.figure(figsize=(10, 6))
    plt.plot(avg_reward_list)
    plt.grid()
    plt.title("Double DQN Training Reward")
    plt.xlabel("*40 Epochs")
    plt.ylabel("Reward")
    plt.savefig(time_string + "/double_dqn_cnn_train_reward.png", dpi=150)
    plt.show()

cpu
0.0    13410
1.0     3420
Name: interesting_agent, dtype: int64
       case_id  track_id  frame_id  timestamp_ms agent_type         x  \
81         1.0         8         1           100        car  1060.560   
82         1.0         8         2           200        car  1059.611   
83         1.0         8         3           300        car  1058.662   
84         1.0         8         4           400        car  1057.712   
85         1.0         8         5           500        car  1056.762   
...        ...       ...       ...           ...        ...       ...   
24272    342.0         6         6           600        car  1035.409   
24273    342.0         6         7           700        car  1034.923   
24274    342.0         6         8           800        car  1034.435   
24275    342.0         6         9           900        car  1033.943   
24276    342.0         6        10          1000        car  1033.449   

             y     vx     vy  psi_rad  length  width  t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["x"] = dataset["x"] * PIXEL_SCALE
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["y"] = dataset["y"] * PIXEL_SCALE
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["x"] = (dataset["x"] - dataset["x"].mean()) / dataset["x"].std()
A value is trying to be set on a copy of a slice

RuntimeError: ignored

In [None]:
pip install highway_env

Collecting highway_env
  Downloading highway_env-1.8.2-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium>=0.27 (from highway_env)
  Downloading gymnasium-0.29.0-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.8/953.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium>=0.27->highway_env)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, highway_env
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.0 highway_env-1.8.2


In [None]:
import numpy as np
import pandas as pd
from typing import Dict, List, Text

import os
import copy
import random
import time
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from typing import Tuple
from highway_env import utils
from highway_env.envs.common.abstract import AbstractEnv  # Update import statement
from highway_env.envs.common.action import Action
from highway_env.road.road import Road, RoadNetwork
from highway_env.vehicle.controller import ControlledVehicle
from highway_env.vehicle.behavior import IDMVehicle

# Type alias: observations are handled as plain NumPy arrays.
Observation = np.ndarray

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load the observation CSV into a module-level DataFrame at import time.
# NOTE(review): the path is hard-coded and starts with a double slash — confirm this is intended.
dataset = pd.read_csv("//content/DR_LaneChange_ET0_obs.csv")

class HighwayEnv(AbstractEnv):
    def __init__(self, dataset: pd.DataFrame, config: Dict):
        """Store the dataset and build the effective configuration.

        :param dataset: DataFrame the vehicles are created from
        :param config: settings applied on top of ``default_config()``; may be None or partial
        """
        self.dataset = dataset
        self.agent_id = 0  # ID of the controlled agent (ego agent)
        self.controlled_vehicle = None  # The controlled vehicle (ego agent)
        # Layer the caller's settings over the class defaults. This supplies
        # "lanes_count" as well as the reward-related keys that ``_reward`` and
        # ``_rewards`` index directly, and it leaves the caller's dict unmodified.
        self.config = {**self.default_config(), **(config or {})}
        self.np_random = np.random.RandomState()

    @classmethod
    def default_config(cls) -> Dict:
        """Return the parent class defaults updated with this environment's settings."""
        overrides = {
            "lanes_count": 4,
            "observation": {"type": "Kinematics"},
            "action": {"type": "DiscreteMetaAction"},
            "vehicles_count": 50,
            "controlled_vehicles": 1,
            "initial_lane_id": None,
            "duration": 40,  # [s]
            "ego_spacing": 2,
            "vehicles_density": 1,
            "collision_reward": -1,
            "right_lane_reward": 0.1,
            "high_speed_reward": 0.4,
            "lane_change_reward": 0,
            "reward_speed_range": [20, 30],
            "normalize_reward": True,
            "offroad_terminal": False,
            "show_trajectories": False,
        }
        merged = super().default_config()
        merged.update(overrides)
        return merged

    def _reset(self) -> Observation:
        """Recreate road and vehicles and return the controlled vehicle's state.

        :return: array with two leading singleton axes followed by the five
            features x, y, vx, vy and heading of the controlled vehicle
        """
        self._create_road()
        self._create_vehicles()
        vehicle = self.controlled_vehicle
        flat_state = np.array(
            [
                vehicle.position[0],
                vehicle.position[1],
                vehicle.velocity[0],
                vehicle.velocity[1],
                vehicle.heading,
            ]
        )
        # Batch axis first, then the channel axis, both of size one.
        return flat_state[np.newaxis, np.newaxis, ...]

    def _step(self, action: Action) -> Tuple[Observation, float, bool, Dict]:
        """Apply ``action`` through the controlled vehicle and return the transition.

        :param action: the action forwarded to ``controlled_vehicle.step``
        :return: tuple ``(next_state, reward, done, info)``
        """
        # Perform the given action in the environment and get the next state, reward, done flag, and info
        # NOTE(review): this unpacks four values from ``controlled_vehicle.step``; a
        # highway_env vehicle's ``step`` presumably takes a time step and returns
        # nothing — confirm, and if so this line raises at the first call.
        next_state, reward, done, info = self.controlled_vehicle.step(action)

        # Preprocess the next state
        # NOTE(review): ``_preprocess_state`` is not defined in the visible part of this class — verify it exists.
        next_state = self._preprocess_state(next_state)

        return next_state, reward, done, info

    def step(self, action: Action) -> Tuple[Observation, float, bool, Dict]:
        """Advance the environment by one action; thin wrapper around ``_step``."""
        outcome = self._step(action)
        return outcome


    def _create_road(self) -> None:
        """Build ``self.road`` as a straight road of adjacent lanes.

        The lane count comes from the "lanes_count" setting and falls back to 4.
        """
        straight_network = RoadNetwork.straight_road_network(
            self.config.get("lanes_count", 4), speed_limit=30
        )
        self.road = Road(network=straight_network, np_random=self.np_random, record_history=False)

    def _create_vehicles(self) -> None:
        """Create the ego vehicle and the surrounding vehicles from ``self.dataset``.

        Only rows with ``track_to_predict == 1`` are used. Among them, the first
        row with ``interesting_agent == 1`` initialises the controlled vehicle and
        every row with ``interesting_agent == 0`` becomes an ``IDMVehicle``.

        :raises ValueError: if no row describes the controlled vehicle
        """
        # Extract controlled vehicle data from the dataset
        self.agent_id = 1  # ID of the controlled agent (ego agent)
        agent_data = self.dataset[self.dataset["track_to_predict"] == self.agent_id]

        # Initialize the controlled vehicle (ego agent)
        controlled_vehicle_data = agent_data[agent_data["interesting_agent"] == 1]
        if controlled_vehicle_data.empty:
            raise ValueError(
                "dataset has no row with track_to_predict == {} and interesting_agent == 1".format(
                    self.agent_id
                )
            )
        ego_row = controlled_vehicle_data.iloc[0]
        # ControlledVehicle's constructor takes road, position, heading, speed and
        # target/route arguments only; ``enable_lane_change`` and ``timer`` are
        # IDMVehicle parameters and make this call fail with a TypeError.
        self.controlled_vehicle = ControlledVehicle(
            road=self.road,
            position=[ego_row["x"], ego_row["y"]],
            heading=ego_row["psi_rad"],
            speed=np.linalg.norm([ego_row["vx"], ego_row["vy"]]),
        )
        self.road.vehicles.append(self.controlled_vehicle)

        # Create non-controlled vehicles (other agents) from the dataset
        # NOTE(review): every matching row becomes its own vehicle; if the dataset
        # holds one row per frame of a track, this spawns one vehicle per frame —
        # confirm whether only one row per track should be used.
        # NOTE(review): IDMVehicle's constructor has no ``length``/``width``
        # parameters, so the recorded dimensions are not passed; confirm how
        # per-vehicle sizes should be applied if they are needed.
        other_vehicles_data = agent_data.loc[agent_data["interesting_agent"] == 0]
        state_columns = ["x", "y", "vx", "vy", "psi_rad"]
        for x, y, vx, vy, psi_rad in other_vehicles_data[state_columns].itertuples(
            index=False, name=None
        ):
            vehicle = IDMVehicle(
                road=self.road,
                position=[x, y],
                speed=np.linalg.norm([vx, vy]),
                heading=psi_rad,
                enable_lane_change=True,
                timer=np.random.choice([0, 5, 10, 15]),  # Change this as needed
            )
            self.road.vehicles.append(vehicle)

    def _reward(self, action: Action) -> float:
        """
        Turn the individual reward terms into one scalar reward.

        The terms returned by ``_rewards`` are weighted with the config values of
        the same name (0 if a name is not configured) and summed. With
        "normalize_reward" enabled the sum is mapped linearly from
        [collision_reward, high_speed_reward + right_lane_reward] onto [0, 1].
        Finally the value is scaled by the on-road term.

        :param action: the last action performed
        :return: the corresponding reward
        """
        parts = self._rewards(action)
        weighted_sum = sum(self.config.get(label, 0) * amount for label, amount in parts.items())
        if self.config["normalize_reward"]:
            lower = self.config["collision_reward"]
            upper = self.config["high_speed_reward"] + self.config["right_lane_reward"]
            weighted_sum = utils.lmap(weighted_sum, [lower, upper], [0, 1])
        return weighted_sum * parts['on_road_reward']

    def _rewards(self, action: Action) -> Dict[Text, float]:
        """Return the raw reward terms of the controlled vehicle, keyed by name.

        :param action: the last action performed (not used by the computation)
        :return: dict with the collision, right-lane, high-speed and on-road terms
        """
        vehicle = self.controlled_vehicle
        adjacent_lanes = self.road.network.all_side_lanes(vehicle.lane_index)
        target_lane = vehicle.target_lane_index[2]
        # Use forward speed rather than speed, see https://github.com/eleurent/highway-env/issues/268
        longitudinal_speed = vehicle.speed * np.cos(vehicle.heading)
        normalized_speed = utils.lmap(
            longitudinal_speed, self.config["reward_speed_range"], [0, 1]
        )

        result = {}
        result["collision_reward"] = float(vehicle.crashed)
        result["right_lane_reward"] = target_lane / max(len(adjacent_lanes) - 1, 1)
        result["high_speed_reward"] = np.clip(normalized_speed, 0, 1)
        result["on_road_reward"] = float(vehicle.on_road)
        return result


class Net(nn.Module):
    """Three-layer fully connected network with ReLU after the first two layers."""

    def __init__(self, state_dim, action_dim):
        """
        :param state_dim: int, size of the last axis of the input
        :param action_dim: int, size of the last axis of the output
        """
        super().__init__()
        first_width, second_width = 128, 64
        self.fc1 = nn.Linear(state_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, action_dim)

    def forward(self, state):
        """Map ``state`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3."""
        hidden = self.fc1(state)
        hidden = F.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = F.relu(hidden)
        return self.fc3(hidden)


class DOUBLEDQN_CNN(nn.Module):
    """
    Double DQN agent built on two fully connected ``Net`` instances.

    ``estimate_net`` is optimised by gradient descent and picks the next action; ``target_net`` is a
    periodically refreshed copy that scores that action when the TD target is built.

    NOTE: ``train`` below shadows ``nn.Module.train(mode)``. Because ``nn.Module.eval()`` is
    implemented as ``self.train(False)``, calling ``.eval()`` on this object does not work; use
    ``evaluate()`` to run a greedy evaluation episode instead.
    """

    def __init__(
        self,
        env,
        state_dim,
        action_dim,
        lr=0.001,
        gamma=0.99,
        batch_size=5,
        timestamp="",
        buffer_size=10000,
        target_update_interval=100,
    ):
        """
        :param env: object, a gym-style environment (must provide ``_reset``, ``step`` and ``close``)
        :param state_dim: int, size of state space
        :param action_dim: int, size of action space
        :param lr: float, learning rate
        :param gamma: float, discount factor
        :param batch_size: int, batch size for training
        :param timestamp: str, directory in which checkpoints and curves are saved
        :param buffer_size: int, maximum number of transitions kept in the replay buffer
        :param target_update_interval: int, number of learning steps between target-net refreshes
        """
        super().__init__()
        # Local import: the file's import block does not provide deque.
        from collections import deque

        self.env = env
        self.timestamp = timestamp

        self.test_env = copy.deepcopy(env)  # for evaluation purpose
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.batch_size = batch_size
        self.learn_step_counter = 0
        self.target_update_interval = target_update_interval

        # Remember the device so every tensor built later lands next to the networks.
        self.device = device
        self.target_net = Net(state_dim, action_dim).to(device)
        self.estimate_net = Net(state_dim, action_dim).to(device)
        # Start with identical weights so early TD targets are not produced by an unrelated network.
        self.update_target_networks()

        self.optimizer = torch.optim.Adam(self.estimate_net.parameters(), lr=lr)

        # Oldest transitions are discarded automatically once buffer_size is reached.
        self.replay_buffer = deque(maxlen=buffer_size)

    def update_target_networks(self):
        """
        A function to update the target networks
        """
        self.target_net.load_state_dict(self.estimate_net.state_dict())

    def choose_action(self, state, epsilon=0.9):
        """
        Pick an action for ``state``.

        NOTE: ``epsilon`` is the probability of acting GREEDILY (not of exploring): with probability
        ``epsilon`` the arg-max action of ``estimate_net`` is returned, otherwise a uniformly random
        action.

        :param state: array-like observation; it is flattened to a single row of features
        :param epsilon: float in [0, 1], probability of taking the greedy action
        :return: int, index of the chosen action
        """
        # Uniform sample in [0, 1): a normal sample (randn) would not give probability epsilon.
        if np.random.rand() < epsilon:
            state_tensor = torch.as_tensor(
                np.asarray(state), dtype=torch.float32, device=self.device
            ).reshape(1, -1)
            with torch.no_grad():  # action selection never needs gradients
                action_value = self.estimate_net(state_tensor)
            return int(torch.argmax(action_value).item())
        return int(np.random.randint(0, self.action_dim))

    def preprocess_dataset(self, dataset):
        """
        Preprocess the dataset to make it compatible with the highway_env

        Selects the relevant columns, scales ``x``/``y`` by a fixed pixel factor, standardises the
        numeric columns (zero mean, unit sample standard deviation) and one-hot encodes
        ``agent_type``. The input frame is left untouched.

        :param dataset: pandas DataFrame, input dataset
        :return: numpy array, preprocessed dataset
        :raises KeyError: if one of the required columns is missing
        """
        PIXEL_SCALE = 4
        columns = [
            "frame_id",
            "timestamp_ms",
            "x",
            "y",
            "vx",
            "vy",
            "psi_rad",
            "length",
            "width",
            "agent_type",
            "interesting_agent",
        ]
        # Explicit copy: assigning into a column selection triggers SettingWithCopyWarning.
        data = dataset[columns].copy()

        # Convert positions from meters to pixels
        for column in ("x", "y"):
            data[column] = data[column] * PIXEL_SCALE

        # Normalize positions, velocities, yaw angle, length, and width
        for column in ("x", "y", "vx", "vy", "psi_rad", "length", "width"):
            centered = data[column] - data[column].mean()
            spread = data[column].std()
            # A constant (or single-row) column has zero/NaN spread; keep it centred instead of
            # dividing and filling the column with NaN.
            data[column] = centered / spread if spread > 0 else centered

        # Encode agent types using one-hot encoding
        agent_types = pd.get_dummies(data["agent_type"], prefix="agent_type")
        data = pd.concat([data, agent_types], axis=1).drop(columns=["agent_type"])

        return data.to_numpy()

    def _learn(self):
        """Run one Double DQN gradient step on a random minibatch and return the loss as a float."""
        exp_batch = random.sample(self.replay_buffer, self.batch_size)
        batch_len = len(exp_batch)

        # Stack through numpy first (building a tensor from a list of arrays is very slow) and
        # flatten every observation to one row, which is the layout the linear layers expect.
        state_batch = torch.as_tensor(
            np.array([exp["state"] for exp in exp_batch], dtype=np.float32).reshape(batch_len, -1),
            device=self.device,
        )
        state_next_batch = torch.as_tensor(
            np.array([exp["state_next"] for exp in exp_batch], dtype=np.float32).reshape(batch_len, -1),
            device=self.device,
        )
        action_batch = torch.as_tensor(
            [exp["action"] for exp in exp_batch], dtype=torch.int64, device=self.device
        ).unsqueeze(1)
        reward_batch = torch.as_tensor(
            [exp["reward"] for exp in exp_batch], dtype=torch.float32, device=self.device
        ).unsqueeze(1)
        # 1 for transitions that continue, 0 for terminal ones (no bootstrapping past the end).
        not_done_batch = torch.as_tensor(
            [1.0 - float(exp["done"]) for exp in exp_batch], dtype=torch.float32, device=self.device
        ).unsqueeze(1)

        # get estimate Q value: (batch, action_dim) gathered with a (batch, 1) index
        estimate_Q = self.estimate_net(state_batch).gather(1, action_batch)

        # get target Q value: the online net selects the action, the target net evaluates it.
        # No gradients are needed (or wanted) on this path.
        with torch.no_grad():
            max_action_idx = self.estimate_net(state_next_batch).argmax(dim=1, keepdim=True)
            next_Q = self.target_net(state_next_batch).gather(1, max_action_idx)
            target_Q = reward_batch + not_done_batch * self.gamma * next_Q

        # compute mse loss
        loss = F.mse_loss(estimate_Q, target_Q)

        # update network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if self.learn_step_counter % self.target_update_interval == 0:
            self.update_target_networks()
        self.learn_step_counter += 1

        return loss.item()

    def evaluate(self, max_steps=None):
        """
        Run one fully greedy episode on ``test_env``.

        :param max_steps: optional int, stop after this many steps even if the episode is not done
        :return: tuple ``(total_reward, steps)``
        """
        # NOTE(review): mirrors the training loop's use of ``_reset``/``step``; assumes ``_reset``
        # returns the initial observation and ``step`` returns a 4-tuple -- confirm against the env.
        state = self.test_env._reset()
        total_reward = 0.0
        steps = 0
        done = False
        while not done and (max_steps is None or steps < max_steps):
            action = self.choose_action(state, epsilon=1.0)
            state, reward, done, _ = self.test_env.step(action)
            total_reward += reward
            steps += 1
        return total_reward, steps

    def train(self, num_epochs, dataset):
        """
        Train the agent for ``num_epochs`` episodes.

        Every 40 epochs (starting with epoch 0) the average loss of the current epoch and the mean
        evaluation reward since the previous log are recorded, printed and saved together with the
        current weights of ``estimate_net`` under the ``timestamp`` directory.

        :param num_epochs: number of training episodes (converted with ``int``)
        :param dataset: pandas DataFrame, passed through ``preprocess_dataset``
        :return: tuple ``(loss_list, avg_reward_list)`` with one entry per logging period
        """
        # NOTE(review): the training loop is driven by the environment only; the preprocessed array
        # is kept on the instance for inspection but is not consumed below.
        self.preprocessed_dataset = self.preprocess_dataset(dataset)

        loss_list = []
        avg_reward_list = []
        period = 40
        reward_sum = 0.0
        epochs_since_log = 0
        # An empty timestamp would otherwise produce paths starting with "/" (filesystem root).
        out_dir = self.timestamp or "."

        for epoch in range(int(num_epochs)):
            # Start every episode from a fresh environment; stepping on after ``done`` would feed
            # post-terminal transitions into the replay buffer.
            state = self.env._reset()
            done = False
            loss_sum = 0.0
            learn_steps = 0
            while not done:
                action = self.choose_action(state)
                state_next, reward, done, _ = self.env.step(action)
                # Update the replay buffer with the experience
                self.replay_buffer.append(
                    {
                        "state": state,
                        "action": action,
                        "reward": reward,
                        "state_next": state_next,
                        "done": done,
                    }
                )
                state = state_next

                # Learning starts once a full minibatch can be drawn.
                if len(self.replay_buffer) >= self.batch_size:
                    loss_sum += self._learn()
                    learn_steps += 1

            reward, count = self.evaluate()
            reward_sum += reward
            epochs_since_log += 1

            # save
            if epoch % period == 0:
                # log
                avg_loss = loss_sum / learn_steps if learn_steps else float("nan")
                # Divide by the epochs actually accumulated (1 at epoch 0, ``period`` afterwards).
                epoch_reward = reward_sum / epochs_since_log
                avg_reward_list.append(epoch_reward)
                loss_list.append(avg_loss)

                print(
                    "\nepoch: [{}/{}], \tavg loss: {:.4f}, \tavg reward: {:.3f}, \tsteps: {}".format(
                        epoch + 1, num_epochs, avg_loss, epoch_reward, count
                    )
                )

                reward_sum = 0.0
                epochs_since_log = 0

                # create the output directory if needed and save the current progress
                os.makedirs(out_dir, exist_ok=True)
                np.save(os.path.join(out_dir, "double_dqn_cnn_loss.npy"), loss_list)
                np.save(os.path.join(out_dir, "double_dqn_cnn_avg_reward.npy"), avg_reward_list)
                torch.save(
                    self.estimate_net.state_dict(),
                    os.path.join(out_dir, "double_dqn_cnn.pkl"),
                )

        self.env.close()
        return loss_list, avg_reward_list


if __name__ == "__main__":

    def _plot_curve(values, title, x_label, y_label, target_path):
        """Draw one training curve, write it to ``target_path`` and display it."""
        plt.figure(figsize=(10, 6))
        plt.plot(values)
        plt.grid()
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.savefig(target_path, dpi=150)
        plt.show()

    # Load the recorded observations that drive the environment.
    dataset = pd.read_csv("/content/DR_LaneChange_ET0_obs.csv")

    # The run directory is named after the local start time (month-day_hour_minute).
    named_tuple = time.localtime()
    time_string = time.strftime("%m%d_%H_%M", named_tuple)

    # Environment wrapping the dataset; adjust 'lanes_count' as needed.
    highway_env = HighwayEnv(dataset, config={"lanes_count": 4})

    # Agent hyper-parameters; state/action sizes must match the dataset and the problem.
    agent_settings = {
        "env": highway_env,
        "state_dim": 5,
        "action_dim": 4,
        "lr": 0.001,
        "gamma": 0.99,
        "batch_size": 64,
        "timestamp": time_string,
    }
    double_dqn_cnn_agent = DOUBLEDQN_CNN(**agent_settings)

    # Train, then persist the curves and the learned weights into the run directory.
    iterations = 4000
    avg_loss, avg_reward_list = double_dqn_cnn_agent.train(iterations, dataset)
    np.save(f"{time_string}/double_dqn_cnn_loss.npy", avg_loss)
    np.save(f"{time_string}/double_dqn_cnn_avg_reward.npy", avg_reward_list)
    torch.save(
        double_dqn_cnn_agent.estimate_net.state_dict(),
        f"{time_string}/double_dqn_cnn.pkl",
    )

    # Plot the training progress: loss first, then reward.
    _plot_curve(avg_loss, "Double DQN Loss", "Epochs", "Loss", "double_dqn_loss.png")
    _plot_curve(
        avg_reward_list,
        "Double DQN Training Reward",
        "*40 Epochs",
        "Reward",
        f"{time_string}/double_dqn_cnn_train_reward.png",
    )


cpu


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["x"] = dataset["x"] * PIXEL_SCALE
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["y"] = dataset["y"] * PIXEL_SCALE
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["x"] = (dataset["x"] - dataset["x"].mean()) / dataset["x"].std()
A value is trying to be set on a copy of a slice

TypeError: ignored