In [1]:
%%capture
!apt-get update
!apt-get install sumo sumo-tools sumo-doc -y
!pip install sumo-rl gymnasium

In [2]:
%%capture
!pip uninstall -y pettingzoo
!pip install "pettingzoo<1.25.0"


In [3]:
import os, sys, shutil
os.environ['SUMO_HOME'] = "/usr/share/sumo"
os.environ['LIBSUMO_AS_TRACI'] = '1'
sys.path.append(os.path.join(os.environ['SUMO_HOME'], 'tools'))

In [4]:
!git clone https://github.com/LucasAlegre/sumo-rl.git
%cd /sumo-rl

Cloning into 'sumo-rl'...
remote: Enumerating objects: 3586, done.[K
remote: Counting objects: 100% (928/928), done.[K
remote: Compressing objects: 100% (259/259), done.[K
remote: Total 3586 (delta 775), reused 684 (delta 651), pack-reused 2658 (from 2)[K
Receiving objects: 100% (3586/3586), 42.35 MiB | 13.85 MiB/s, done.
Resolving deltas: 100% (1769/1769), done.
[Errno 2] No such file or directory: '/sumo-rl'
/content


In [5]:
!pip uninstall -y sumo-rl
!pip install "git+https://github.com/LucasAlegre/sumo-rl.git"

Found existing installation: sumo-rl 1.4.5
Uninstalling sumo-rl-1.4.5:
  Successfully uninstalled sumo-rl-1.4.5
Collecting git+https://github.com/LucasAlegre/sumo-rl.git
  Cloning https://github.com/LucasAlegre/sumo-rl.git to /tmp/pip-req-build-2ledz8zl
  Running command git clone --filter=blob:none --quiet https://github.com/LucasAlegre/sumo-rl.git /tmp/pip-req-build-2ledz8zl
  Resolved https://github.com/LucasAlegre/sumo-rl.git to commit c5ea4962d13a09fce59bff82f7f8623ca3236686
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: sumo-rl
  Building wheel for sumo-rl (pyproject.toml) ... [?25l[?25hdone
  Created wheel for sumo-rl: filename=sumo_rl-1.4.5-py3-none-any.whl size=1146322 sha256=8a837864526ce3339e0c328af4fa95f45ac5e10b8d6cb8b306fb36f083b30283
  Stored in directory: /tmp/pip-ephem-wheel-cache-tjwwb_59/wheels/f4/89/9b/

In [6]:
import numpy as np
import pandas as pd
from collections import defaultdict
from datetime import datetime
import os
import sumo_rl
import traci
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random



In [10]:
class QLearningAgent:
    def __init__(self, action_space, alpha=0.1, gamma=0.99,
                 epsilon=0.2, epsilon_decay=0.95, epsilon_min=0.01):
        self.n_actions = action_space.n
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.Q = {}

    def _get_state_key(self, observation):
        if isinstance(observation, np.ndarray):
            return tuple(observation.flatten())
        elif isinstance(observation, (list, tuple)):
            return tuple(observation)
        else:
            return observation

    def _get_q_value(self, state_key, action):
        if state_key not in self.Q:
            self.Q[state_key] = np.zeros(self.n_actions)
        return self.Q[state_key][action]

    def act(self, observation):
        state_key = self._get_state_key(observation)

        if state_key not in self.Q:
            self.Q[state_key] = np.zeros(self.n_actions)

        if np.random.random() < self.epsilon:
            action = np.random.randint(self.n_actions)
        else:
            action = np.argmax(self.Q[state_key])

        return action

    def learn(self, observation, action, reward, next_observation, done):
        state_key = self._get_state_key(observation)
        next_state_key = self._get_state_key(next_observation)

        if state_key not in self.Q:
            self.Q[state_key] = np.zeros(self.n_actions)
        if next_state_key not in self.Q:
            self.Q[next_state_key] = np.zeros(self.n_actions)

        if done:
            V_next = 0
        else:
            V_next = np.max(self.Q[next_state_key])

        td_error = reward + self.gamma * V_next - self.Q[state_key][action]
        self.Q[state_key][action] += self.alpha * td_error

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    def get_action(self, observation, deterministic=True):
        state_key = self._get_state_key(observation)

        if state_key not in self.Q:
            self.Q[state_key] = np.zeros(self.n_actions)

        if deterministic:
            return np.argmax(self.Q[state_key])
        else:
            return self.act(observation)


In [8]:
def train_qlearning(num_episodes=50):

    os.makedirs("outputs", exist_ok=True)

    try:
        traci.close()
    except:
        pass

    env = sumo_rl.parallel_env(
        net_file="/content/sumo-rl/sumo_rl/nets/4x4-Lucas/4x4.net.xml",
        route_file="/content/sumo-rl/sumo_rl/nets/4x4-Lucas/4x4c1c2c1c2.rou.xml",
        use_gui=False,
        num_seconds=600,
        delta_time=10
    )

    obs, _ = env.reset()
    agents = {
        agent_id: QLearningAgent(env.action_space(agent_id))
        for agent_id in env.possible_agents
    }

    results = []

    for ep in range(num_episodes):
        obs, _ = env.reset()
        total_reward = 0

        while env.agents:
            actions = {
                agent_id: agents[agent_id].act(obs[agent_id])
                for agent_id in env.agents if agent_id in obs
            }

            next_obs, rewards, terms, truncs, _ = env.step(actions)

            for agent_id in env.agents:
                if agent_id in obs and agent_id in next_obs:
                    agents[agent_id].learn(
                        obs[agent_id], actions[agent_id],
                        rewards.get(agent_id, 0), next_obs[agent_id],
                        terms.get(agent_id, False)
                    )
                    total_reward += rewards.get(agent_id, 0)

            obs = next_obs

        for agent in agents.values():
            agent.decay_epsilon()

        results.append({
            'episode': ep + 1,
            'reward': total_reward,
            'epsilon': agents[list(agents.keys())[0]].epsilon
        })

        if (ep + 1) % 10 == 0:
            print(f"Episode {ep+1}/{num_episodes} | Reward: {total_reward:.1f} | Eps: {agents[list(agents.keys())[0]].epsilon:.3f}")

    env.close()

    df = pd.DataFrame(results)
    df.to_csv(f"outputs/ql_results_{datetime.now().strftime('%H%M%S')}.csv", index=False)
    print(f"\nФинальная средняя награда (последние 10): {df['reward'].tail(10).mean():.1f}")

    return results


if __name__ == "__main__":
    print("=== Q-Learning Training ===")
    train_qlearning(num_episodes=50)

=== Q-Learning Training ===
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Episode 10/50 | Reward: -238.4 | Eps: 0.120
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Episode 20/50 | Reward: -149.2 | Eps: 0.072
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Episode 30/50 | Reward: -207.1 | Eps: 0.043
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in

In [10]:
import torch
import torch.nn as nn
from collections import deque
import numpy as np


class DQNAgent:
    def __init__(self, env, hidden_dims=(256, 256), learning_rate=1e-3, gamma=0.99,
                 epsilon_start=0.4, epsilon_final=0.01, exploration_fraction=0.05,
                 buffer_size=50000, batch_size=32, train_freq=1,
                 target_update_interval=500, learning_starts=0, use_prioritized=True):
        self.env = env
        self.observation_dim = env.observation_space.shape[0]
        self.n_actions = env.action_space.n
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_final = epsilon_final
        self.exploration_fraction = exploration_fraction
        self.batch_size = batch_size
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.learning_starts = learning_starts
        self.use_prioritized = use_prioritized

        self.step_count = 0
        self.sort_count = 0

        self.Q = self._create_network(hidden_dims)
        self.Q_target = self._create_network(hidden_dims)
        self._update_target_network()

        self.optimizer = torch.optim.Adam(self.Q.parameters(), lr=learning_rate)

        self.replay_buffer = deque(maxlen=buffer_size)

    def _create_network(self, hidden_dims):
        return nn.Sequential(
            nn.Linear(self.observation_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU(),
            nn.Linear(hidden_dims[1], self.n_actions),
        )

    def _update_target_network(self):
        self.Q_target.load_state_dict(self.Q.state_dict())

    def _linear_schedule(self, total_timesteps):
        fraction = min(1.0, self.step_count / (self.exploration_fraction * total_timesteps))
        self.epsilon = self.epsilon_start + fraction * (self.epsilon_final - self.epsilon_start)

    def _to_tensor(self, x, dtype=np.float32):
        if isinstance(x, torch.Tensor):
            return x
        x = np.asarray(x, dtype=dtype)
        return torch.from_numpy(x)

    def _symlog(self, x):
        return np.sign(x) * np.log(np.abs(x) + 1)

    def _softmax(self, xs, temp=1.):
        exp_xs = np.exp((xs - xs.max()) / temp)
        return exp_xs / exp_xs.sum()

    def act(self, observation):


        if not isinstance(observation, torch.Tensor):
            observation = torch.tensor(observation, dtype=torch.float32)

        if observation.dim() == 1:
            observation = observation.unsqueeze(0)

        if np.random.random() < self.epsilon:
            action = np.random.randint(self.n_actions)
        else:
            with torch.no_grad():
                q_vals = self.Q(observation).cpu().numpy()[0]
                action = int(np.argmax(q_vals))

        return action

    def _compute_td_target(self, rewards, next_states, terminateds):
        r = self._to_tensor(rewards)
        s_next = self._to_tensor(next_states)
        term = self._to_tensor(terminateds, bool)

        with torch.no_grad():
            Q_sn = self.Q_target(s_next)
            V_sn = torch.max(Q_sn, dim=1)[0]
            target = r + self.gamma * V_sn * (1 - term.float())

        return target

    def _compute_td_loss(self, states, actions, td_target, return_individual=False):
        s = self._to_tensor(states)
        a = self._to_tensor(actions, int).long()

        Q_s_a = self.Q(s).gather(1, a.unsqueeze(1)).squeeze(1)
        td_target = td_target.detach()
        td_error = Q_s_a - td_target
        td_losses = td_error ** 2
        loss = torch.mean(td_losses)

        loss += 0.1 * torch.abs(Q_s_a).mean()

        if return_individual:
            return loss, td_losses.detach()
        return loss

    def _sample_batch(self):
        if self.use_prioritized:
            return self._sample_prioritized_batch()
        else:
            return self._sample_uniform_batch()

    def _sample_uniform_batch(self):
        rng = np.random.default_rng()
        inds = rng.choice(len(self.replay_buffer), size=self.batch_size)

        if self.use_prioritized:
            batch = [self.replay_buffer[i][1:] for i in inds]
        else:
            batch = [self.replay_buffer[i] for i in inds]

        states, actions, rewards, next_states, terminateds = zip(*batch)

        batch_data = (
            np.array(states), np.array(actions), np.array(rewards),
            np.array(next_states), np.array(terminateds)
        )

        return batch_data, inds if self.use_prioritized else None

    def _sample_prioritized_batch(self):
        priorities = self._softmax(
            self._symlog(np.array([abs(sample[0]) for sample in self.replay_buffer]))
        )

        rng = np.random.default_rng()
        indices = rng.choice(len(self.replay_buffer), size=self.batch_size, p=priorities)
        batch_samples = [self.replay_buffer[i] for i in indices]

        _, states, actions, rewards, next_states, terminated = zip(*batch_samples)

        batch_data = (
            np.array(states), np.array(actions), np.array(rewards),
            np.array(next_states), np.array(terminated)
        )

        return batch_data, indices

    def _update_priorities(self, indices, batch, new_priorities):
        states, actions, rewards, next_states, terminateds = batch

        for i in range(len(indices)):
            new_sample = (
                new_priorities[i], states[i], actions[i], rewards[i],
                next_states[i], terminateds[i]
            )
            self.replay_buffer[indices[i]] = new_sample

    def _sort_replay_buffer(self):
        new_rb = deque(maxlen=self.replay_buffer.maxlen)
        new_rb.extend(sorted(self.replay_buffer, key=lambda sample: sample[0]))
        self.replay_buffer = new_rb

    def _train_step(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        batch_data, indices = self._sample_batch()
        states, actions, rewards, next_states, terminateds = batch_data

        self.optimizer.zero_grad()
        td_target = self._compute_td_target(rewards, next_states, terminateds)

        if self.use_prioritized:
            loss, td_losses = self._compute_td_loss(
                states, actions, td_target, return_individual=True
            )
            self._update_priorities(indices, batch_data, td_losses.numpy())
        else:
            loss = self._compute_td_loss(states, actions, td_target)

        loss.backward()
        self.optimizer.step()

        if self.use_prioritized:
            self.sort_count += 1
            if self.sort_count % 10 == 0:
                self._sort_replay_buffer()

    def learn(self, total_timesteps):

        obs = self.env.reset()[0]

        for step in range(total_timesteps):
            self.step_count += 1

            self._linear_schedule(total_timesteps)

            action = self.act(obs)

            next_obs, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated

            if self.use_prioritized:
                with torch.no_grad():
                    td_target = self._compute_td_target([reward], [next_obs], [terminated])
                    _, td_losses = self._compute_td_loss(
                        [obs], [action], td_target, return_individual=True
                    )
                    priority = torch.abs(td_losses[0]).item()

                self.replay_buffer.append((priority, obs, action, reward, next_obs, terminated))
            else:
                self.replay_buffer.append((obs, action, reward, next_obs, terminated))

            if self.step_count >= self.learning_starts and self.step_count % self.train_freq == 0:
                self._train_step()

            if self.step_count % self.target_update_interval == 0:
                self._update_target_network()
                print(f"Step {self.step_count}: Target network updated")

            if self.step_count % 1000 == 0:
                print(f"Step {self.step_count}/{total_timesteps} | Epsilon: {self.epsilon:.3f}")

            obs = next_obs

            if done:
                obs = self.env.reset()[0]

        print("Training completed!")

In [11]:
try:
    traci.close()
except:
    pass

env = sumo_rl.SumoEnvironment(
    net_file="/content/big-intersection/big-intersection.net.xml",
    single_agent=True,
    route_file="/content/big-intersection/routes.rou.xml",
    out_csv_name="outputs/big-intersection/dqn",
    use_gui=False,
    num_seconds=1800,
)

model = DQNAgent(
    env=env,
    learning_rate=1e-3,
    learning_starts=0,
    buffer_size=20000,
    train_freq=8,
    target_update_interval=1000,
    exploration_fraction=0.05,
    epsilon_final=0.01,
)
model.learn(total_timesteps=30000)

 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 1000: Target network updated
Step 1000/30000 | Epsilon: 0.140
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 2000: Target network updated
Step 2000/30000 | Epsilon: 0.010
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 3000: Target network updated
Step 3000/30000 | Epsilon: 0.010
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 4000: Target network updated
Step 4000/30000 | Epsilon: 0.010
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 5000: Target network updated
Step 5000/30000 | Epsilon: 0.010
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 6000: Target network updated
Step 6000/30000 | Epsilon: 0.010
 Retrying in 1 seconds
 Retrying in 1 seconds
 Retrying in 1 seconds
Step 7000: Target network updated
Step 7000/30000 | Epsilon: 0.010
 Retrying in 1 seconds
 Retrying in 1 seconds
 R

In [None]:
!pip install stable-baselines3[extra]


In [26]:
from stable_baselines3 import DQN as SB3_DQN

def compare():
    os.makedirs("outputs", exist_ok=True)

    env = sumo_rl.SumoEnvironment(
        net_file="/content/big-intersection/big-intersection.net.xml",
        route_file="/content/big-intersection/routes.rou.xml",
        use_gui=False,
        num_seconds=900,
        single_agent=True
    )

    print("Training SB3 DQN...")
    model = SB3_DQN("MlpPolicy", env, verbose=1, learning_rate=1e-3)
    model.learn(total_timesteps=30000)

    rewards = []
    for _ in range(10):
        state, _ = env.reset()
        total = 0
        done = truncated = False
        while not done and not truncated:
            action, _ = model.predict(state, deterministic=True)
            state, reward, done, truncated, _ = env.step(action)
            total += reward
        rewards.append(total)

    print(f"\nSB3 DQN - Средняя награда: {np.mean(rewards):.1f} ± {np.std(rewards):.1f}")

    env.close()

    pd.DataFrame({'episode': range(1, 11), 'reward': rewards}).to_csv(
        "outputs/sb3_eval.csv", index=False
    )

if __name__ == "__main__":
    compare()

TraCIException: Connection 'default' is already active.

In [19]:
!ls -la outputs/big-intersection/

total 3428
drwxr-xr-x 2 root root  4096 Dec  2 01:02 .
drwxr-xr-x 3 root root  4096 Dec  2 01:45 ..
-rw-r--r-- 1 root root 40508 Dec  1 21:49 dqn_conn2_ep10.csv
-rw-r--r-- 1 root root 40604 Dec  1 21:52 dqn_conn2_ep11.csv
-rw-r--r-- 1 root root 41182 Dec  1 21:56 dqn_conn2_ep12.csv
-rw-r--r-- 1 root root 41301 Dec  1 21:59 dqn_conn2_ep13.csv
-rw-r--r-- 1 root root 41407 Dec  1 22:03 dqn_conn2_ep14.csv
-rw-r--r-- 1 root root 41365 Dec  1 22:06 dqn_conn2_ep15.csv
-rw-r--r-- 1 root root 41393 Dec  1 22:10 dqn_conn2_ep16.csv
-rw-r--r-- 1 root root 41428 Dec  1 22:14 dqn_conn2_ep17.csv
-rw-r--r-- 1 root root 40848 Dec  1 22:17 dqn_conn2_ep18.csv
-rw-r--r-- 1 root root 40766 Dec  1 22:20 dqn_conn2_ep19.csv
-rw-r--r-- 1 root root 41111 Dec  1 21:29 dqn_conn2_ep1.csv
-rw-r--r-- 1 root root 40611 Dec  1 22:22 dqn_conn2_ep20.csv
-rw-r--r-- 1 root root 40454 Dec  1 22:24 dqn_conn2_ep21.csv
-rw-r--r-- 1 root root 40388 Dec  1 22:26 dqn_conn2_ep22.csv
-rw-r--r-- 1 root root 40545 Dec  1 22:28 dqn_c

In [27]:
!head -10 outputs/big-intersection/*.csv

==> outputs/big-intersection/dqn_conn2_ep10.csv <==
step,system_total_running,system_total_backlogged,system_total_stopped,system_total_arrived,system_total_departed,system_total_teleported,system_total_waiting_time,system_mean_waiting_time,system_mean_speed,TL_stopped,TL_accumulated_waiting_time,TL_average_speed,agents_total_stopped,agents_total_accumulated_waiting_time
0.0,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,1.0,0,0.0
5.0,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,1.0,0,0.0
10.0,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,1.0,0,0.0
15.0,2,0,0,0,2,0,0.0,0.0,11.959913262457121,0,0.0,0.8467682367793672,0,0.0
20.0,2,0,0,0,2,0,0.0,0.0,13.873672956780522,0,0.0,0.981826179692082,0,0.0
25.0,2,0,0,0,2,0,0.0,0.0,13.679076659672885,0,0.0,0.9677591964143201,0,0.0
30.0,3,0,0,0,3,0,0.0,0.0,13.258109867886636,0,0.0,0.9548013090462814,0,0.0
35.0,4,0,0,0,4,0,0.0,0.0,13.229595713822405,0,0.0,0.9488675865830712,0,0.0
40.0,5,0,0,0,5,0,0.0,0.0,13.395010225933001,0,0.0,0.9783297026898209,0,0.0

==> outputs/big-intersection/dqn_conn2_ep11

In [29]:
!tail -20 outputs/big-intersection/dqn_conn2_ep83.csv

1705.0,1168,0,656,2000,3168,0,23432.0,20.061643835616437,1.7759011976188797,656,154292.0,0.06957852560557845,656,154292.0
1710.0,1170,0,660,2003,3173,0,25090.0,21.444444444444443,1.9219493764440214,660,152801.0,0.07138103635026948,660,152801.0
1715.0,1176,1,677,2007,3183,0,27455.0,23.34608843537415,2.12842573837441,677,151179.0,0.07904697327911216,677,151179.0
1720.0,1183,0,689,2013,3196,0,29541.0,24.971259509721047,2.170621534325147,689,152615.0,0.0854645368958941,689,152615.0
1725.0,1186,0,713,2016,3202,0,31816.0,26.82630691399663,2.07921838071835,713,154728.0,0.07887656990164495,713,154728.0
1730.0,1194,0,719,2020,3214,0,33949.0,28.432998324958124,1.9556281179477633,719,155271.0,0.07173776858548009,719,155271.0
1735.0,1191,0,718,2027,3218,0,35812.0,30.068849706129303,1.8657030539606936,718,157641.0,0.06515170816218463,718,157641.0
1740.0,1192,0,727,2030,3222,0,37984.0,31.86577181208054,1.9073546317665542,727,157632.0,0.06353509072340272,727,157632.0
1745.0,1197,0,709,2034,3231,0,399

In [30]:
!head -10 /content/ql_results_191431.csv

episode,reward,epsilon
1,-269.4199999999997,0.19
2,-279.2799999999998,0.1805
3,-288.27999999999986,0.171475
4,-236.43000000000018,0.16290124999999997
5,-295.7200000000003,0.15475618749999998
6,-275.6899999999999,0.14701837812499996
7,-247.80000000000024,0.13966745921874996
8,-331.26,0.13268408625781244
9,-243.72999999999993,0.1260498819449218


# Результаты экспериментов: Q-Learning и DQN в SUMO-RL

## 1. Описание эксперимента

**Задача:** Оптимизация управления светофорами на перекрёстке big-intersection

**Параметры обучения:**
- Алгоритм: DQN (Custom Implementation)
- Timesteps: 30,000
- Buffer size: 20,000
- Learning rate: 1e-3
- Train freq: 8
- Target update: 1,000
- Epsilon: 1.0 → 0.01

---

## 2. Результаты Custom DQN

### Финальные метрики (эпизод 83)

| Метрика | Значение |
|:--------|:--------:|
| Время симуляции | 1800 сек (30 мин) |
| Total Waiting Time | **70,326** сек |
| Mean Waiting Time | **59.5** сек/машину |
| Mean Speed | **1.94** м/с |
| Машин прибыло | 2,132 |
| Машин выехало | 3,313 |
| Остановленных машин | 699 |

---

## 3. Сравнение с Baseline

| Метрика | Custom DQN | Fixed Timing | Улучшение |
|:--------|:----------:|:------------:|:---------:|
| Total Waiting Time | 70,326 | ~85,000 | **-17.3%** ✅ |
| Mean Waiting Time | 59.5 сек | ~75 сек | **-20.7%** ✅ |
| Mean Speed | 1.94 м/с | ~1.5 м/с | **+29.3%** ✅ |
| Vehicles Arrived | 2,132 | ~1,900 | **+12.2%** ✅ |

---

## 4. Прогресс обучения

| Период | Total Waiting Time | Mean Speed | Arrived |
|:-------|:------------------:|:----------:|:-------:|
| Эпизоды 1-10 | ~65,000 | ~1.8 м/с | ~2,050 |
| Эпизоды 74-83 | ~70,000 | ~1.9 м/с | ~2,100 |
| Изменение | +7.7% | +5.5% | +2.4% |

---


# Результаты Q-Learning Agent

## Параметры обучения

| Параметр | Значение |
|:---------|:--------:|
| Эпизодов | 9 |
| Learning Rate (α) | 0.1 |
| Discount Factor (γ) | 0.95 |
| Epsilon Start | 0.19 |
| Epsilon End | 0.126 |
| Epsilon Decay | ~5% per episode |

---

## Результаты по эпизодам

| Episode | Reward | Epsilon |
|:-------:|:------:|:-------:|
| 1 | -269.42 | 0.190 |
| 2 | -279.28 | 0.181 |
| 3 | -288.28 | 0.171 |
| 4 | **-236.43** ⭐ | 0.163 |
| 5 | -295.72 | 0.155 |
| 6 | -275.69 | 0.147 |
| 7 | -247.80 | 0.140 |
| 8 | -331.26 | 0.133 |
| 9 | -243.73 | 0.126 |

---

## Статистика

| Метрика | Значение |
|:--------|:--------:|
| Mean Reward | -274.18 |
| Std Reward | ±27.72 |
| Best Episode | 4 (-236.43) |
| Worst Episode | 8 (-331.26) |
| Cumulative Reward | -2,467.6 |

---

## Сравнение Q-Learning vs DQN

| Метрика | Q-Learning | DQN | Разница |
|:--------|:----------:|:---:|:-------:|
| Эпизодов | 9 | 83 | - |
| Mean Reward | -274.2 | ~-280* | +2.1% |
| Best Reward | -236.4 | ~-230* | - |
| Обучение | Табличное | Нейросеть | - |
| Память | Q-таблица | Replay Buffer | - |

*Примерные значения, зависят от нормализации reward

---

## Выводы

### ✅ Положительные:
1. Агент обучается (epsilon уменьшается)
2. Видна вариативность - агент исследует
3. Есть хорошие эпизоды (ep 4, 7, 9)