In [1]:
pip install gym numpy stable-baselines3

Note: you may need to restart the kernel to use updated packages.


In [1]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import time
import os

# Custom 2D racing environment
class RacingEnv(gym.Env):
    def __init__(self):
        super(RacingEnv, self).__init__()
        self.width = 10
        self.height = 20
        self.car_pos = [self.width // 2, 0]
        self.obstacle_pos = [np.random.randint(0, self.width), self.height - 1]
        self.action_space = gym.spaces.Discrete(3)  # Left, Stay, Right
        self.observation_space = gym.spaces.Box(low=0, high=max(self.width, self.height),
                                                shape=(4,), dtype=np.float32)

    def reset(self, seed=None):
        super().reset(seed=seed)
        self.car_pos = [self.width // 2, 0]
        self.obstacle_pos = [np.random.randint(0, self.width), self.height - 1]
        return self._get_obs(), {}

    def step(self, action):
        # Move car
        if action == 0 and self.car_pos[0] > 0:
            self.car_pos[0] -= 1
        elif action == 2 and self.car_pos[0] < self.width - 1:
            self.car_pos[0] += 1

        # Move obstacle
        self.obstacle_pos[1] -= 1

        # Check collision
        if self.car_pos == self.obstacle_pos:
            reward = -10
            terminated = True
        elif self.obstacle_pos[1] < 0:
            reward = 1
            self.obstacle_pos = [np.random.randint(0, self.width), self.height - 1]
            terminated = False
        else:
            reward = 0
            terminated = False

        return self._get_obs(), reward, terminated, False, {}

    def _get_obs(self):
        return np.array([self.car_pos[0], self.car_pos[1],
                         self.obstacle_pos[0], self.obstacle_pos[1]], dtype=np.float32)

    def render(self):
        os.system('cls' if os.name == 'nt' else 'clear')
        for y in range(self.height - 1, -1, -1):
            for x in range(self.width):
                if [x, y] == self.car_pos:
                    print('C', end='')
                elif [x, y] == self.obstacle_pos:
                    print('O', end='')
                else:
                    print('.', end='')
            print()
        print("\n")

# Wrapper to make the environment compatible with stable-baselines3
class VisualRacingEnv(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.env = env

    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.env.render()
        return obs, info

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.env.render()
        time.sleep(0.1)  # Add a small delay to make the visualization visible
        return obs, reward, terminated, truncated, info

# Create and wrap the environment
env = DummyVecEnv([lambda: VisualRacingEnv(RacingEnv())])

# Initialize the PPO model
model = PPO("MlpPolicy", env, verbose=1)

# Train the model
model.learn(total_timesteps=100000)

# Evaluate the trained model
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Save the trained model
model.save("ppo_racing_model")

# Test the model
obs, _ = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, _ = env.reset()

Using cuda device


  from .autonotebook import tqdm as notebook_tqdm


....O.....
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
.....C....


..........
....O.....
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
....C.....


..........
..........
....O.....
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
....C.....


..........
..........
..........
....O.....
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
...C......


..........
..........
..........
..........
....O.....
..........
..........
..........
..........
..........
..

In [3]:
pip install 'shimmy>=0.2.1

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: "'shimmy"


In [1]:
import numpy as np
import torch
from tmrl import get_environment
def analyze_observation(obs):
    print("Analyzing observation structure:")
    
    if isinstance(obs, (list, tuple)):
        print(f"Observation is a {type(obs).__name__} with {len(obs)} elements.")
        for i, item in enumerate(obs):
            print(f"\nElement {i}:")
            analyze_item(item)
    elif isinstance(obs, dict):
        print(f"Observation is a dictionary with {len(obs)} keys.")
        for key, value in obs.items():
            print(f"\nKey: {key}")
            analyze_item(value)
    else:
        analyze_item(obs)

def analyze_item(item):
    if isinstance(item, np.ndarray):
        print(f"Type: NumPy array, Shape: {item.shape}, Data type: {item.dtype}")
    elif isinstance(item, torch.Tensor):
        print(f"Type: PyTorch Tensor, Shape: {item.shape}, Data type: {item.dtype}, Device: {item.device}")
    elif isinstance(item, (int, float, bool)):
        print(f"Type: {type(item).__name__}, Value: {item}")
    elif isinstance(item, str):
        print(f"Type: string, Length: {len(item)}")
    elif isinstance(item, (list, tuple)):
        print(f"Type: {type(item).__name__}, Length: {len(item)}")
        if len(item) > 0:
            print("First element:")
            analyze_item(item[0])
    elif isinstance(item, dict):
        print(f"Type: dictionary, Number of keys: {len(item)}")
        if len(item) > 0:
            first_key = next(iter(item))
            print(f"Example - Key: {first_key}")
            analyze_item(item[first_key])
    else:
        print(f"Type: {type(item).__name__}")
        try:
            print(f"Shape/Length: {len(item)}")
        except:
            print("Shape/Length: Not applicable")

# Example usage
def example_usage(env):
    obs, _ = env.reset()
    analyze_observation(obs)

# Uncomment and use this if you want to test with a specific environment
# from your_environment_module import get_environment
env = get_environment()
example_usage(env)

Analyzing observation structure:
Observation is a tuple with 6 elements.

Element 0:
Type: NumPy array, Shape: (1,), Data type: float32

Element 1:
Type: NumPy array, Shape: (1,), Data type: float32

Element 2:
Type: NumPy array, Shape: (1,), Data type: float32

Element 3:
Type: NumPy array, Shape: (4, 64, 64), Data type: uint8

Element 4:
Type: NumPy array, Shape: (3,), Data type: float32

Element 5:
Type: NumPy array, Shape: (3,), Data type: float32


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
from tmrl import get_environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        conv_out_size = self._get_conv_out(h, w)
        self.fc1 = nn.Linear(conv_out_size + 9, 512)  # 9 additional inputs (1 each from elements 0, 1, 2, and 3 each from elements 4 and 5)
        self.fc2 = nn.Linear(512, outputs)

    def _get_conv_out(self, h, w):
        o = self.conv1(torch.zeros(1, 4, h, w))
        o = self.conv2(o)
        o = self.conv3(o)
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.cat((x, additional_inputs), dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

class DQNAgent:
    def __init__(self, n_actions, memory_size=10000, batch_size=32, gamma=0.99, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.995):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay

        self.policy_net = DQN(64, 64, n_actions).to(device)
        self.target_net = DQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters())

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        q_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))
        next_q_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (1 - done_batch) * self.gamma * next_q_values

        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

def preprocess_observation(obs):
    # Ensure all elements are numpy arrays
    speed = np.array(obs[0]).flatten()
    steering = np.array(obs[1]).flatten()
    gyro = np.array(obs[2]).flatten()
    
    # Normalize image
    image = np.array(obs[3]).astype(np.float32) / 255.0
    
    # Ensure previous actions and current actions are numpy arrays
    prev_action = np.array(obs[4]).flatten()
    action = np.array(obs[5]).flatten()
    
    # Normalize to [-1, 1] range
    speed = np.clip(speed / 300.0, -1, 1)  # Assuming max speed is 300
    steering = np.clip(steering / np.pi, -1, 1)  # Assuming steering is in radians
    gyro = np.clip(gyro / np.pi, -1, 1)  # Assuming gyro is in radians
    prev_action = np.clip(prev_action, -1, 1)
    action = np.clip(action, -1, 1)
    
    return (speed, steering, gyro, image, prev_action, action)

def env_action_to_agent_action(env_action, n_actions):
    # Convert environment action to agent action index
    if isinstance(env_action, (int, np.integer)):
        return env_action
    elif isinstance(env_action, (float, np.float32, np.float64)):
        # Discretize the continuous action
        return int((env_action + 1) * (n_actions - 1) / 2)
    elif isinstance(env_action, np.ndarray):
        return np.argmax(env_action)
    else:
        raise ValueError(f"Unexpected action type: {type(env_action)}")

def agent_action_to_env_action(agent_action, n_actions):
    # Convert agent action index to environment action
    steering = (agent_action * 2 / (n_actions - 1)) - 1
    throttle = 1.0  # Full throttle
    brake = 0.0     # No brake
    return [ throttle, brake,steering]  # Return a list with three values

def train(env, agent, num_episodes, max_steps_per_episode):
    for episode in range(num_episodes):
        obs, _ = env.reset()
        state = preprocess_observation(obs)
        total_reward = 0

        for step in range(max_steps_per_episode):
            agent_action = agent.select_action(state)
            env_action = agent_action_to_env_action(agent_action.item(), agent.n_actions)
            next_obs, reward, terminated, truncated, _ = env.step(env_action)  # Pass env_action as a list
            next_state = preprocess_observation(next_obs)
            done = terminated or truncated
            
            agent.remember(state, agent_action.item(), reward, next_state, done)
            agent.replay()
            
            state = next_state
            total_reward += reward

            if done:
                break

        if episode % 10 == 0:
            agent.update_target_network()

        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")# Usage example:
# from your_environment import get_environment
env = get_environment()
n_actions = 3  # Number of possible actions (adjust as needed)
agent = DQNAgent(n_actions)
train(env, agent, num_episodes=1000, max_steps_per_episode=1000)

  from .autonotebook import tqdm as notebook_tqdm


Episode 0, Total Reward: 3.4399999901652336, Epsilon: 0.78
Episode 1, Total Reward: 11.859999973326921, Epsilon: 0.36
Episode 2, Total Reward: 138.37999992072582, Epsilon: 0.04
Episode 3, Total Reward: 1.88999998383224, Epsilon: 0.03
Episode 4, Total Reward: 10.640000021085143, Epsilon: 0.01
Episode 5, Total Reward: 1.4599999822676182, Epsilon: 0.01
Episode 6, Total Reward: 10.85999995842576, Epsilon: 0.01
Episode 7, Total Reward: 2.219999987632036, Epsilon: 0.01
Episode 8, Total Reward: 1.3199999928474426, Epsilon: 0.01
Episode 9, Total Reward: 12.089999966323376, Epsilon: 0.01
Episode 10, Total Reward: 1.059999991208315, Epsilon: 0.01
Episode 11, Total Reward: 1.149999987334013, Epsilon: 0.01
Episode 12, Total Reward: 1.1799999866634607, Epsilon: 0.01
Episode 13, Total Reward: 1.6499999817460775, Epsilon: 0.01
Episode 14, Total Reward: 1.0899999868124723, Epsilon: 0.01
Episode 15, Total Reward: 1.0499999951571226, Epsilon: 0.01
Episode 16, Total Reward: 1.0499999951571226, Epsilon: 0



Episode 81, Total Reward: 138.37999986857176, Epsilon: 0.01
Episode 82, Total Reward: 3.9499999713152647, Epsilon: 0.01
Episode 83, Total Reward: 1.1699999943375587, Epsilon: 0.01
Episode 84, Total Reward: 28.000000031664968, Epsilon: 0.01
Episode 85, Total Reward: 1.109999991953373, Epsilon: 0.01
Episode 86, Total Reward: 1.239999983459711, Epsilon: 0.01
Episode 87, Total Reward: 2.179999992251396, Epsilon: 0.01
Episode 88, Total Reward: 11.330000007525086, Epsilon: 0.01
Episode 89, Total Reward: 1.55999999307096, Epsilon: 0.01
Episode 90, Total Reward: 3.859999969601631, Epsilon: 0.01
Episode 91, Total Reward: 5.549999983981252, Epsilon: 0.01
Episode 92, Total Reward: 1.1199999935925007, Epsilon: 0.01
Episode 93, Total Reward: 4.0999999940395355, Epsilon: 0.01
Episode 94, Total Reward: 2.709999991580844, Epsilon: 0.01
Episode 95, Total Reward: 12.259999975562096, Epsilon: 0.01
Episode 96, Total Reward: 1.2099999897181988, Epsilon: 0.01
Episode 97, Total Reward: 138.3800022304058, Eps



Episode 146, Total Reward: 31.36999993212521, Epsilon: 0.01
Episode 147, Total Reward: 1.1399999875575304, Epsilon: 0.01
Episode 148, Total Reward: 4.319999974220991, Epsilon: 0.01
Episode 149, Total Reward: 28.03999998793006, Epsilon: 0.01
Episode 150, Total Reward: 4.519999975338578, Epsilon: 0.01
Episode 151, Total Reward: 1.2399999797344208, Epsilon: 0.01
Episode 152, Total Reward: 3.5499999802559614, Epsilon: 0.01
Episode 153, Total Reward: 19.169999981299043, Epsilon: 0.01
Episode 154, Total Reward: 15.419999964535236, Epsilon: 0.01
Episode 155, Total Reward: 3.299999974668026, Epsilon: 0.01
Episode 156, Total Reward: 1.9399999808520079, Epsilon: 0.01
Episode 157, Total Reward: 1.1599999852478504, Epsilon: 0.01
Episode 158, Total Reward: 1.1699999943375587, Epsilon: 0.01
Episode 159, Total Reward: 1.2199999932199717, Epsilon: 0.01
Episode 160, Total Reward: 10.690000031143427, Epsilon: 0.01


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
from tmrl import get_environment
import os
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.ln1 = nn.GroupNorm(32, channels)  # Using GroupNorm instead of LayerNorm
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.ln2 = nn.GroupNorm(32, channels)  # Using GroupNorm instead of LayerNorm

    def forward(self, x):
        residual = x
        out = F.relu(self.ln1(self.conv1(x)))
        out = self.ln2(self.conv2(out))
        out += residual
        return F.relu(out)

class AttentionModule(nn.Module):
    def __init__(self, in_channels):
        super(AttentionModule, self).__init__()
        self.conv = nn.Conv2d(in_channels, 1, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)  # Use dim=-1 for the final dimension

    def forward(self, x):
        batch_size, C, H, W = x.size()
        proj = self.conv(x).view(batch_size, 1, -1)
        weights = self.softmax(proj).view(batch_size, 1, H, W)
        return x * weights.expand_as(x)


class ComplexDQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(ComplexDQN, self).__init__()
        
        # Initial convolutional layers
        self.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2)
        self.bn2 = nn.BatchNorm2d(128)
        
        # Residual blocks
        self.res1 = ResidualBlock(128)
        self.res2 = ResidualBlock(128)
        self.res3 = ResidualBlock(128)
        
        # Attention module
        self.attention = AttentionModule(128)
        
        # Additional convolutional layers
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
        self.bn3 = nn.BatchNorm2d(256)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.bn4 = nn.BatchNorm2d(512)

        # Calculate the size of flattened features
        self.conv_out_size = self._get_conv_out(h, w)

        # Fully connected layers
        self.fc1 = nn.Linear(self.conv_out_size + 9, 2048)
        self.ln1 = nn.LayerNorm(2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.ln2 = nn.LayerNorm(1024)
        self.fc3 = nn.Linear(1024, 512)
        self.ln3 = nn.LayerNorm(512)
        self.fc4 = nn.Linear(512, outputs)

        # Dropout layers
        self.dropout1 = nn.Dropout(0.4)
        self.dropout2 = nn.Dropout(0.4)
        self.dropout3 = nn.Dropout(0.4)

    def _get_conv_out(self, h, w):
        o = F.relu(self.bn1(self.conv1(torch.zeros(1, 4, h, w))))
        o = F.relu(self.bn2(self.conv2(o)))
        o = self.res1(o)
        o = self.res2(o)
        o = self.res3(o)
        o = self.attention(o)
        o = F.relu(self.bn3(self.conv3(o)))
        o = F.relu(self.bn4(self.conv4(o)))
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.res1(x)
        x = self.res2(x)
        x = self.res3(x)
        x = self.attention(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = x.view(x.size(0), -1)
        x = torch.cat((x, additional_inputs), dim=1)
        x = F.relu(self.ln1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.ln2(self.fc2(x)))
        x = self.dropout2(x)
        x = F.relu(self.ln3(self.fc3(x)))
        x = self.dropout3(x)
        return self.fc4(x)


class ComplexDQNAgent:
    def __init__(self, n_actions, memory_size=200000, batch_size=128, gamma=0.99, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.9995, learning_rate=0.0001):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.steps = 0

        self.policy_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=200000, eta_min=1e-6)

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        q_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))
        next_q_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (1 - done_batch) * self.gamma * next_q_values

        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_checkpoint(self, episode, directory="checkpoints"):
        if not os.path.exists(directory):
            os.makedirs(directory)
        checkpoint = {
            'episode': episode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'epsilon': self.epsilon,
            'steps': self.steps
        }
        filename = os.path.join(directory, f"checkpoint_episode_{episode}_step_{self.steps}.pth")
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved: {filename}")

    def load_checkpoint(self, filename):
        checkpoint = torch.load(filename)
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.epsilon = 0.9
        self.steps = checkpoint['steps']
        return checkpoint['episode']

def preprocess_observation(obs):
    speed = np.array(obs[0]).flatten()
    steering = np.array(obs[1]).flatten()
    gyro = np.array(obs[2]).flatten()
    image = np.array(obs[3]).astype(np.float32) / 255.0
    prev_action = np.array(obs[4]).flatten()
    action = np.array(obs[5]).flatten()
    
    speed = np.clip(speed / 300.0, -1, 1)
    steering = np.clip(steering / np.pi, -1, 1)
    gyro = np.clip(gyro / np.pi, -1, 1)
    prev_action = np.clip(prev_action, -1, 1)
    action = np.clip(action, -1, 1)
    
    return (speed, steering, gyro, image, prev_action, action)

def env_action_to_agent_action(env_action, n_actions):
    if isinstance(env_action, (int, np.integer)):
        return env_action
    elif isinstance(env_action, (float, np.float32, np.float64)):
        return int((env_action + 1) * (n_actions - 1) / 2)
    elif isinstance(env_action, np.ndarray):
        return np.argmax(env_action)
    else:
        raise ValueError(f"Unexpected action type: {type(env_action)}")

def agent_action_to_env_action(agent_action, n_actions):
    steering = (agent_action * 2 / (n_actions - 1)) - 1
    throttle = 1.0
    brake = 0.0
    return [throttle, brake, steering]


def detect_crash(obs, prev_obs, speed_threshold=1.0):
    """
    Detect if a crash has occurred based on observations.
    
    :param obs: Current observation
    :param prev_obs: Previous observation
    :param speed_threshold: Threshold for sudden speed change
    :return: True if crash detected, False otherwise
    """
    if prev_obs is None:
        return False
    
    current_speed = np.linalg.norm(obs[0])  # Assuming obs[0] contains speed information
    prev_speed = np.linalg.norm(prev_obs[0])
    
    # Detect sudden drop in speed
    if prev_speed - current_speed > speed_threshold:
        return True
    

def train(env, agent, num_episodes, max_steps_per_episode):
    for episode in range(num_episodes):
        obs, _ = env.reset()
        state = preprocess_observation(obs)
        total_reward = 0
        episode_start_time = time.time()
        prev_obs = None
        crashes = 0

        for step in range(max_steps_per_episode):
            agent_action = agent.select_action(state)
            env_action = agent_action_to_env_action(agent_action.item(), agent.n_actions)
            next_obs, env_reward, terminated, truncated, _ = env.step(env_action)
            next_state = preprocess_observation(next_obs)
            
            # Detect crash
            if detect_crash(next_obs, prev_obs):
                crashes += 1
                env_reward -= 10  # Apply a penalty for crashing
            
            done = terminated or truncated
            
            # Calculate time-based reward
            elapsed_time = time.time() - episode_start_time
            time_factor = 1 / (elapsed_time + 1)  # Adding 1 to avoid division by zero
            reward = env_reward * time_factor * 10  # Multiplying by 10 to make time factor more significant
            
            agent.remember(state, agent_action.item(), reward, next_state, done)
            agent.replay()
            
            state = next_state
            prev_obs = next_obs
            total_reward += reward

            agent.steps += 1
            agent.scheduler.step()  # Update learning rate

            # Save checkpoint every 1000 steps
            if agent.steps % 1000 == 0:
                agent.save_checkpoint(episode)

            if done:
                break

        if episode % 5 == 0:
            agent.update_target_network()

        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Crashes: {crashes}, Epsilon: {agent.epsilon:.4f}, LR: {agent.scheduler.get_last_lr()[0]:.6f}, Time: {elapsed_time:.2f}s")

if __name__ == "__main__":
    env = get_environment()
    n_actions = 3  # Number of possible actions (adjust as needed)
    agent = ComplexDQNAgent(n_actions)
    agent.load_checkpoint("checkpoints\checkpoint_episode_285_step_38000.pth")
    train(env, agent, num_episodes=5000, max_steps_per_episode=5000)

cuda


  from .autonotebook import tqdm as notebook_tqdm


Episode 0, Total Reward: -102.13, Crashes: 3, Epsilon: 0.9000, LR: 0.000091, Time: 4.74s




Episode 1, Total Reward: -11.43, Crashes: 2, Epsilon: 0.8924, LR: 0.000091, Time: 13.72s




Episode 2, Total Reward: -96.86, Crashes: 17, Epsilon: 0.8608, LR: 0.000091, Time: 45.33s




Episode 3, Total Reward: -33.63, Crashes: 2, Epsilon: 0.8304, LR: 0.000091, Time: 45.72s




In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
from tmrl import get_environment
import os
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.ln1 = nn.GroupNorm(32, channels)  # Using GroupNorm instead of LayerNorm
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.ln2 = nn.GroupNorm(32, channels)  # Using GroupNorm instead of LayerNorm

    def forward(self, x):
        residual = x
        out = F.relu(self.ln1(self.conv1(x)))
        out = self.ln2(self.conv2(out))
        out += residual
        return F.relu(out)

class AttentionModule(nn.Module):
    def __init__(self, in_channels):
        super(AttentionModule, self).__init__()
        self.conv = nn.Conv2d(in_channels, 1, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)  # Use dim=-1 for the final dimension

    def forward(self, x):
        batch_size, C, H, W = x.size()
        proj = self.conv(x).view(batch_size, 1, -1)
        weights = self.softmax(proj).view(batch_size, 1, H, W)
        return x * weights.expand_as(x)


class ComplexDQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(ComplexDQN, self).__init__()
        
        # Initial convolutional layers
        self.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2)
        self.bn2 = nn.BatchNorm2d(128)
        
        # Residual blocks
        self.res1 = ResidualBlock(128)
        self.res2 = ResidualBlock(128)
        self.res3 = ResidualBlock(128)
        
        # Attention module
        self.attention = AttentionModule(128)
        
        # Additional convolutional layers
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
        self.bn3 = nn.BatchNorm2d(256)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.bn4 = nn.BatchNorm2d(512)

        # Calculate the size of flattened features
        self.conv_out_size = self._get_conv_out(h, w)

        # Fully connected layers
        self.fc1 = nn.Linear(self.conv_out_size + 9, 2048)
        self.ln1 = nn.LayerNorm(2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.ln2 = nn.LayerNorm(1024)
        self.fc3 = nn.Linear(1024, 512)
        self.ln3 = nn.LayerNorm(512)
        self.fc4 = nn.Linear(512, outputs)

        # Dropout layers
        self.dropout1 = nn.Dropout(0.4)
        self.dropout2 = nn.Dropout(0.4)
        self.dropout3 = nn.Dropout(0.4)

    def _get_conv_out(self, h, w):
        o = F.relu(self.bn1(self.conv1(torch.zeros(1, 4, h, w))))
        o = F.relu(self.bn2(self.conv2(o)))
        o = self.res1(o)
        o = self.res2(o)
        o = self.res3(o)
        o = self.attention(o)
        o = F.relu(self.bn3(self.conv3(o)))
        o = F.relu(self.bn4(self.conv4(o)))
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.res1(x)
        x = self.res2(x)
        x = self.res3(x)
        x = self.attention(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = x.view(x.size(0), -1)
        x = torch.cat((x, additional_inputs), dim=1)
        x = F.relu(self.ln1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.ln2(self.fc2(x)))
        x = self.dropout2(x)
        x = F.relu(self.ln3(self.fc3(x)))
        x = self.dropout3(x)
        return self.fc4(x)


class ComplexDQNAgent:
    def __init__(self, n_actions, memory_size=200000, batch_size=128, gamma=0.99, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.9995, learning_rate=0.0001):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.steps = 0

        self.policy_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=200000, eta_min=1e-6)

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        q_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))
        next_q_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (1 - done_batch) * self.gamma * next_q_values

        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_checkpoint(self, episode, directory="checkpoints"):
        if not os.path.exists(directory):
            os.makedirs(directory)
        checkpoint = {
            'episode': episode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'epsilon': self.epsilon,
            'steps': self.steps
        }
        filename = os.path.join(directory, f"checkpoint_episode_{episode}_step_{self.steps}.pth")
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved: {filename}")

    def load_checkpoint(self, filename):
        checkpoint = torch.load(filename)
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.epsilon = 0.9
        self.steps = checkpoint['steps']
        return checkpoint['episode']


cuda


In [None]:

class SimpleDQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(SimpleDQN, self).__init__()
        
        # Simplified convolutional layers
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Calculate the size of flattened features
        conv_out_size = self._get_conv_out(h, w)
        
        # Simplified fully connected layers
        self.fc1 = nn.Linear(conv_out_size + 9, 512)
        self.fc2 = nn.Linear(512, outputs)

    def _get_conv_out(self, h, w):
        o = F.relu(self.conv1(torch.zeros(1, 4, h, w)))
        o = F.relu(self.conv2(o))
        o = F.relu(self.conv3(o))
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.cat((x, additional_inputs), dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

class SimpleDQNAgent:
    def __init__(self, n_actions, memory_size=100000, batch_size=32, gamma=0.99, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.995, learning_rate=0.001):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.steps = 0

        self.policy_net = SimpleDQN(64, 64, n_actions).to(device)
        self.target_net = SimpleDQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        q_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))
        next_q_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (1 - done_batch) * self.gamma * next_q_values

        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_checkpoint(self, episode, directory="checkpoints_simple"):
        if not os.path.exists(directory):
            os.makedirs(directory)
        checkpoint = {
            'episode': episode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'steps': self.steps
        }
        filename = os.path.join(directory, f"checkpoint_episode_{episode}_step_{self.steps}.pth")
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved: {filename}")

    def load_checkpoint(self, filename):
        checkpoint = torch.load(filename)
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.steps = checkpoint['steps']
        return checkpoint['episode']

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from collections import deque
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ComplexDQN(nn.Module):
    def __init__(self, h, w, outputs, additional_inputs=9):
        super(ComplexDQN, self).__init__()
        
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        
        conv_out_size = self._get_conv_out(h, w)
        
        self.fc1 = nn.Linear(conv_out_size + additional_inputs, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, outputs)
        
        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)

    def _get_conv_out(self, h, w):
        o = F.relu(self.bn1(self.conv1(torch.zeros(1, 4, h, w))))
        o = F.relu(self.bn2(self.conv2(o)))
        o = F.relu(self.bn3(self.conv3(o)))
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        
        x = x.view(x.size(0), -1)
        x = torch.cat((x, additional_inputs), dim=1)
        
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        return self.fc3(x)

class ComplexDQNAgent:
    def __init__(self, n_actions, memory_size=100000, batch_size=64, gamma=0.99, 
                 epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.999, 
                 learning_rate=0.0005, target_update=10, 
                 checkpoint_dir="checkpoints_complex"):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.target_update = target_update
        self.checkpoint_dir = checkpoint_dir
        self.steps = 0

        # Initialize networks
        self.policy_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10000, gamma=0.95)

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        state_action_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))
        next_state_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()
        expected_state_action_values = reward_batch + (1 - done_batch) * self.gamma * next_state_values

        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()
        self.scheduler.step()

        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_checkpoint(self, episode):
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        checkpoint = {
            'episode': episode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'epsilon': self.epsilon,
            'steps': self.steps
        }
        filename = os.path.join(self.checkpoint_dir, f"checkpoint_episode_{episode}_step_{self.steps}.pth")
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved: {filename}")

    def load_checkpoint(self, filename):
        checkpoint = torch.load(filename)
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.steps = checkpoint['steps']
        return checkpoint['episode']

# The train and evaluate methods remain the same as in the previous implementation

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
from tmrl import get_environment
import os
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


class SimpleDQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(SimpleDQN, self).__init__()
        
        # Simplified convolutional layers
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Calculate the size of flattened features
        conv_out_size = self._get_conv_out(h, w)
        
        # Simplified fully connected layers
        self.fc1 = nn.Linear(conv_out_size + 9, 512)
        self.fc2 = nn.Linear(512, outputs)

    def _get_conv_out(self, h, w):
        o = F.relu(self.conv1(torch.zeros(1, 4, h, w)))
        o = F.relu(self.conv2(o))
        o = F.relu(self.conv3(o))
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.cat((x, additional_inputs), dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

class SimpleDQNAgent:
    def __init__(self, n_actions, memory_size=100000, batch_size=32, gamma=0.99, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.995, learning_rate=0.001):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.steps = 0

        self.policy_net = SimpleDQN(64, 64, n_actions).to(device)
        self.target_net = SimpleDQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        q_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))
        next_q_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (1 - done_batch) * self.gamma * next_q_values

        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_checkpoint(self, episode, directory="checkpoints_simple_left_turn"):
        if not os.path.exists(directory):
            os.makedirs(directory)
        checkpoint = {
            'episode': episode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'steps': self.steps
        }
        filename = os.path.join(directory, f"checkpoint_episode_{episode}_step_{self.steps}.pth")
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved: {filename}")

    def load_checkpoint(self, filename):
        checkpoint = torch.load(filename)
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.steps = checkpoint['steps']
        return checkpoint['episode']

def preprocess_observation(obs):
    speed = np.array(obs[0]).flatten()
    steering = np.array(obs[1]).flatten()
    gyro = np.array(obs[2]).flatten()
    image = np.array(obs[3]).astype(np.float32) / 255.0
    prev_action = np.array(obs[4]).flatten()
    action = np.array(obs[5]).flatten()
    
    speed = np.clip(speed / 300.0, -1, 1)
    steering = np.clip(steering / np.pi, -1, 1)
    gyro = np.clip(gyro / np.pi, -1, 1)
    prev_action = np.clip(prev_action, -1, 1)
    action = np.clip(action, -1, 1)
    
    return (speed, steering, gyro, image, prev_action, action)

def env_action_to_agent_action(env_action, n_actions):
    if isinstance(env_action, (int, np.integer)):
        return env_action
    elif isinstance(env_action, (float, np.float32, np.float64)):
        return int((env_action + 1) * (n_actions - 1) / 2)
    elif isinstance(env_action, np.ndarray):
        return np.argmax(env_action)
    else:
        raise ValueError(f"Unexpected action type: {type(env_action)}")

def agent_action_to_env_action(agent_action, n_actions):
    steering = (agent_action * 2 / (n_actions - 1)) - 1
    throttle = 1.0
    brake = 0.0
    return [throttle, brake, steering]

def detect_crash(obs, prev_obs, speed_threshold=1.0):
    if prev_obs is None:
        return False
    
    current_speed = np.linalg.norm(obs[0])
    prev_speed = np.linalg.norm(prev_obs[0])
    
    if prev_speed - current_speed > speed_threshold:
        return True
    
    return False

def train(env, agent, num_episodes, max_steps_per_episode):
    for episode in range(num_episodes):
        obs, _ = env.reset()
        state = preprocess_observation(obs)
        total_reward = 0
        episode_start_time = time.time()
        prev_obs = None
        crashes = 0

        for step in range(max_steps_per_episode):
            agent_action = agent.select_action(state)
            env_action = agent_action_to_env_action(agent_action.item(), agent.n_actions)
            next_obs, env_reward, terminated, truncated, _ = env.step(env_action)
            next_state = preprocess_observation(next_obs)
            
            if detect_crash(next_obs, prev_obs):
                crashes += 1
                env_reward-= 10
            done = terminated or truncated
            
            elapsed_time = time.time() - episode_start_time
            time_factor = 1 / (elapsed_time + 1)
            reward = (env_reward)
            
            agent.remember(state, agent_action.item(), reward, next_state, done)
            agent.replay()
            
            state = next_state
            prev_obs = next_obs
            total_reward += reward

            agent.steps += 1

            if agent.steps % 1000 == 0:
                agent.save_checkpoint(episode)

            if done:
                break
        # total_reward-= 20*crashes
        if episode % 10 == 0:
            agent.update_target_network()

        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Crashes: {crashes}, Epsilon: {agent.epsilon:.4f}, Time: {elapsed_time:.2f}s")

if __name__ == "__main__":
    env = get_environment()
    n_actions = 3
    agent = SimpleDQNAgent(n_actions)
    train(env, agent, num_episodes=5000, max_steps_per_episode=5000)

cuda


  from .autonotebook import tqdm as notebook_tqdm


Episode 0, Total Reward: 472.86, Crashes: 15, Epsilon: 0.2799, Time: 15.64s
Episode 1, Total Reward: 22.60, Crashes: 7, Epsilon: 0.1320, Time: 8.20s
Episode 2, Total Reward: 56.46, Crashes: 3, Epsilon: 0.0920, Time: 4.31s
Episode 3, Total Reward: 422.86, Crashes: 20, Epsilon: 0.0154, Time: 18.50s
Episode 4, Total Reward: 56.21, Crashes: 3, Epsilon: 0.0108, Time: 4.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_5_step_1000.pth
Episode 5, Total Reward: 26.66, Crashes: 6, Epsilon: 0.0100, Time: 4.30s
Episode 6, Total Reward: 37.85, Crashes: 5, Epsilon: 0.0100, Time: 5.05s
Episode 7, Total Reward: 49.54, Crashes: 4, Epsilon: 0.0100, Time: 4.85s
Episode 8, Total Reward: -90.37, Crashes: 20, Epsilon: 0.0100, Time: 16.00s
Episode 9, Total Reward: 18.12, Crashes: 7, Epsilon: 0.0100, Time: 4.85s
Episode 10, Total Reward: -67.14, Crashes: 19, Epsilon: 0.0100, Time: 16.95s
Episode 11, Total Reward: 35.61, Crashes: 5, Epsilon: 0.0100, Time: 4.30s
Checkpoint saved: checkpoin



Episode 40, Total Reward: -62.92, Crashes: 18, Epsilon: 0.0100, Time: 18.32s
Episode 41, Total Reward: 35.41, Crashes: 5, Epsilon: 0.0100, Time: 4.30s
Episode 42, Total Reward: 46.45, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 43, Total Reward: 45.17, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 44, Total Reward: 36.87, Crashes: 5, Epsilon: 0.0100, Time: 4.50s
Episode 45, Total Reward: 8.37, Crashes: 8, Epsilon: 0.0100, Time: 5.95s
Episode 46, Total Reward: 45.80, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 47, Total Reward: 38.24, Crashes: 5, Epsilon: 0.0100, Time: 4.60s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_48_step_7000.pth
Episode 48, Total Reward: -77.89, Crashes: 19, Epsilon: 0.0100, Time: 17.25s
Episode 49, Total Reward: 57.23, Crashes: 3, Epsilon: 0.0100, Time: 4.30s
Episode 50, Total Reward: 28.72, Crashes: 6, Epsilon: 0.0100, Time: 4.30s
Episode 51, Total Reward: 45.85, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 52, Total Rewar



Episode 221, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 12.85s
Episode 222, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.95s
Episode 223, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.45s
Episode 224, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.60s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_225_step_44000.pth
Episode 225, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 13.75s
Episode 226, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 15.05s
Episode 227, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 11.85s
Episode 228, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 10.90s
Episode 229, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.50s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_230_step_45000.pth
Episode 230, Total Reward: 392.86, Crashes: 23, Epsilon: 0.0100, Time: 13.80s
Episode 231, Total Reward: 492.86, Crashes: 13, Eps



Episode 263, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.94s
Episode 264, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 13.05s
Episode 265, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 14.20s
Episode 266, Total Reward: 41.52, Crashes: 5, Epsilon: 0.0100, Time: 6.75s
Episode 267, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 15.60s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_268_step_54000.pth
Episode 268, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 14.90s
Episode 269, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 13.55s
Episode 270, Total Reward: 432.87, Crashes: 19, Epsilon: 0.0100, Time: 15.15s
Episode 271, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_272_step_55000.pth
Episode 272, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.00s




Episode 273, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 13.34s
Episode 274, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 12.25s
Episode 275, Total Reward: 1.02, Crashes: 9, Epsilon: 0.0100, Time: 5.70s
Episode 276, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_277_step_56000.pth
Episode 277, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 14.55s
Episode 278, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 10.40s
Episode 279, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 13.80s
Episode 280, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.30s
Episode 281, Total Reward: 69.89, Crashes: 2, Epsilon: 0.0100, Time: 4.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_282_step_57000.pth
Episode 282, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 14.20s
Episode 283, Total Reward: 532.86, Crashes: 9, Epsilon: 



Episode 337, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.56s
Episode 338, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.65s
Episode 339, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_340_step_71000.pth
Episode 340, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 13.40s
Episode 341, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 11.70s
Episode 342, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 11.40s
Episode 343, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_344_step_72000.pth
Episode 344, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 11.00s
Episode 345, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 13.25s
Episode 346, Total Reward: 422.87, Crashes: 20, Epsilon: 0.0100, Time: 12.95s
Episode 347, Total Reward: 552.86, Crashes: 7, Eps



Episode 357, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.75s
Episode 358, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.40s
Episode 359, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 12.05s
Episode 360, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 11.60s
Episode 361, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.20s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_362_step_76000.pth
Episode 362, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 12.20s
Episode 363, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.45s
Episode 364, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 12.25s
Episode 365, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 14.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_366_step_77000.pth
Episode 366, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.20s
Episode 367, Total Reward: 582.86, Crashes: 4, Eps



Episode 413, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 16.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_414_step_88000.pth
Episode 414, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.80s
Episode 415, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.60s
Episode 416, Total Reward: 432.86, Crashes: 19, Epsilon: 0.0100, Time: 14.25s
Episode 417, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 12.05s
Episode 418, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.20s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_419_step_89000.pth
Episode 419, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 11.50s
Episode 420, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 14.55s
Episode 421, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.90s
Episode 422, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.30s
Checkpoint saved: checkpoints_simple_right_turn\chec



Episode 437, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.58s
Episode 438, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.90s
Episode 439, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 12.05s
Episode 440, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.20s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_441_step_94000.pth
Episode 441, Total Reward: 452.87, Crashes: 17, Epsilon: 0.0100, Time: 11.95s
Episode 442, Total Reward: -77.14, Crashes: 20, Epsilon: 0.0100, Time: 10.65s
Episode 443, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.70s
Episode 444, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.90s
Episode 445, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_446_step_95000.pth
Episode 446, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.18s
Episode 447, Total Reward: 622.86, Crashes: 0, Eps



Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_451_step_96000.pth
Episode 451, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.85s
Episode 452, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.85s
Episode 453, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.20s
Episode 454, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 11.15s
Episode 455, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.40s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_456_step_97000.pth
Episode 456, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.30s
Episode 457, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.40s
Episode 458, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 10.55s
Episode 459, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 16.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_460_step_98000.pth
Episode 460, Total Reward: 512.86, Crashes: 



Episode 490, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.51s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_491_step_105000.pth
Episode 491, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 10.55s
Episode 492, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.75s
Episode 493, Total Reward: 402.86, Crashes: 22, Epsilon: 0.0100, Time: 11.60s
Episode 494, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_495_step_106000.pth
Episode 495, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 12.45s
Episode 496, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.50s




Episode 497, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.11s
Episode 498, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.25s
Episode 499, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_500_step_107000.pth
Episode 500, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 12.65s
Episode 501, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.95s
Episode 502, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 12.25s
Episode 503, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_504_step_108000.pth
Episode 504, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.20s
Episode 505, Total Reward: 372.86, Crashes: 25, Epsilon: 0.0100, Time: 13.60s
Episode 506, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.10s
Episode 507, Total Reward: 502.86, Crashes: 12, Epsil



Episode 534, Total Reward: 12.86, Crashes: 11, Epsilon: 0.0100, Time: 14.45s
Episode 535, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.20s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_536_step_115000.pth
Episode 536, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 12.55s
Episode 537, Total Reward: 422.87, Crashes: 20, Epsilon: 0.0100, Time: 13.95s
Episode 538, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.60s
Episode 539, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 13.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_540_step_116000.pth
Episode 540, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 14.15s
Episode 541, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.65s
Episode 542, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 13.15s
Episode 543, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.35s
Episode 544, Total Reward: 44.94, Crashes: 4, Epsi



Episode 609, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 15.45s
Episode 610, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 14.50s
Episode 611, Total Reward: 44.74, Crashes: 6, Epsilon: 0.0100, Time: 10.30s
Episode 612, Total Reward: 12.86, Crashes: 11, Epsilon: 0.0100, Time: 14.65s
Episode 613, Total Reward: 18.81, Crashes: 7, Epsilon: 0.0100, Time: 5.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_614_step_134000.pth
Episode 614, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 13.70s
Episode 615, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 15.90s
Episode 616, Total Reward: 0.52, Crashes: 10, Epsilon: 0.0100, Time: 9.00s
Episode 617, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 14.40s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_618_step_135000.pth
Episode 618, Total Reward: 402.86, Crashes: 22, Epsilon: 0.0100, Time: 11.85s
Episode 619, Total Reward: -77.14, Crashes: 20, Epsilon



Episode 681, Total Reward: 422.86, Crashes: 20, Epsilon: 0.0100, Time: 11.78s
Episode 682, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 10.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_683_step_152000.pth
Episode 683, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.45s
Episode 684, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 14.10s
Episode 685, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 11.15s
Episode 686, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 12.95s
Episode 687, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 13.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_688_step_153000.pth
Episode 688, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 14.75s
Episode 689, Total Reward: 462.85, Crashes: 16, Epsilon: 0.0100, Time: 12.55s
Episode 690, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 12.15s
Episode 691, Total Reward: 492.86, Crashes: 13,



Episode 714, Total Reward: 54.78, Crashes: 5, Epsilon: 0.0100, Time: 8.37s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_715_step_159000.pth
Episode 715, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.80s
Episode 716, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.30s
Episode 717, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 13.90s
Episode 718, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 10.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_719_step_160000.pth
Episode 719, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 11.75s
Episode 720, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.95s
Episode 721, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 12.90s
Episode 722, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.65s
Episode 723, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 12.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpo



Episode 730, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 15.59s
Episode 731, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 10.50s
Episode 732, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_733_step_163000.pth
Episode 733, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 13.65s
Episode 734, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 14.00s
Episode 735, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 13.60s




Episode 736, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 10.72s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_737_step_164000.pth
Episode 737, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 15.10s
Episode 738, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 14.40s
Episode 739, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 11.20s
Episode 740, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_741_step_165000.pth
Episode 741, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 13.25s
Episode 742, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 13.30s
Episode 743, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.20s
Episode 744, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_745_step_166000.pth
Episode 745, Total Reward: 582.87, Cras



Episode 746, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 16.19s
Episode 747, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 11.05s
Episode 748, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.60s
Episode 749, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_750_step_167000.pth
Episode 750, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.70s
Episode 751, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.50s
Episode 752, Total Reward: 46.26, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 753, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 16.30s
Episode 754, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_755_step_168000.pth
Episode 755, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 16.30s
Episode 756, Total Reward: 522.87, Crashes: 10, Epsilon



Episode 796, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 11.57s
Episode 797, Total Reward: -43.03, Crashes: 16, Epsilon: 0.0100, Time: 12.05s
Episode 798, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 14.35s
Episode 799, Total Reward: -121.01, Crashes: 24, Epsilon: 0.0100, Time: 17.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_800_step_179000.pth
Episode 800, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 14.20s
Episode 801, Total Reward: -67.14, Crashes: 19, Epsilon: 0.0100, Time: 17.25s
Episode 802, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_803_step_180000.pth
Episode 803, Total Reward: -47.14, Crashes: 17, Epsilon: 0.0100, Time: 15.10s
Episode 804, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 14.10s
Episode 805, Total Reward: -65.02, Crashes: 17, Epsilon: 0.0100, Time: 8.50s
Episode 806, Total Reward: -5.58, Crashes: 10, Ep



Episode 860, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 14.35s
Episode 861, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 12.35s
Episode 862, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.90s
Episode 863, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 12.60s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_864_step_194000.pth
Episode 864, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 15.20s
Episode 865, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 11.20s
Episode 866, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 11.20s
Episode 867, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_868_step_195000.pth
Episode 868, Total Reward: 472.87, Crashes: 15, Epsilon: 0.0100, Time: 12.20s
Episode 869, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 11.25s
Episode 870, Total Reward: 552.86, Crashes: 7, Eps



Episode 884, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.06s
Episode 885, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 15.00s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_886_step_199000.pth
Episode 886, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 13.15s
Episode 887, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.50s
Episode 888, Total Reward: 46.10, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 889, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 12.85s
Episode 890, Total Reward: 472.87, Crashes: 15, Epsilon: 0.0100, Time: 13.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_891_step_200000.pth
Episode 891, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.25s
Episode 892, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.80s
Episode 893, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.00s
Episode 894, Total Reward: 592.87, Crashes: 3, Epsilon



Episode 915, Total Reward: 84.98, Crashes: 0, Epsilon: 0.0100, Time: 8.97s




Episode 916, Total Reward: 74.64, Crashes: 1, Epsilon: 0.0100, Time: 5.16s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_917_step_206000.pth
Episode 917, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.45s
Episode 918, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.55s
Episode 919, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 11.40s
Episode 920, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 14.95s
Episode 921, Total Reward: 46.00, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 922, Total Reward: 36.89, Crashes: 5, Epsilon: 0.0100, Time: 4.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_923_step_207000.pth
Episode 923, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 13.15s
Episode 924, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.75s
Episode 925, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 16.35s
Episode 926, Total Reward: -12.13, Crashes: 13, Epsilon: 



Episode 935, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.53s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_936_step_210000.pth
Episode 936, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 15.35s
Episode 937, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 10.95s
Episode 938, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 13.85s
Episode 939, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_940_step_211000.pth
Episode 940, Total Reward: 462.87, Crashes: 16, Epsilon: 0.0100, Time: 14.55s
Episode 941, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 11.15s
Episode 942, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 10.05s
Episode 943, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 16.60s
Episode 944, Total Reward: 35.70, Crashes: 5, Epsilon: 0.0100, Time: 4.30s
Checkpoint saved: checkpoints_simple_right_turn\chec



Episode 977, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 13.07s
Episode 978, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 12.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_979_step_220000.pth
Episode 979, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 14.75s
Episode 980, Total Reward: 45.05, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 981, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 11.50s
Episode 982, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 15.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_983_step_221000.pth




Episode 983, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 16.98s
Episode 984, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 18.05s
Episode 985, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 14.55s
Episode 986, Total Reward: 55.25, Crashes: 3, Epsilon: 0.0100, Time: 4.30s
Episode 987, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 14.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_988_step_222000.pth
Episode 988, Total Reward: 442.87, Crashes: 18, Epsilon: 0.0100, Time: 13.75s
Episode 989, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 13.70s
Episode 990, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.10s
Episode 991, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_992_step_223000.pth
Episode 992, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.70s
Episode 993, Total Reward: 562.86, Crashes: 6, Epsilo



Episode 1069, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 18.59s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1070_step_240000.pth
Episode 1070, Total Reward: 422.87, Crashes: 20, Epsilon: 0.0100, Time: 14.30s
Episode 1071, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 10.00s
Episode 1072, Total Reward: 472.87, Crashes: 15, Epsilon: 0.0100, Time: 13.75s
Episode 1073, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 15.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1074_step_241000.pth
Episode 1074, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.80s
Episode 1075, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 16.45s
Episode 1076, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.70s
Episode 1077, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 12.65s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1078_step_242000.pth
Episode 1078, Total Reward: 4



Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1093_step_245000.pth
Episode 1093, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 17.91s
Episode 1094, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 10.80s
Episode 1095, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.10s
Episode 1096, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 15.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1097_step_246000.pth
Episode 1097, Total Reward: 392.85, Crashes: 23, Epsilon: 0.0100, Time: 18.50s
Episode 1098, Total Reward: 452.87, Crashes: 17, Epsilon: 0.0100, Time: 14.05s
Episode 1099, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.40s
Episode 1100, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 14.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1101_step_247000.pth
Episode 1101, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 16.50s
Episode 1102, Total Reward



Episode 1133, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 18.39s
Episode 1134, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 11.50s
Episode 1135, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 12.15s
Episode 1136, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.35s
Episode 1137, Total Reward: 27.75, Crashes: 6, Epsilon: 0.0100, Time: 4.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1138_step_255000.pth
Episode 1138, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.35s
Episode 1139, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 11.70s
Episode 1140, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 12.45s
Episode 1141, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.92s
Episode 1142, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.40s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1143_step_256000.pth
Episode 1143, Total Reward: 472.87, Crash



Episode 1164, Total Reward: 45.44, Crashes: 4, Epsilon: 0.0100, Time: 4.62s
Episode 1165, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 14.15s
Episode 1166, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 14.60s
Episode 1167, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 13.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1168_step_261000.pth
Episode 1168, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 13.70s
Episode 1169, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 11.05s
Episode 1170, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 13.05s
Episode 1171, Total Reward: 422.87, Crashes: 20, Epsilon: 0.0100, Time: 13.40s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1172_step_262000.pth
Episode 1172, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 11.70s
Episode 1173, Total Reward: -67.13, Crashes: 19, Epsilon: 0.0100, Time: 17.30s
Episode 1174, Total Reward: 512.87, Cra



Episode 1210, Total Reward: 372.87, Crashes: 25, Epsilon: 0.0100, Time: 14.86s
Episode 1211, Total Reward: 30.45, Crashes: 7, Epsilon: 0.0100, Time: 10.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1212_step_271000.pth
Episode 1212, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 12.00s
Episode 1213, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 12.00s
Episode 1214, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.55s




Episode 1215, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.99s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1216_step_272000.pth
Episode 1216, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 13.50s
Episode 1217, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.65s
Episode 1218, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 14.20s
Episode 1219, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 11.55s




Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1220_step_273000.pth
Episode 1220, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 14.13s
Episode 1221, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 13.65s
Episode 1222, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 13.10s




Episode 1223, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 10.66s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1224_step_274000.pth
Episode 1224, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.85s
Episode 1225, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 13.25s
Episode 1226, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.30s
Episode 1227, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.80s
Episode 1228, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.00s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1229_step_275000.pth
Episode 1229, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.60s
Episode 1230, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.55s
Episode 1231, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.60s
Episode 1232, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 14.05s
Checkpoint saved: checkpoints_simple_rig



Episode 1233, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.65s
Episode 1234, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 13.15s
Episode 1235, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.15s
Episode 1236, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 10.65s
Episode 1237, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1238_step_277000.pth
Episode 1238, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 1239, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 14.10s
Episode 1240, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Episode 1241, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.05s
Episode 1242, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1243_step_278000.pth
Episode 1243, Total Reward: 552.86, Crashe



Episode 1253, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.74s
Episode 1254, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.15s
Episode 1255, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 12.00s
Episode 1256, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.10s
Episode 1257, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 11.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1258_step_281000.pth
Episode 1258, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 10.55s
Episode 1259, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.85s
Episode 1260, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 12.75s
Episode 1261, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.15s
Episode 1262, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1263_step_282000.pth
Episode 1263, Total Reward: 502.87, Crash



Episode 1286, Total Reward: -38.87, Crashes: 15, Epsilon: 0.0100, Time: 10.03s




Episode 1287, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.42s




Episode 1288, Total Reward: 86.92, Crashes: 1, Epsilon: 0.0100, Time: 33.83s




Episode 1289, Total Reward: 28.24, Crashes: 8, Epsilon: 0.0100, Time: 18.53s




Episode 1290, Total Reward: -7.14, Crashes: 13, Epsilon: 0.0100, Time: 13.27s




Episode 1291, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.52s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1292_step_288000.pth
Episode 1292, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 10.60s
Episode 1293, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 13.10s
Episode 1294, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 14.10s
Episode 1295, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1296_step_289000.pth
Episode 1296, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 11.30s
Episode 1297, Total Reward: -77.13, Crashes: 20, Epsilon: 0.0100, Time: 11.60s
Episode 1298, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 14.10s
Episode 1299, Total Reward: 462.87, Crashes: 16, Epsilon: 0.0100, Time: 16.60s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1300_step_290000.pth
Episode 1300, Total Reward: 



Episode 1335, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 14.03s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1336_step_298000.pth
Episode 1336, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.90s
Episode 1337, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.80s
Episode 1338, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.25s
Episode 1339, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 14.20s
Episode 1340, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 12.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1341_step_299000.pth
Episode 1341, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.70s
Episode 1342, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.70s
Episode 1343, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 12.75s
Episode 1344, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.25s
Checkpoint saved: checkpoints_simple_r



Episode 1357, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 16.51s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1358_step_303000.pth
Episode 1358, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.00s
Episode 1359, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 11.80s
Episode 1360, Total Reward: 432.87, Crashes: 19, Epsilon: 0.0100, Time: 14.10s
Episode 1361, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.30s
Episode 1362, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1363_step_304000.pth




Episode 1363, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 12.05s
Episode 1364, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 11.85s
Episode 1365, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 11.00s
Episode 1366, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 15.50s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1367_step_305000.pth
Episode 1367, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.85s
Episode 1368, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.15s
Episode 1369, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.70s
Episode 1370, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 15.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1371_step_306000.pth
Episode 1371, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.15s
Episode 1372, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.85s
Episode 1373, Total Reward: 622.86, Cra



Episode 1387, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 13.31s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1388_step_310000.pth
Episode 1388, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 13.00s
Episode 1389, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 10.75s
Episode 1390, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.05s
Episode 1391, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.50s
Episode 1392, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 10.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1393_step_311000.pth




Episode 1393, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 11.17s
Episode 1394, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.95s
Episode 1395, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.90s
Episode 1396, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.90s
Episode 1397, Total Reward: 10.05, Crashes: 8, Epsilon: 0.0100, Time: 4.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1398_step_312000.pth
Episode 1398, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.00s
Episode 1399, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 11.60s
Episode 1400, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 12.00s
Episode 1401, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 13.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1402_step_313000.pth
Episode 1402, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 15.95s
Episode 1403, Total Reward: 462.86, Crash



Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1416_step_316000.pth
Episode 1416, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.65s
Episode 1417, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.30s
Episode 1418, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 16.35s
Episode 1419, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1420_step_317000.pth
Episode 1420, Total Reward: 422.86, Crashes: 20, Epsilon: 0.0100, Time: 13.05s
Episode 1421, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 14.10s
Episode 1422, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.55s
Episode 1423, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 16.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1424_step_318000.pth
Episode 1424, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.40s
Episode 1425, Total Reward



Episode 1431, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.37s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1432_step_320000.pth
Episode 1432, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 13.35s
Episode 1433, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.45s
Episode 1434, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 16.75s
Episode 1435, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 11.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1436_step_321000.pth
Episode 1436, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 13.55s
Episode 1437, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 14.20s
Episode 1438, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.55s
Episode 1439, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1440_step_322000.pth
Episode 1440, Total Reward: 



Episode 1443, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.53s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1444_step_323000.pth
Episode 1444, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.20s
Episode 1445, Total Reward: 402.87, Crashes: 22, Epsilon: 0.0100, Time: 15.25s
Episode 1446, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.45s
Episode 1447, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 14.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1448_step_324000.pth
Episode 1448, Total Reward: 432.87, Crashes: 19, Epsilon: 0.0100, Time: 13.65s
Episode 1449, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 13.70s
Episode 1450, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 13.20s
Episode 1451, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 10.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1452_step_325000.pth
Episode 1452, Total Reward: 4



Episode 1457, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 13.06s
Episode 1458, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 10.80s
Episode 1459, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Episode 1460, Total Reward: 432.86, Crashes: 19, Epsilon: 0.0100, Time: 16.65s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1461_step_327000.pth
Episode 1461, Total Reward: 452.87, Crashes: 17, Epsilon: 0.0100, Time: 12.85s
Episode 1462, Total Reward: -54.93, Crashes: 16, Epsilon: 0.0100, Time: 9.85s
Episode 1463, Total Reward: 51.13, Crashes: 4, Epsilon: 0.0100, Time: 6.10s
Episode 1464, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 18.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1465_step_328000.pth
Episode 1465, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.40s
Episode 1466, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 13.30s




Episode 1467, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 15.49s
Episode 1468, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1469_step_329000.pth
Episode 1469, Total Reward: 12.86, Crashes: 11, Epsilon: 0.0100, Time: 12.65s
Episode 1470, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Episode 1471, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 14.70s
Episode 1472, Total Reward: -60.93, Crashes: 17, Epsilon: 0.0100, Time: 10.15s
Episode 1473, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 13.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1474_step_330000.pth




Episode 1474, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.90s
Episode 1475, Total Reward: -16.43, Crashes: 11, Epsilon: 0.0100, Time: 5.50s
Episode 1476, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.10s
Episode 1477, Total Reward: 282.87, Crashes: 34, Epsilon: 0.0100, Time: 15.05s
Episode 1478, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 14.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1479_step_331000.pth
Episode 1479, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 13.25s
Episode 1480, Total Reward: 462.87, Crashes: 16, Epsilon: 0.0100, Time: 13.90s
Episode 1481, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 13.00s
Episode 1482, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1483_step_332000.pth
Episode 1483, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.55s
Episode 1484, Total Reward: 482.86, Cra



Episode 1501, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 13.90s
Episode 1502, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 13.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1503_step_337000.pth
Episode 1503, Total Reward: 412.86, Crashes: 21, Epsilon: 0.0100, Time: 14.10s
Episode 1504, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 13.25s
Episode 1505, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.65s
Episode 1506, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.05s
Episode 1507, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 14.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1508_step_338000.pth
Episode 1508, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.10s
Episode 1509, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 14.60s
Episode 1510, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 12.45s
Episode 1511, Total Reward: 622.86, C



Episode 1533, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 18.21s
Episode 1534, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 12.50s




Episode 1535, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 11.78s
Episode 1536, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 13.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1537_step_345000.pth
Episode 1537, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.15s
Episode 1538, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 15.40s
Episode 1539, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 14.35s
Episode 1540, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1541_step_346000.pth
Episode 1541, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.55s
Episode 1542, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 16.35s
Episode 1543, Total Reward: 482.84, Crashes: 14, Epsilon: 0.0100, Time: 13.20s
Episode 1544, Total Reward: 422.86, Crashes: 20, Epsilon: 0.0100, Time: 12.85s
Checkpoint saved: checkpoints_simple_ri



Episode 1579, Total Reward: 32.92, Crashes: 6, Epsilon: 0.0100, Time: 5.60s
Episode 1580, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 13.20s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1581_step_355000.pth
Episode 1581, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.65s
Episode 1582, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 14.85s
Episode 1583, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.55s
Episode 1584, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1585_step_356000.pth




Episode 1585, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 14.21s
Episode 1586, Total Reward: 442.87, Crashes: 18, Epsilon: 0.0100, Time: 11.45s
Episode 1587, Total Reward: -47.14, Crashes: 17, Epsilon: 0.0100, Time: 12.00s
Episode 1588, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.66s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1589_step_357000.pth
Episode 1589, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 13.90s
Episode 1590, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.55s
Episode 1591, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.25s
Episode 1592, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.50s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1593_step_358000.pth
Episode 1593, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 13.05s
Episode 1594, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 12.65s
Episode 1595, Total Reward: 502.86, Cra



Episode 1619, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.72s
Episode 1620, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 12.85s
Episode 1621, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.60s
Episode 1622, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 12.35s
Episode 1623, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1624_step_365000.pth
Episode 1624, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 1625, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.15s
Episode 1626, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.75s
Episode 1627, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1628_step_366000.pth
Episode 1628, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 14.05s
Episode 1629, Total Reward: 612.86, Crashes:



Episode 1633, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.06s
Episode 1634, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 12.25s
Episode 1635, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 10.65s
Episode 1636, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.60s
Episode 1637, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 10.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1638_step_368000.pth
Episode 1638, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.10s
Episode 1639, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 13.35s
Episode 1640, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.85s
Episode 1641, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1642_step_369000.pth
Episode 1642, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Episode 1643, Total Reward: 542.87, Crashes:



Episode 1655, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 11.19s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1656_step_372000.pth
Episode 1656, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.55s
Episode 1657, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.80s
Episode 1658, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.70s
Episode 1659, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.90s




Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1660_step_373000.pth
Episode 1660, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 15.28s
Episode 1661, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.15s
Episode 1662, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.60s
Episode 1663, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.00s
Episode 1664, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1665_step_374000.pth




Episode 1665, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.11s
Episode 1666, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 11.05s
Episode 1667, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 12.35s
Episode 1668, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 12.50s
Episode 1669, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1670_step_375000.pth
Episode 1670, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.80s
Episode 1671, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.80s
Episode 1672, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.85s
Episode 1673, Total Reward: 45.75, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 1674, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.35s




Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1675_step_376000.pth
Episode 1675, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 13.37s
Episode 1676, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 12.05s
Episode 1677, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.60s
Episode 1678, Total Reward: 56.02, Crashes: 3, Epsilon: 0.0100, Time: 4.30s
Episode 1679, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 10.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1680_step_377000.pth
Episode 1680, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.35s
Episode 1681, Total Reward: 55.94, Crashes: 3, Epsilon: 0.0100, Time: 4.30s
Episode 1682, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.60s
Episode 1683, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 12.20s
Episode 1684, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.40s
Checkpoint saved: checkpoints_simple_right_tur



Episode 1695, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.47s
Episode 1696, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 11.65s
Episode 1697, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.80s




Episode 1698, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.96s
Episode 1699, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1700_step_381000.pth
Episode 1700, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.70s
Episode 1701, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.40s
Episode 1702, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 12.10s
Episode 1703, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.25s
Episode 1704, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1705_step_382000.pth
Episode 1705, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 1706, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Episode 1707, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 12.45s
Episode 1708, Total Reward: 542.86, Crashes: 



Episode 1736, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 13.16s
Episode 1737, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 10.95s
Episode 1738, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.20s
Episode 1739, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 12.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1740_step_390000.pth
Episode 1740, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 15.90s
Episode 1741, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.95s
Episode 1742, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 13.10s




Episode 1743, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 10.67s




Episode 1744, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1745_step_391000.pth
Episode 1745, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 16.55s
Episode 1746, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 17.45s
Episode 1747, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 12.29s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1748_step_392000.pth
Episode 1748, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.10s
Episode 1749, Total Reward: -9.52, Crashes: 11, Epsilon: 0.0100, Time: 7.65s
Episode 1750, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.80s
Episode 1751, Total Reward: -8.38, Crashes: 11, Epsilon: 0.0100, Time: 8.20s
Episode 1752, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1753_step_393000.pth




Episode 1753, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 12.70s
Episode 1754, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.85s
Episode 1755, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.95s
Episode 1756, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.75s
Episode 1757, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.50s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1758_step_394000.pth
Episode 1758, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.80s
Episode 1759, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 12.55s
Episode 1760, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 11.75s
Episode 1761, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 13.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1762_step_395000.pth
Episode 1762, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.05s
Episode 1763, Total Reward: 622.86, Crashes:



Episode 1853, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 10.17s
Episode 1854, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1855_step_415000.pth
Episode 1855, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 9.80s
Episode 1856, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 14.50s
Episode 1857, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.45s
Episode 1858, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1859_step_416000.pth
Episode 1859, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.40s
Episode 1860, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.10s




Episode 1861, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 14.80s
Episode 1862, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 10.35s
Episode 1863, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 11.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1864_step_417000.pth
Episode 1864, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 11.35s
Episode 1865, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 12.55s
Episode 1866, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.20s
Episode 1867, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 1868, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1869_step_418000.pth
Episode 1869, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.30s
Episode 1870, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.35s
Episode 1871, Total Reward: 562.87, Crashes: 



Episode 1889, Total Reward: 85.01, Crashes: 0, Epsilon: 0.0100, Time: 5.97s
Episode 1890, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.40s
Episode 1891, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 12.50s
Episode 1892, Total Reward: -87.19, Crashes: 21, Epsilon: 0.0100, Time: 12.95s
Episode 1893, Total Reward: 45.23, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1894_step_423000.pth
Episode 1894, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 13.55s
Episode 1895, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 12.65s
Episode 1896, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 10.15s
Episode 1897, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 12.05s
Episode 1898, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1899_step_424000.pth
Episode 1899, Total Reward: 552.86, Crashes: 7



Episode 1954, Total Reward: 45.17, Crashes: 4, Epsilon: 0.0100, Time: 4.59s
Episode 1955, Total Reward: 452.87, Crashes: 17, Epsilon: 0.0100, Time: 13.65s
Episode 1956, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1957_step_437000.pth
Episode 1957, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Episode 1958, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.35s
Episode 1959, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 13.40s
Episode 1960, Total Reward: 25.21, Crashes: 6, Epsilon: 0.0100, Time: 4.30s
Episode 1961, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1962_step_438000.pth
Episode 1962, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 13.20s
Episode 1963, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.45s
Episode 1964, Total Reward: 472.86, Crashes:



Episode 1975, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.74s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1976_step_441000.pth
Episode 1976, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.85s
Episode 1977, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 10.60s
Episode 1978, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 14.00s
Episode 1979, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 12.21s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1980_step_442000.pth
Episode 1980, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 14.55s
Episode 1981, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 13.55s
Episode 1982, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.85s
Episode 1983, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 13.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1984_step_443000.pth
Episode 1984, Total Reward: 51



Episode 1997, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 13.84s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_1998_step_446000.pth
Episode 1998, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 10.70s
Episode 1999, Total Reward: 452.86, Crashes: 17, Epsilon: 0.0100, Time: 14.60s
Episode 2000, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 12.20s
Episode 2001, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.95s
Episode 2002, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2003_step_447000.pth
Episode 2003, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Episode 2004, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.45s
Episode 2005, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.80s
Episode 2006, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 16.80s
Checkpoint saved: checkpoints_simple_rig



Episode 2015, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.33s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2016_step_450000.pth
Episode 2016, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 10.00s
Episode 2017, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 11.20s
Episode 2018, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 16.25s
Episode 2019, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.45s
Episode 2020, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 11.40s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2021_step_451000.pth
Episode 2021, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.55s
Episode 2022, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 13.80s
Episode 2023, Total Reward: 382.86, Crashes: 24, Epsilon: 0.0100, Time: 13.10s
Episode 2024, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 13.10s
Checkpoint saved: checkpoints_simple_r



Episode 2027, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.70s




Episode 2028, Total Reward: 392.86, Crashes: 23, Epsilon: 0.0100, Time: 13.65s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2029_step_453000.pth
Episode 2029, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 11.00s
Episode 2030, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 12.05s
Episode 2031, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.20s
Episode 2032, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 12.50s
Episode 2033, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2034_step_454000.pth
Episode 2034, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 10.15s
Episode 2035, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 14.40s
Episode 2036, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.65s
Episode 2037, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.10s
Checkpoint saved: checkpoints_simple_right



Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2061_step_460000.pth
Episode 2061, Total Reward: 362.86, Crashes: 26, Epsilon: 0.0100, Time: 13.02s
Episode 2062, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 24.75s
Episode 2063, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.70s
Episode 2064, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 11.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2065_step_461000.pth
Episode 2065, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.90s
Episode 2066, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.85s
Episode 2067, Total Reward: -9.99, Crashes: 10, Epsilon: 0.0100, Time: 4.55s
Episode 2068, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 11.20s
Episode 2069, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 14.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2070_step_462000.pth
Episode 2070, Total Reward: 602



Episode 2081, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 16.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2082_step_465000.pth
Episode 2082, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.70s
Episode 2083, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.00s
Episode 2084, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.40s
Episode 2085, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.00s
Episode 2086, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2087_step_466000.pth
Episode 2087, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 12.50s
Episode 2088, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.50s
Episode 2089, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 14.85s




Episode 2090, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 10.60s
Episode 2091, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 10.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2092_step_467000.pth
Episode 2092, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 13.80s
Episode 2093, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 10.40s
Episode 2094, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 10.95s
Episode 2095, Total Reward: -19.60, Crashes: 12, Epsilon: 0.0100, Time: 8.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2096_step_468000.pth
Episode 2096, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.60s
Episode 2097, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.50s
Episode 2098, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.70s
Episode 2099, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.10s
Episode 2100, Total Reward: 622.87, Crash



Episode 2122, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.92s
Episode 2123, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2124_step_474000.pth
Episode 2124, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.80s
Episode 2125, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 10.50s
Episode 2126, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.90s
Episode 2127, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 14.10s
Episode 2128, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2129_step_475000.pth
Episode 2129, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 17.90s
Episode 2130, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 14.00s
Episode 2131, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 10.20s
Episode 2132, Total Reward: 552.87, Crashes: 



Episode 2167, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.01s
Episode 2168, Total Reward: 422.87, Crashes: 20, Epsilon: 0.0100, Time: 14.15s
Episode 2169, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 12.10s
Episode 2170, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.50s
Episode 2171, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2172_step_485000.pth
Episode 2172, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.09s
Episode 2173, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.05s
Episode 2174, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.40s
Episode 2175, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 10.40s
Episode 2176, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2177_step_486000.pth
Episode 2177, Total Reward: 522.87, Crash



Episode 2218, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 11.66s
Episode 2219, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 2220, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.25s
Episode 2221, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2222_step_496000.pth
Episode 2222, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.40s
Episode 2223, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.20s
Episode 2224, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.55s
Episode 2225, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.27s
Episode 2226, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2227_step_497000.pth
Episode 2227, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.20s
Episode 2228, Total Reward: 602.86, Crashes



Episode 2249, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.85s
Episode 2250, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Episode 2251, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 10.85s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2252_step_502000.pth
Episode 2252, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.75s
Episode 2253, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.60s
Episode 2254, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.55s
Episode 2255, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 9.70s
Episode 2256, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 11.50s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2257_step_503000.pth
Episode 2257, Total Reward: 462.86, Crashes: 16, Epsilon: 0.0100, Time: 11.15s
Episode 2258, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 10.85s
Episode 2259, Total Reward: 492.86, Crashe



Episode 2287, Total Reward: -47.88, Crashes: 17, Epsilon: 0.0100, Time: 12.38s
Episode 2288, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 10.95s
Episode 2289, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 11.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2290_step_510000.pth
Episode 2290, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.85s
Episode 2291, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.05s
Episode 2292, Total Reward: 452.87, Crashes: 17, Epsilon: 0.0100, Time: 11.50s
Episode 2293, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2294_step_511000.pth
Episode 2294, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 12.00s
Episode 2295, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.85s
Episode 2296, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.50s
Episode 2297, Total Reward: 622.87, Cras



Episode 2338, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 12.50s
Episode 2339, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 12.20s
Episode 2340, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.70s
Episode 2341, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 10.60s
Episode 2342, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2343_step_521000.pth
Episode 2343, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 13.60s
Episode 2344, Total Reward: 502.87, Crashes: 12, Epsilon: 0.0100, Time: 10.70s
Episode 2345, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.95s
Episode 2346, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 13.35s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2347_step_522000.pth
Episode 2347, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.70s
Episode 2348, Total Reward: 582.87, Cra



Episode 2352, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 11.47s
Episode 2353, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.30s
Episode 2354, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.85s
Episode 2355, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2356_step_524000.pth
Episode 2356, Total Reward: 12.86, Crashes: 11, Epsilon: 0.0100, Time: 12.81s
Episode 2357, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 12.00s
Episode 2358, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.40s
Episode 2359, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 12.45s
Episode 2360, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2361_step_525000.pth
Episode 2361, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.80s
Episode 2362, Total Reward: 622.86, Crashes:



Episode 2376, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.15s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2377_step_529000.pth
Episode 2377, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 14.60s
Episode 2378, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 12.85s
Episode 2379, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 12.05s
Episode 2380, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 12.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2381_step_530000.pth
Episode 2381, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 15.05s
Episode 2382, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.90s
Episode 2383, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 11.80s
Episode 2384, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 12.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2385_step_531000.pth
Episode 2385, Total Reward: 462.8



Episode 2397, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 11.02s
Episode 2398, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 12.30s
Episode 2399, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 15.15s
Episode 2400, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.50s
Episode 2401, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2402_step_535000.pth




Episode 2402, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 11.81s
Episode 2403, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 10.00s
Episode 2404, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 9.90s
Episode 2405, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.05s
Episode 2406, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2407_step_536000.pth
Episode 2407, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Episode 2408, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Episode 2409, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.65s
Episode 2410, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.10s
Episode 2411, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2412_step_537000.pth
Episode 2412, Total Reward: 522.86, Crashes



Episode 2426, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.17s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2427_step_540000.pth
Episode 2427, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.45s
Episode 2428, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 9.90s
Episode 2429, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Episode 2430, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.70s
Episode 2431, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2432_step_541000.pth
Episode 2432, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.00s
Episode 2433, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Episode 2434, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 9.80s
Episode 2435, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.85s
Episode 2436, Total Reward: 612.87, Crashes: 1, Ep



Episode 2488, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 11.69s
Episode 2489, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.10s
Episode 2490, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.10s
Episode 2491, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2492_step_553000.pth
Episode 2492, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 12.85s
Episode 2493, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 12.75s
Episode 2494, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.50s
Episode 2495, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 15.00s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2496_step_554000.pth
Episode 2496, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 13.55s
Episode 2497, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.35s
Episode 2498, Total Reward: 592.87, Crashes:



Episode 2501, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 11.37s
Episode 2502, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 12.65s
Episode 2503, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.60s
Episode 2504, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2505_step_556000.pth
Episode 2505, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 12.80s
Episode 2506, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 12.75s
Episode 2507, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 13.45s
Episode 2508, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.20s
Episode 2509, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2510_step_557000.pth
Episode 2510, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.90s
Episode 2511, Total Reward: 602.86, Crashes:



Episode 2536, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 11.32s
Episode 2537, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 11.90s
Episode 2538, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.10s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2539_step_563000.pth
Episode 2539, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 13.40s
Episode 2540, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.75s
Episode 2541, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.55s
Episode 2542, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.45s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2543_step_564000.pth
Episode 2543, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.05s
Episode 2544, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 11.10s
Episode 2545, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 12.04s
Episode 2546, Total Reward: 562.86, Crashe



Episode 2558, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.05s
Episode 2559, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.40s
Episode 2560, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.45s




Episode 2561, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.86s
Episode 2562, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 12.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2563_step_568000.pth
Episode 2563, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 2564, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.55s
Episode 2565, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.40s
Episode 2566, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.55s
Episode 2567, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2568_step_569000.pth
Episode 2568, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 10.95s
Episode 2569, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 13.20s
Episode 2570, Total Reward: 472.87, Crashes: 15, Epsilon: 0.0100, Time: 11.60s
Episode 2571, Total Reward: 56.00, Crashes: 



Episode 2587, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 12.41s
Episode 2588, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 2589, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Episode 2590, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.10s
Episode 2591, Total Reward: 46.21, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Episode 2592, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.20s
Episode 2593, Total Reward: 52.42, Crashes: 4, Epsilon: 0.0100, Time: 5.95s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2594_step_574000.pth




Episode 2594, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 11.50s
Episode 2595, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 11.65s
Episode 2596, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.45s
Episode 2597, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2598_step_575000.pth
Episode 2598, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 11.30s
Episode 2599, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2600, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.95s
Episode 2601, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 14.10s




Episode 2602, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.87s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2603_step_576000.pth
Episode 2603, Total Reward: 472.87, Crashes: 15, Epsilon: 0.0100, Time: 14.30s
Episode 2604, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 15.05s
Episode 2605, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.75s
Episode 2606, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 10.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2607_step_577000.pth
Episode 2607, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.55s
Episode 2608, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 11.10s




Episode 2609, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 13.03s
Episode 2610, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 10.45s
Episode 2611, Total Reward: 472.86, Crashes: 15, Epsilon: 0.0100, Time: 11.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2612_step_578000.pth
Episode 2612, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.65s
Episode 2613, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.45s
Episode 2614, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 11.40s
Episode 2615, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.20s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2616_step_579000.pth
Episode 2616, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 13.25s
Episode 2617, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.50s
Episode 2618, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 15.10s
Episode 2619, Total Reward: 45.59, Crashes



Episode 2621, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 11.15s
Episode 2622, Total Reward: 572.87, Crashes: 5, Epsilon: 0.0100, Time: 11.95s
Episode 2623, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 12.00s
Episode 2624, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.50s
Episode 2625, Total Reward: 46.30, Crashes: 4, Epsilon: 0.0100, Time: 4.30s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2626_step_581000.pth
Episode 2626, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 12.80s
Episode 2627, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.45s
Episode 2628, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 11.30s
Episode 2629, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.55s
Episode 2630, Total Reward: 552.86, Crashes: 7, Epsilon: 0.0100, Time: 11.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2631_step_582000.pth
Episode 2631, Total Reward: 622.86, Crashes: 



Episode 2658, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.00s
Episode 2659, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.20s
Episode 2660, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.80s
Episode 2661, Total Reward: 612.86, Crashes: 1, Epsilon: 0.0100, Time: 10.60s
Episode 2662, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Episode 2663, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2664_step_589000.pth
Episode 2664, Total Reward: 562.87, Crashes: 6, Epsilon: 0.0100, Time: 10.15s
Episode 2665, Total Reward: 542.87, Crashes: 8, Epsilon: 0.0100, Time: 12.95s
Episode 2666, Total Reward: 602.87, Crashes: 2, Epsilon: 0.0100, Time: 11.70s
Episode 2667, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.00s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2668_step_590000.pth
Episode 2668, Total Reward: 492.86, Crashe



Episode 2701, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 10.83s
Episode 2702, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.45s
Episode 2703, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 10.20s
Episode 2704, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 10.25s
Episode 2705, Total Reward: 602.86, Crashes: 2, Epsilon: 0.0100, Time: 10.00s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2706_step_598000.pth
Episode 2706, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 2707, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 11.10s
Episode 2708, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 10.60s
Episode 2709, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 11.25s
Episode 2710, Total Reward: 482.87, Crashes: 14, Epsilon: 0.0100, Time: 11.50s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2711_step_599000.pth
Episode 2711, Total Reward: 552.86, Cras



Episode 2777, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2778, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.15s
Episode 2779, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Episode 2780, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2781, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 10.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2782_step_613000.pth
Episode 2782, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 11.55s
Episode 2783, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.65s
Episode 2784, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 11.60s
Episode 2785, Total Reward: 612.87, Crashes: 1, Epsilon: 0.0100, Time: 10.45s
Episode 2786, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2787_step_614000.pth
Episode 2787, Total Reward: 602.86, Crashes: 2, 



Episode 2865, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.31s
Episode 2866, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.15s
Episode 2867, Total Reward: 582.86, Crashes: 4, Epsilon: 0.0100, Time: 11.30s
Episode 2868, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.60s
Episode 2869, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.25s




Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2870_step_630000.pth
Episode 2870, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.71s
Episode 2871, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2872, Total Reward: 542.86, Crashes: 8, Epsilon: 0.0100, Time: 11.35s
Episode 2873, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 11.65s




Episode 2874, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.89s




Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2875_step_631000.pth
Episode 2875, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.30s
Episode 2876, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.25s
Episode 2877, Total Reward: 492.86, Crashes: 13, Epsilon: 0.0100, Time: 10.80s
Episode 2878, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.75s
Episode 2879, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2880_step_632000.pth
Episode 2880, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 10.70s
Episode 2881, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.10s
Episode 2882, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 10.70s
Episode 2883, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2884, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2885, Total Reward: 622.87, Crashes: 



Episode 2896, Total Reward: 492.87, Crashes: 13, Epsilon: 0.0100, Time: 12.00s
Episode 2897, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.05s
Episode 2898, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 13.55s
Episode 2899, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2900, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2901_step_636000.pth
Episode 2901, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Episode 2902, Total Reward: 592.87, Crashes: 3, Epsilon: 0.0100, Time: 10.03s
Episode 2903, Total Reward: 572.86, Crashes: 5, Epsilon: 0.0100, Time: 10.55s
Episode 2904, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Episode 2905, Total Reward: 552.87, Crashes: 7, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2906_step_637000.pth
Episode 2906, Total Reward: 602.86, Crashes: 2,



Episode 2928, Total Reward: 582.87, Crashes: 4, Epsilon: 0.0100, Time: 12.53s
Episode 2929, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Episode 2930, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2931, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2932, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.65s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2933_step_642000.pth
Episode 2933, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.95s
Episode 2934, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.70s
Episode 2935, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.55s
Episode 2936, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 2937, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.65s
Episode 2938, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.55s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_



Episode 2962, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 10.56s
Episode 2963, Total Reward: 532.86, Crashes: 9, Epsilon: 0.0100, Time: 10.05s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2964_step_648000.pth
Episode 2964, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 13.10s
Episode 2965, Total Reward: 532.87, Crashes: 9, Epsilon: 0.0100, Time: 12.25s
Episode 2966, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.55s
Episode 2967, Total Reward: 512.86, Crashes: 11, Epsilon: 0.0100, Time: 13.30s
Episode 2968, Total Reward: 562.86, Crashes: 6, Epsilon: 0.0100, Time: 11.25s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_2969_step_649000.pth
Episode 2969, Total Reward: 482.86, Crashes: 14, Epsilon: 0.0100, Time: 15.15s
Episode 2970, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 10.15s
Episode 2971, Total Reward: 442.86, Crashes: 18, Epsilon: 0.0100, Time: 13.15s
Episode 2972, Total Reward: 622.86, Cras



Episode 3000, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 10.31s
Episode 3001, Total Reward: 512.87, Crashes: 11, Epsilon: 0.0100, Time: 11.60s
Episode 3002, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.70s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_3003_step_656000.pth
Episode 3003, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.85s
Episode 3004, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 10.35s
Episode 3005, Total Reward: 502.86, Crashes: 12, Epsilon: 0.0100, Time: 10.20s
Episode 3006, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.00s
Episode 3007, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.85s
Episode 3008, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_3009_step_657000.pth
Episode 3009, Total Reward: 522.87, Crashes: 10, Epsilon: 0.0100, Time: 9.95s
Episode 3010, Total Reward: 512.86, Crashes: 



Episode 3119, Total Reward: 452.87, Crashes: 17, Epsilon: 0.0100, Time: 12.11s
Episode 3120, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Episode 3121, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 12.35s
Episode 3122, Total Reward: 592.86, Crashes: 3, Epsilon: 0.0100, Time: 10.00s
Episode 3123, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.80s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_3124_step_679000.pth
Episode 3124, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.70s
Episode 3125, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 10.05s
Episode 3126, Total Reward: 622.86, Crashes: 0, Epsilon: 0.0100, Time: 9.90s
Episode 3127, Total Reward: 522.86, Crashes: 10, Epsilon: 0.0100, Time: 10.50s
Episode 3128, Total Reward: 622.87, Crashes: 0, Epsilon: 0.0100, Time: 9.75s
Checkpoint saved: checkpoints_simple_right_turn\checkpoint_episode_3129_step_680000.pth
Episode 3129, Total Reward: 582.86, Crashes: 



Episode 3131, Total Reward: 80.49, Crashes: 1, Epsilon: 0.0100, Time: 6.95s


Exception in thread Thread-680428 (__send_act_get_obs_and_wait):
Traceback (most recent call last):
  File "d:\mini_conda\envs\collabkart\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "d:\mini_conda\envs\collabkart\Lib\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "d:\mini_conda\envs\collabkart\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "d:\mini_conda\envs\collabkart\Lib\site-packages\rtgym\envs\real_time_env.py", line 438, in __send_act_get_obs_and_wait
    self.__update_obs_rew_terminated_truncated()  # capture observation
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\mini_conda\envs\collabkart\Lib\site-packages\rtgym\envs\real_time_env.py", line 452, in __update_obs_rew_terminated_truncated
    o, r, d, i = self.interface.get_obs_rew_terminated_info()
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\mini_conda\envs\coll

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from collections import deque
import os
from tmrl import get_environment
import time
# Assume device is defined earlier in the code
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ComplexDQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(ComplexDQN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        
        # Calculate the size of flattened features
        conv_out_size = self._get_conv_out(h, w)
        
        # Fully connected layers
        self.fc1 = nn.Linear(conv_out_size + 9, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, outputs)
        
        # Dropout layers
        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)
        
        # Attention mechanism
        self.attention = nn.MultiheadAttention(embed_dim=128, num_heads=4)

    def _get_conv_out(self, h, w):
        o = F.relu(self.bn1(self.conv1(torch.zeros(1, 4, h, w))))
        o = F.relu(self.bn2(self.conv2(o)))
        o = F.relu(self.bn3(self.conv3(o)))
        o = F.relu(self.bn4(self.conv4(o)))
        return int(np.prod(o.size()))

    def forward(self, x, additional_inputs):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        
        # Apply attention mechanism
        x = x.view(x.size(0), x.size(1), -1)  # Reshape for attention
        x, _ = self.attention(x, x, x)
        x = x.view(x.size(0), -1)  # Flatten
        
        # Concatenate with additional inputs
        x = torch.cat((x, additional_inputs), dim=1)
        
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        return self.fc3(x)

class ComplexDQNAgent:
    def __init__(self, n_actions, memory_size=100000, batch_size=64, gamma=0.99, 
                 epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=0.995, 
                 learning_rate=0.0005, target_update=10, 
                 checkpoint_dir="checkpoints_complex"):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.target_update = target_update
        self.checkpoint_dir = checkpoint_dir
        self.steps = 0

        # Initialize networks
        self.policy_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net = ComplexDQN(64, 64, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # Initialize optimizer and learning rate scheduler
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10000, gamma=0.95)

    def select_action(self, state):
        if random.random() > self.epsilon:
            with torch.no_grad():
                image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                return self.policy_net(image, additional).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)

        state_image_batch = torch.FloatTensor(np.stack([s[3] for s in state_batch])).to(device)
        state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in state_batch])).to(device)

        next_state_image_batch = torch.FloatTensor(np.stack([s[3] for s in next_state_batch])).to(device)
        next_state_additional_batch = torch.FloatTensor(np.stack([np.concatenate([s[0], s[1], s[2], s[4], s[5]]) for s in next_state_batch])).to(device)

        action_batch = torch.LongTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        done_batch = torch.FloatTensor(done_batch).to(device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
        state_action_values = self.policy_net(state_image_batch, state_additional_batch).gather(1, action_batch.unsqueeze(1))

        # Compute V(s_{t+1}) for all next states
        next_state_values = self.target_net(next_state_image_batch, next_state_additional_batch).max(1)[0].detach()

        # Compute the expected Q values
        expected_state_action_values = reward_batch + (1 - done_batch) * self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()
        self.scheduler.step()

        # Update epsilon
        self.epsilon = max(self.epsilon_final, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_checkpoint(self, episode):
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        checkpoint = {
            'episode': episode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'epsilon': self.epsilon,
            'steps': self.steps
        }
        filename = os.path.join(self.checkpoint_dir, f"checkpoint_episode_{episode}_step_{self.steps}.pth")
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved: {filename}")

    def load_checkpoint(self, filename):
        checkpoint = torch.load(filename)
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.target_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.steps = checkpoint['steps']
        return checkpoint['episode']

    def train(self, env, num_episodes, max_steps_per_episode=1000):
        for episode in range(num_episodes):
            state = env.reset()
            total_reward = 0
            
            for step in range(max_steps_per_episode):
                action = self.select_action(state)
                next_state, reward, done, _ = env.step(action.item())
                
                self.remember(state, action.item(), reward, next_state, done)
                self.replay()
                
                state = next_state
                total_reward += reward
                self.steps += 1
                
                if done:
                    break
            
            if episode % self.target_update == 0:
                self.update_target_network()
                self.save_checkpoint(episode)
            
            print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {self.epsilon:.2f}")

    def evaluate(self, env, num_episodes):
        self.policy_net.eval()
        total_rewards = []

        for _ in range(num_episodes):
            state = env.reset()
            episode_reward = 0
            done = False

            while not done:
                with torch.no_grad():
                    image = torch.FloatTensor(state[3]).unsqueeze(0).to(device)
                    additional = torch.FloatTensor(np.concatenate([state[0], state[1], state[2], state[4], state[5]])).unsqueeze(0).to(device)
                    action = self.policy_net(image, additional).max(1)[1].view(1, 1)
                next_state, reward, done, _ = env.step(action.item())
                episode_reward += reward
                state = next_state

            total_rewards.append(episode_reward)

        self.policy_net.train()
        return np.mean(total_rewards), np.std(total_rewards)

def preprocess_observation(obs):
    speed = np.array(obs[0]).flatten()
    steering = np.array(obs[1]).flatten()
    gyro = np.array(obs[2]).flatten()
    image = np.array(obs[3]).astype(np.float32) / 255.0
    prev_action = np.array(obs[4]).flatten()
    action = np.array(obs[5]).flatten()
    
    speed = np.clip(speed / 300.0, -1, 1)
    steering = np.clip(steering / np.pi, -1, 1)
    gyro = np.clip(gyro / np.pi, -1, 1)
    prev_action = np.clip(prev_action, -1, 1)
    action = np.clip(action, -1, 1)
    
    return (speed, steering, gyro, image, prev_action, action)

def env_action_to_agent_action(env_action, n_actions):
    if isinstance(env_action, (int, np.integer)):
        return env_action
    elif isinstance(env_action, (float, np.float32, np.float64)):
        return int((env_action + 1) * (n_actions - 1) / 2)
    elif isinstance(env_action, np.ndarray):
        return np.argmax(env_action)
    else:
        raise ValueError(f"Unexpected action type: {type(env_action)}")

def agent_action_to_env_action(agent_action, n_actions):
    steering = (agent_action * 2 / (n_actions - 1)) - 1
    throttle = 1.0
    brake = 0.0
    return [throttle, brake, steering]

def detect_crash(obs, prev_obs, speed_threshold=1.0):
    if prev_obs is None:
        return False
    
    current_speed = np.linalg.norm(obs[0])
    prev_speed = np.linalg.norm(prev_obs[0])
    
    if prev_speed - current_speed > speed_threshold:
        return True
    
    return False

def train(env, agent, num_episodes, max_steps_per_episode):
    for episode in range(num_episodes):
        obs, _ = env.reset()
        state = preprocess_observation(obs)
        total_reward = 0
        episode_start_time = time.time()
        prev_obs = None
        crashes = 0

        for step in range(max_steps_per_episode):
            agent_action = agent.select_action(state)
            env_action = agent_action_to_env_action(agent_action.item(), agent.n_actions)
            next_obs, env_reward, terminated, truncated, _ = env.step(env_action)
            next_state = preprocess_observation(next_obs)
            
            if detect_crash(next_obs, prev_obs):
                crashes += 1
                env_reward-= 10
            done = terminated or truncated
            
            elapsed_time = time.time() - episode_start_time
            time_factor = 1 / (elapsed_time + 1)
            reward = (env_reward)
            
            agent.remember(state, agent_action.item(), reward, next_state, done)
            agent.replay()
            
            state = next_state
            prev_obs = next_obs
            total_reward += reward

            agent.steps += 1

            if agent.steps % 1000 == 0:
                agent.save_checkpoint(episode)

            if done:
                break
        # total_reward-= 20*crashes
        if episode % 10 == 0:
            agent.update_target_network()

        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Crashes: {crashes}, Epsilon: {agent.epsilon:.4f}, Time: {elapsed_time:.2f}s")


In [2]:
if __name__ == "__main__":
    env = get_environment()
    n_actions = 3
    agent = ComplexDQNAgent(n_actions)
    agent.load_checkpoint("checkpoints_complex\checkpoint_episode_365_step_73000.pth")
    train(env, agent, num_episodes=5000, max_steps_per_episode=5000)

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Error(s) in loading state_dict for ComplexDQN:
	Missing key(s) in state_dict: "conv4.weight", "conv4.bias", "bn4.weight", "bn4.bias", "bn4.running_mean", "bn4.running_var", "attention.in_proj_weight", "attention.in_proj_bias", "attention.out_proj.weight", "attention.out_proj.bias". 
	size mismatch for fc1.weight: copying a param with shape torch.Size([512, 1033]) from checkpoint, the shape in current model is torch.Size([512, 2057]).