In [7]:
import time
import random
import pydirectinput
import win32gui

pydirectinput.PAUSE = 0.1

# Define key constants
KEYS = {
    'ACCELERATE': 'w',
    'BRAKE': 's',
    'STEER_LEFT': 'a',
    'STEER_RIGHT': 'd'
}

def focus_trackmania_window():
    window_name = "Trackmania"  # Adjust this if needed
    hwnd = win32gui.FindWindow(None, window_name)
    if hwnd:
        win32gui.SetForegroundWindow(hwnd)
        print("Successfully focused on Trackmania window")
        return True
    print("Could not find Trackmania window")
    return False

def send_inputs(actions):
    for action in actions:
        pydirectinput.keyDown(KEYS[action])
    time.sleep(random.uniform(0.1, 0.5))  # Hold keys for a random duration
    for action in actions:
        pydirectinput.keyUp(KEYS[action])

def get_random_actions():
    num_actions = random.randint(1, len(KEYS))  # Choose 1 to 4 actions
    return random.sample(list(KEYS.keys()), num_actions)

def main_loop():
    if not focus_trackmania_window():
        print("Couldn't find Trackmania window. Make sure the game is running.")
        return

    try:
        while True:
            actions = get_random_actions()
            print(f"Pressing: {actions}")
            send_inputs(actions)
            time.sleep(random.uniform(0.1, 0.3))  # Random delay between inputs
    except KeyboardInterrupt:
        print("\nScript stopped by user.")

if __name__ == "__main__":
    print("Starting in 5 seconds. Please make sure Trackmania is running...")
    time.sleep(5)
    main_loop()

ModuleNotFoundError: No module named 'game_interface'

In [6]:
pip install game_interface

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement game_interface (from versions: none)
ERROR: No matching distribution found for game_interface


In [18]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import cv2
from PIL import ImageGrab
import pydirectinput
import win32gui
import time
import random
from collections import deque

pydirectinput.PAUSE = 0.1

class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._calculate_conv_output_dims(input_shape), 512)
        self.fc2 = nn.Linear(512, num_actions)
        
    def _calculate_conv_output_dims(self, input_shape):
        x = torch.zeros(1, *input_shape)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        return int(np.prod(x.size()))
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


In [9]:
# Define key constants
KEYS = {
    'ACCELERATE': 'w',
    'BRAKE': 's',
    'STEER_LEFT': 'a',
    'STEER_RIGHT': 'd'
}

def focus_trackmania_window():
    window_name = "Trackmania"  # Adjust this if needed
    hwnd = win32gui.FindWindow(None, window_name)
    if hwnd:
        win32gui.SetForegroundWindow(hwnd)
        print("Successfully focused on Trackmania window")
        return True
    print("Could not find Trackmania window")
    return False

def preprocess_image(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (84, 84))
    image = np.expand_dims(image, axis=0)  # Add channel dimension
    return image / 255.0  # Normalize to [0, 1]

def capture_screen():
    screen = np.array(ImageGrab.grab(bbox=(0, 40, 800, 640)))  # Adjust the bbox as needed
    return preprocess_image(screen)

def send_inputs(actions):
    for action in actions:
        pydirectinput.keyDown(KEYS[action])
    time.sleep(random.uniform(0.1, 0.5))  # Hold keys for a random duration
    for action in actions:
        pydirectinput.keyUp(KEYS[action])

def get_random_actions():
    num_actions = random.randint(1, len(KEYS))  # Choose 1 to 4 actions
    return random.sample(list(KEYS.keys()), num_actions)

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)
    
    def push(self, transition):
        self.memory.append(transition)
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)


In [10]:
def select_action(state, policy_net, epsilon, num_actions):
    if random.random() > epsilon:
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            return policy_net(state).argmax(dim=1).item()
    else:
        return random.randrange(num_actions)

def optimize_model(memory, policy_net, target_net, optimizer, batch_size, gamma):
    if len(memory) < batch_size:
        return
    
    transitions = memory.sample(batch_size)
    batch = list(zip(*transitions))
    
    state_batch = torch.tensor(batch[0], dtype=torch.float32)
    action_batch = torch.tensor(batch[1], dtype=torch.int64).unsqueeze(1)
    reward_batch = torch.tensor(batch[2], dtype=torch.float32)
    next_state_batch = torch.tensor(batch[3], dtype=torch.float32)
    
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    next_state_values = target_net(next_state_batch).max(1)[0].detach()
    
    expected_state_action_values = reward_batch + (gamma * next_state_values)
    
    loss = nn.functional.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [11]:
def game_ended():
    # Implement your game end detection logic here
    # For simplicity, we'll return False (assuming the game is running)
    return False

def main():
    input_shape = (1, 84, 84)
    num_actions = len(KEYS)
    policy_net = DQN(input_shape, num_actions)
    target_net = DQN(input_shape, num_actions)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    
    optimizer = optim.Adam(policy_net.parameters())
    memory = ReplayMemory(10000)
    
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_decay = 0.995
    gamma = 0.99
    batch_size = 32
    update_target_every = 1000
    
    if not focus_trackmania_window():
        print("Couldn't find Trackmania window. Make sure the game is running.")
        return

    num_episodes = 1000
    for episode in range(num_episodes):
        state = capture_screen()
        done = False
        total_reward = 0
        step = 0
        
        while not done:
            if not focus_trackmania_window():
                print("Trackmania window lost focus.")
                return

            action = select_action(state, policy_net, epsilon, num_actions)
            send_inputs([list(KEYS.keys())[action]])
            
            next_state = capture_screen()
            reward = 1  # Modify this based on the game state
            if game_ended():  # Implement this function based on your game's end condition
                reward += 100
                done = True
            
            memory.push((state, action, reward, next_state))
            state = next_state
            total_reward += reward
            
            optimize_model(memory, policy_net, target_net, optimizer, batch_size, gamma)
            
            step += 1
            if step % update_target_every == 0:
                target_net.load_state_dict(policy_net.state_dict())
        
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode}: Total reward: {total_reward}")
        
        # Save the model periodically
        if episode % 50 == 0:
            torch.save(policy_net.state_dict(), f"model_episode_{episode}.pth")

if __name__ == "__main__":
    print("Starting in 5 seconds. Please make sure Trackmania is running...")
    time.sleep(5)
    main()


Starting in 5 seconds. Please make sure Trackmania is running...
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfull

  state_batch = torch.tensor(batch[0], dtype=torch.float32)


Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania window
Successfully focused on Trackmania

KeyboardInterrupt: 

In [6]:
import pytesseract
from PIL import ImageGrab
import numpy as np
import cv2
import win32gui

def focus_trackmania_window(window_name="Trackmania"):
    hwnd = win32gui.FindWindow(None, window_name)
    if hwnd:
        win32gui.SetForegroundWindow(hwnd)
        print("Focused on Trackmania window.")
        return True
    print("Couldn't find Trackmania window.")
    return False

def capture_screen(bbox=None):
    print(f"Capturing screen with bbox: {bbox}")
    return ImageGrab.grab(bbox=bbox)

def preprocess_image(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)
    return thresh

def detect_game_end(image):
    text = pytesseract.image_to_string(image, lang='eng')
    return "improve" in text.lower()

def game_ended():
    if not focus_trackmania_window():
        return False

    # Adjust the bbox to match the area where "improve" text might appear
    bbox = (0, 40, 800, 640)  # Example bounding box, adjust as needed
    screen = capture_screen(bbox)
    preprocessed_image = preprocess_image(screen)

    if detect_game_end(preprocessed_image):
        print("Game ended detected.")
        return True
    return False

# Example usage
if __name__ == "__main__":
    if game_ended():
        print("Game has ended.")
    else:
        print("Game is still running.")


error: (0, 'SetForegroundWindow', 'No error message is available')

In [9]:
pip install --upgrade easyocr

Note: you may need to restart the kernel to use updated packages.


In [3]:
import bidi.algorithm

ModuleNotFoundError: No module named 'bidi.algorithm'

In [2]:
import pytesseract
text = pytesseract.image_to_string('image.jpg')

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [2]:
pip uninstall tmrl

In [5]:
python -m tmrl --install

SyntaxError: invalid syntax (4264521255.py, line 1)

In [2]:
from threading import Thread
from tuto_envs.dummy_rc_drone_interface import DUMMY_RC_DRONE_CONFIG

# TMRL imports:
from tmrl.networking import Server, RolloutWorker, Trainer
from tmrl.util import partial
from tmrl.envs import GenericGymEnv
import tmrl.config.config_constants as cfg
from tmrl.training_offline import TorchTrainingOffline
from tmrl.custom.custom_algorithms import SpinupSacAgent
from tmrl.custom.custom_models import SquashedGaussianMLPActor, MLPActorCritic
from tmrl.custom.custom_memories import GenericTorchMemory


# Set this to True only for debugging your pipeline.
CRC_DEBUG = False

# Name used for training checkpoints and models saved in the TmrlData folder.
# If you change anything, also change this name (or delete the saved files in TmrlData).
my_run_name = "tutorial_minimal_drone"


# First, you need to define your Gymnasium environment.
# TMRL is typically useful to train real-time robots.
# Thus, we use Real-Time Gym to define a dummy RC drone as an example.
# (Implemented in tuto_envs.dummy_rc_drone_interface)

# === Environment ======================================================================================================

# rtgym interface:

# Environment class:

env_cls = partial(GenericGymEnv, id="real-time-gym-ts-v1", gym_kwargs={"config": my_rtgym_config})

# Observation and action space:

dummy_env = env_cls()
act_space = dummy_env.action_space
obs_space = dummy_env.observation_space

print(f"action space: {act_space}")
print(f"observation space: {obs_space}")


# Now that we have defined our environment, let us train an agent with the generic TMRL pipeline.
# TMRL pipelines have a central communication Server, a Trainer, and one to several RolloutWorkers.


# === TMRL Server ======================================================================================================

# The TMRL Server is the central point of communication between TMRL entities.
# The Trainer and the RolloutWorkers connect to the Server.

security = None  # This is fine for secure local networks. On the Internet, use "TLS" instead.
password = cfg.PASSWORD  # This is the password defined in TmrlData/config/config.json

server_ip = "127.0.0.1"  # This is the localhost IP. Change it for your public IP if you want to run on the Internet.
server_port = 6666  # On the Internet, the machine hosting the Server needs to be reachable via this port.

if __name__ == "__main__":
    # Instantiating a TMRL Server is straightforward.
    # More arguments are available for, e.g., using TLS. Please refer to the TMRL documentation.
    my_server = Server(security=security, password=password, port=server_port)


# === TMRL Worker ======================================================================================================

# TMRL RolloutWorkers are responsible for collecting training samples.
# A RolloutWorker contains an ActorModule, which encapsulates its policy.

# ActorModule:

# SquashedGaussianMLPActor processes observations through an MLP.
# It is designed to work with the SAC algorithm.
actor_module_cls = partial(SquashedGaussianMLPActor)

# Worker local files

weights_folder = cfg.WEIGHTS_FOLDER
model_path = str(weights_folder / (my_run_name + ".tmod"))  # Current model will be stored here.
model_path_history = str(weights_folder / (my_run_name + "_"))
model_history = -1  # let us not save a model history.

# Instantiation of the RolloutWorker object:

if __name__ == "__main__":
    my_worker = RolloutWorker(
        env_cls=env_cls,
        actor_module_cls=actor_module_cls,
        sample_compressor=None,
        device="cpu",
        server_ip=server_ip,
        server_port=server_port,
        password=password,
        max_samples_per_episode=1000,
        model_path=model_path,
        # model_path_history=model_path_history,  # not used when model_history is -1
        model_history=model_history,
        crc_debug=CRC_DEBUG)

    # Note: at this point, the RolloutWorker is not collecting samples yet.
    # Nevertheless, it connects to the Server.


# === TMRL Trainer =====================================================================================================

# The TMRL Trainer is where your training algorithm lives.
# It connects to the Server, to retrieve training samples collected from the RolloutWorkers.
# Periodically, it also sends updated policies to the Server, which forwards them to the RolloutWorkers.

# TMRL Trainers contain a Training class. Currently, only TrainingOffline is supported.
# TrainingOffline notably contains a Memory class, and a TrainingAgent class.
# The Memory is a replay buffer. In TMRL, you are able and encouraged to define your own Memory.
# This is how you can implement highly optimized ad-hoc pipelines for your applications.
# Nevertheless, TMRL also defines a generic, non-optimized Memory that can be used for any pipeline.
# The TrainingAgent contains your training algorithm per-se.
# TrainingOffline is meant for asynchronous off-policy algorithms, such as Soft Actor-Critic.

# Trainer local files:

weights_folder = cfg.WEIGHTS_FOLDER
checkpoints_folder = cfg.CHECKPOINTS_FOLDER
model_path = str(weights_folder / (my_run_name + "_t.tmod"))
checkpoints_path = str(checkpoints_folder / (my_run_name + "_t.tcpt"))

# Dummy environment OR (observation space, action space) tuple:
# env_cls = partial(GenericGymEnv, id="real-time-gym-ts-v1", gym_kwargs={"config": my_rtgym_config})
env_cls = (obs_space, act_space)

# Memory:

memory_cls = partial(GenericTorchMemory,
                     memory_size=1e6,
                     batch_size=32,
                     crc_debug=CRC_DEBUG)

# Training agent:

training_agent_cls = partial(SpinupSacAgent,
                             model_cls=MLPActorCritic,
                             gamma=0.99,
                             polyak=0.995,
                             alpha=0.2,
                             lr_actor=1e-3,
                             lr_critic=1e-3,
                             lr_entropy=1e-3,
                             learn_entropy_coef=True,
                             target_entropy=None)

# Training parameters:

epochs = 10  # maximum number of epochs, usually set this to np.inf
rounds = 10  # number of rounds per epoch
steps = 1000  # number of training steps per round
update_buffer_interval = 100
update_model_interval = 100
max_training_steps_per_env_step = 2.0
start_training = 400
device = None

# Training class:

training_cls = partial(
    TorchTrainingOffline,
    env_cls=env_cls,
    memory_cls=memory_cls,
    training_agent_cls=training_agent_cls,
    epochs=epochs,
    rounds=rounds,
    steps=steps,
    update_buffer_interval=update_buffer_interval,
    update_model_interval=update_model_interval,
    max_training_steps_per_env_step=max_training_steps_per_env_step,
    start_training=start_training,
    device=device)

# Trainer instance:

if __name__ == "__main__":
    my_trainer = Trainer(
        training_cls=training_cls,
        server_ip=server_ip,
        server_port=server_port,
        password=password,
        model_path=model_path,
        checkpoint_path=checkpoints_path)  # None for not saving training checkpoints


# === Running the pipeline =============================================================================================

# Now we have everything we need.
# Typically, you will run your TMRL Server, Trainer and RolloutWorkers in different terminals / machines.
# But for simplicity, in this tutorial, we run them in different threads instead.
# Note that the Server is already running (it started running when instantiated).


# Separate threads for running the RolloutWorker and Trainer:


def run_worker(worker):
    worker.run(test_episode_interval=10, verbose=True)


def run_trainer(trainer):
    trainer.run()


if __name__ == "__main__":
    daemon_thread_worker = Thread(target=run_worker, args=(my_worker, ), kwargs={}, daemon=True)
    daemon_thread_worker.start()  # start the worker daemon thread

    run_trainer(my_trainer)

    # the worker daemon thread will be killed here.

ModuleNotFoundError: No module named 'tuto_envs'

In [2]:
from tmrl import get_environment
from time import sleep
import numpy as np

# LIDAR observations are of shape: ((1,), (4, 19), (3,), (3,))
# representing: (speed, 4 last LIDARs, 2 previous actions)
# actions are [gas, break, steer], analog between -1.0 and +1.0
def model(obs):
    """
    simplistic policy for LIDAR observations
    """
    deviation = obs[1].mean(0)
    print(deviation)
    deviation /= (deviation.sum() + 0.001)
    steer = 0
    for i in range(19):
        steer += (i - 9) * deviation[i]
    steer = - np.tanh(steer * 4)
    steer = min(max(steer, -1.0), 1.0)
    return np.array([1.0, 0.0, steer])

# Let us retrieve the TMRL Gymnasium environment.
# The environment you get from get_environment() depends on the content of config.json
env = get_environment()

sleep(1.0)  # just so we have time to focus the TM20 window after starting the script

obs, info = env.reset()  # reset environment
for _ in range(5000):  # rtgym ensures this runs at 20Hz by default
    act = model(obs)  # compute action
    obs, rew, terminated, truncated, info = env.step(act)  # step (rtgym ensures healthy time-steps)
    if terminated or truncated:
        env.reset()
env.wait()  # rtgym-specific method to artificially 'pause' the environment when needed

AssertionError: OpenPlanet stopped sending data since more than 10.0s.

In [13]:
import tmrl
from tmrl.custom.utils.control_gamepad import gamepad_close_finish_pop_up_tm20,control_gamepad
from tmrl.envs import GenericGymEnv
import time
import numpy as np

class SimpleTrackManiaEnv(GenericGymEnv):
    def __init__(self):
        super().__init__(control_gamepad())
        self.controller = self.env
        self.action_space = self.env.action_space

    def reset(self):
        """Reset the environment and restart the map."""
        obs = super().reset()
        self._restart_map()
        return obs

    def _restart_map(self):
        """Restart the current map."""
        self.controller.restart()
        time.sleep(2)  # Wait for the map to reload

    def step(self, action):
        """
        Perform a step in the environment.
        Check if the race is completed and restart if necessary.
        """
        obs, reward, done, info = super().step(action)

        if self._is_race_completed(info):
            print("Race completed! Restarting...")
            obs = self.reset()
            done = False

        return obs, reward, done, info

    def _is_race_completed(self, info):
        """
        Check if the race is completed based on the info dictionary.
        """
        # Adjust this condition based on how your environment indicates race completion
        return info.get('race_finished', False)

def only_w_action():
    """
    Create an action that only presses 'W' (accelerate).
    Adjust this based on your action space structure.
    """
    action = np.zeros(3)  # Assuming action space is [steering, acceleration, brake]
    action[1] = 1.0  # Set acceleration to maximum
    return action

# Main loop
env = get_environment()

num_episodes = 10
for episode in range(num_episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    step_count = 0

    while not done:
        action = only_w_action()
        obs, reward, done, info = env.step(action)
        total_reward += reward
        step_count += 1

        if step_count % 100 == 0:
            print(f"Episode {episode + 1}, Step {step_count}, Current Reward: {total_reward}")

    print(f"Episode {episode + 1} finished. Total steps: {step_count}, Total reward: {total_reward}")

env.close()

TypeError: control_gamepad() missing 2 required positional arguments: 'gamepad' and 'control'

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tmrl import get_environment
from collections import deque
import random
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.LayerNorm(128)
        self.fc2 = nn.Linear(128, 256)
        self.bn2 = nn.LayerNorm(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.LayerNorm(128)
        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.LayerNorm(64)
        self.fc5 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = torch.relu(self.bn3(self.fc3(x)))
        x = torch.relu(self.bn4(self.fc4(x)))
        return self.fc5(x)

class DQNAgent:
    def __init__(self, state_shape, action_space):
        self.state_shape = state_shape
        self.action_space = action_space
        self.memory = deque(maxlen=10000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = DQN(state_shape[0], action_space).to(device)
        self.target_model = DQN(state_shape[0], action_space).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.update_target_model()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_space)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).to(device)

        current_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        max_next_q = self.target_model(next_states).max(1)[0]
        expected_q = rewards + (1 - dones) * self.gamma * max_next_q

        loss = nn.MSELoss()(current_q, expected_q.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

def preprocess_obs(obs):
    speed = obs[0]
    lidar = obs[1].flatten()
    prev_actions = obs[2]
    return np.concatenate([speed, lidar, prev_actions])

def action_to_env_action(action):
    if action == 0:
        return np.array([1.0, 0.0, -1.0])  # Full throttle, turn left
    elif action == 1:
        return np.array([1.0, 0.0, 0.0])   # Full throttle, go straight
    else:
        return np.array([1.0, 0.0, 1.0])   # Full throttle, turn right

def calculate_speed_reward(speed):
    return speed * 0.1  # Scale the reward to be smaller than the main rewards

env = get_environment()
state_shape = (1 + 4*19 + 3,)  # speed + flattened LIDAR + previous actions
action_space = 3  # Left, Straight, Right

agent = DQNAgent(state_shape, action_space)
batch_size = 32
n_episodes = 3000
time_limit = 300  # 30 seconds time limit

for e in range(n_episodes):
    obs, info = env.reset()
    state = preprocess_obs(obs)
    total_reward = 0
    done = False
    start_time = time.time()
    step_count = 0
    episode_speeds = []
    
    while not done:
        action = agent.act(state)
        env_action = action_to_env_action(action)
        next_obs, reward, terminated, truncated, info = env.step(env_action)
        
        step_count += 1
        current_time = time.time()
        elapsed_time = current_time - start_time
        
        speed = next_obs[0]
        episode_speeds.append(speed)
        
        speed_reward = calculate_speed_reward(speed)
        if elapsed_time > time_limit:
            print(f"Time limit exceeded. Elapsed time: {elapsed_time:.2f}s")
            done = False
            reward = 0
        else:
            done = terminated or truncated
            if done:
                if terminated:
                    print(f"Episode terminated (e.g., crash). Elapsed time: {elapsed_time:.2f}s")
                    reward = -100
                else:
                    print(f"Episode truncated (e.g., completed). Elapsed time: {elapsed_time:.2f}s")
                    reward = 10000
            else:
                reward = 0
        
        reward += speed_reward
        
        next_state = preprocess_obs(next_obs)
        
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        
        if elapsed_time > time_limit:
            break
        
        time.sleep(0.01)
    
    if e % 10 == 0:
        agent.update_target_model()
    
    episode_time = time.time() - start_time
    average_speed = np.mean(episode_speeds) if episode_speeds else 0
    
    # Modified print statement to handle potential NumPy arrays
    print(f"Episode: {e}/{n_episodes}, Total Reward: {float(total_reward):.2f}, Epsilon: {agent.epsilon:.2f}, "
          f"Time: {episode_time:.2f}s, Steps: {step_count}, Avg Speed: {float(average_speed):.2f}")

    time.sleep(0.1)

torch.save(agent.model.state_dict(), 'tmrl_dqn_model.pth')

print("Training finished.")

Using device: cuda


  from .autonotebook import tqdm as notebook_tqdm
  states = torch.FloatTensor(states).to(device)
  return F.mse_loss(input, target, reduction=self.reduction)


Episode terminated (e.g., crash). Elapsed time: 18.35s
Episode: 0/3000, Total Reward: 816.75, Epsilon: 0.20, Time: 18.37s, Steps: 353, Avg Speed: 25.97


  print(f"Episode: {e}/{n_episodes}, Total Reward: {float(total_reward):.2f}, Epsilon: {agent.epsilon:.2f}, "


Episode terminated (e.g., crash). Elapsed time: 12.35s
Episode: 1/3000, Total Reward: 222.38, Epsilon: 0.06, Time: 12.37s, Steps: 233, Avg Speed: 13.84
Episode terminated (e.g., crash). Elapsed time: 8.20s
Episode: 2/3000, Total Reward: 86.02, Epsilon: 0.03, Time: 8.22s, Steps: 150, Avg Speed: 12.40
Episode terminated (e.g., crash). Elapsed time: 14.65s
Episode: 3/3000, Total Reward: 504.59, Epsilon: 0.01, Time: 14.67s, Steps: 279, Avg Speed: 21.67
Episode terminated (e.g., crash). Elapsed time: 4.75s
Episode: 4/3000, Total Reward: -63.25, Epsilon: 0.01, Time: 4.77s, Steps: 81, Avg Speed: 4.54
Episode terminated (e.g., crash). Elapsed time: 4.80s
Episode: 5/3000, Total Reward: 32.39, Epsilon: 0.01, Time: 4.82s, Steps: 82, Avg Speed: 16.15
Episode terminated (e.g., crash). Elapsed time: 4.75s
Episode: 6/3000, Total Reward: -18.56, Epsilon: 0.01, Time: 4.77s, Steps: 81, Avg Speed: 10.05
Episode terminated (e.g., crash). Elapsed time: 4.75s
Episode: 7/3000, Total Reward: -64.73, Epsilon: 



Episode terminated (e.g., crash). Elapsed time: 14.38s
Episode: 1945/3000, Total Reward: 215.28, Epsilon: 0.01, Time: 14.39s, Steps: 242, Avg Speed: 13.03
Episode terminated (e.g., crash). Elapsed time: 16.70s
Episode: 1946/3000, Total Reward: 457.60, Epsilon: 0.01, Time: 16.72s, Steps: 320, Avg Speed: 17.42
Episode terminated (e.g., crash). Elapsed time: 23.00s
Episode: 1947/3000, Total Reward: 832.00, Epsilon: 0.01, Time: 23.02s, Steps: 446, Avg Speed: 20.90
Episode terminated (e.g., crash). Elapsed time: 12.20s
Episode: 1948/3000, Total Reward: 277.53, Epsilon: 0.01, Time: 12.22s, Steps: 230, Avg Speed: 16.41
Episode terminated (e.g., crash). Elapsed time: 20.05s
Episode: 1949/3000, Total Reward: 706.34, Epsilon: 0.01, Time: 20.07s, Steps: 387, Avg Speed: 20.84
Episode terminated (e.g., crash). Elapsed time: 17.20s
Episode: 1950/3000, Total Reward: 421.75, Epsilon: 0.01, Time: 17.22s, Steps: 330, Avg Speed: 15.81
Episode terminated (e.g., crash). Elapsed time: 4.75s
Episode: 1951/30

In [3]:
pip install gym

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
     - ----------------------------------- 30.7/721.7 kB 100.9 kB/s eta 0:00:07
     - ----------------------------------- 30.7/721.7 kB 100.9 kB/s eta 0:00:07
     -- ----------------------------------- 41.0/721.7 kB 98.5 kB/s eta 0:00:07
     --- --------------------------------- 61.4/721.7 kB 131.3 kB/s eta 0:00:06
     ---- -------------------------------- 81.9/721.7 kB 164.0 kB/s eta 0:00:04
     ---- -------------------------------- 92.2/721.7 kB 169.3 kB/s eta 0:00:04
     ----- ----------

In [1]:
import numpy as np
from tmrl import get_environment

def visualize_observation(obs):
    speed = obs[0]
    lidar = obs[1]
    prev_actions = obs[2]

    print("Observation Details:")
    print(f"Speed: {speed}")
    
    print("\nLIDAR Data (shape: {lidar.shape}):")
    print(lidar)
    
    print("\nPrevious Actions:")
    print(prev_actions)
    
    print("\nPreprocessed Observation:")
    preprocessed = preprocess_obs(obs)
    print(f"Shape: {preprocessed.shape}")
    print(preprocessed)

# Use this function
env = get_environment()
obs, info = env.reset()
print(obs,info)

(array([0.], dtype=float32), array([1.], dtype=float32), array([0.], dtype=float32), array([[[ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  31,  31, ...,  31,  31,  31],
        ...,
        [217, 218, 218, ..., 215, 216, 215],
        [220, 220, 218, ..., 216, 218, 218],
        [220, 219, 217, ..., 216, 218, 218]],

       [[ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  31,  31, ...,  31,  31,  31],
        ...,
        [217, 218, 218, ..., 215, 216, 215],
        [220, 220, 218, ..., 216, 218, 218],
        [220, 219, 217, ..., 216, 218, 218]],

       [[ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  31,  31, ...,  31,  31,  31],
        ...,
        [217, 218, 218, ..., 215, 216, 215],
        [220, 220, 218, ..., 216, 218, 218],
        [220, 219, 217, ..., 216, 218, 218]],

       [[ 31,  31,  31, ...,  31,  31,  31],
        [ 31,  

In [2]:
obs

(array([0.], dtype=float32),
 array([1.], dtype=float32),
 array([0.], dtype=float32),
 array([[[ 31,  31,  31, ...,  31,  31,  31],
         [ 31,  31,  31, ...,  31,  31,  31],
         [ 31,  31,  31, ...,  31,  31,  31],
         ...,
         [217, 218, 218, ..., 215, 216, 215],
         [220, 220, 218, ..., 216, 218, 218],
         [220, 219, 217, ..., 216, 218, 218]],
 
        [[ 31,  31,  31, ...,  31,  31,  31],
         [ 31,  31,  31, ...,  31,  31,  31],
         [ 31,  31,  31, ...,  31,  31,  31],
         ...,
         [217, 218, 218, ..., 215, 216, 215],
         [220, 220, 218, ..., 216, 218, 218],
         [220, 219, 217, ..., 216, 218, 218]],
 
        [[ 31,  31,  31, ...,  31,  31,  31],
         [ 31,  31,  31, ...,  31,  31,  31],
         [ 31,  31,  31, ...,  31,  31,  31],
         ...,
         [217, 218, 218, ..., 215, 216, 215],
         [220, 220, 218, ..., 216, 218, 218],
         [220, 219, 217, ..., 216, 218, 218]],
 
        [[ 31,  31,  31, ...,  31,

In [3]:
info

{}

: 

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tmrl import get_environment
from collections import deque
import random
import time

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# DQN Model
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# DQN Agent
class DQNAgent:
    def __init__(self, state_shape, action_space):
        self.state_shape = state_shape
        self.action_space = action_space
        self.memory = deque(maxlen=100000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995
        self.model = DQN(state_shape, action_space).to(device)
        self.target_model = DQN(state_shape, action_space).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0005, weight_decay=1e-4)
        self.update_target_model()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, float(reward), next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_space)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.FloatTensor(np.array(states)).to(device)
        actions = torch.LongTensor(np.array(actions)).to(device)
        rewards = torch.FloatTensor(np.array(rewards)).to(device)
        next_states = torch.FloatTensor(np.array(next_states)).to(device)
        dones = torch.FloatTensor(np.array(dones)).to(device)

        current_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_model(next_states)
        max_next_q = torch.max(next_q_values, 1)[0]
        expected_q = rewards + (1 - dones) * self.gamma * max_next_q

        loss = F.smooth_l1_loss(current_q, expected_q.detach())

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Preprocessing functions
def preprocess_lidar(lidar):
    lidar = np.array(lidar)
    if len(lidar.shape) > 1:
        lidar = lidar.flatten()
    
    max_range = 100
    lidar = np.clip(lidar, 0, max_range)
    lidar_normalized = lidar / max_range
    lidar_log = np.log1p(lidar_normalized) / np.log1p(1)
    
    return lidar_log

def normalize_state(state):
    speed, lidar, prev_actions = state
    normalized_speed = np.array([speed / 300.0])  # Assuming max speed is 300
    normalized_lidar = preprocess_lidar(lidar)
    normalized_prev_actions = np.array(prev_actions) / 2.0  # Assuming actions are in [-1, 1]
    
    # Ensure all arrays are 1-dimensional
    normalized_speed = normalized_speed.flatten()
    normalized_lidar = normalized_lidar.flatten()
    normalized_prev_actions = normalized_prev_actions.flatten()
    
    return np.concatenate([normalized_speed, normalized_lidar, normalized_prev_actions])

def preprocess_obs(obs):
    speed = obs[0]
    lidar = obs[1]
    prev_actions = obs[2]
    
    return normalize_state((speed, lidar, prev_actions))

# Helper functions
def action_to_env_action(action):
    if action == 0:
        return np.array([1.0, 0.0, -1.0])  # Full throttle, turn left
    elif action == 1:
        return np.array([1.0, 0.0, 0.0])   # Full throttle, go straight
    else:
        return np.array([1.0, 0.0, 1.0])   # Full throttle, turn right

def calculate_reward(speed, progress, done, truncated):
    speed_reward = speed * 0.1
    progress_reward = progress * 10
    
    if done and not truncated:
        return -100  # Penalty for crashing
    elif truncated:
        return 1000 + progress_reward  # Bonus for completing the track
    else:
        return speed_reward + progress_reward

# Main training loop
def train_agent():
    env = get_environment()

    # Get an initial observation to determine the state shape
    initial_obs, _ = env.reset()
    try:
        initial_state = preprocess_obs(initial_obs)
        state_shape = initial_state.shape
        print(f"Processed state shape: {state_shape}")
    except Exception as e:
        print(f"Error preprocessing initial observation: {e}")
        print(f"Initial observation: {initial_obs}")
        return

    action_space = 3  # Left, Straight, Right

    agent = DQNAgent(state_shape[0], action_space)
    batch_size = 64
    n_episodes = 5000
    time_limit = 300  # 30 seconds time limit

    for e in range(n_episodes):
        obs, info = env.reset()
        try:
            state = preprocess_obs(obs)
        except Exception as e:
            print(f"Error preprocessing observation in episode {e}: {e}")
            print(f"Observation: {obs}")
            continue

        total_reward = 0
        done = False
        start_time = time.time()
        step_count = 0
        episode_speeds = []
        
        while not done:
            action = agent.act(state)
            env_action = action_to_env_action(action)
            next_obs, reward, terminated, truncated, info = env.step(env_action)
            
            try:
                next_state = preprocess_obs(next_obs)
            except Exception as e:
                print(f"Error preprocessing next observation in episode {e}, step {step_count}: {e}")
                print(f"Next observation: {next_obs}")
                break

            step_count += 1
            current_time = time.time()
            elapsed_time = current_time - start_time
            
            speed = next_obs[0]
            episode_speeds.append(speed)
            
            progress = info.get('progress', 0)
            done = terminated or truncated
            reward = calculate_reward(speed, progress, terminated, truncated)
            
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            if len(agent.memory) > batch_size:
                try:
                    agent.replay(batch_size)
                except Exception as e:
                    print(f"Error during replay in episode {e}, step {step_count}: {e}")
            
            if elapsed_time > time_limit:
                break
            
            time.sleep(0.01)
        
        if e % 5 == 0:
            agent.update_target_model()
        
        episode_time = time.time() - start_time
        average_speed = np.mean(episode_speeds) if episode_speeds else 0
        
        print(f"Episode: {e}/{n_episodes}, Total Reward: {float(total_reward):.2f}, Epsilon: {agent.epsilon:.2f}, "
              f"Time: {episode_time:.2f}s, Steps: {step_count}, Avg Speed: {float(average_speed):.2f}, Progress: {progress:.2f}")

        if e % 100 == 0:
            torch.save(agent.model.state_dict(), f'tmrl_dqn_model_episode_{e}.pth')

        time.sleep(0.1)

    torch.save(agent.model.state_dict(), 'tmrl_dqn_model_final.pth')
    print("Training finished.")

if __name__ == "__main__":
    train_agent()

Using device: cuda
Processed state shape: (80,)


  from .autonotebook import tqdm as notebook_tqdm
  self.memory.append((state, action, float(reward), next_state, done))
  print(f"Episode: {e}/{n_episodes}, Total Reward: {float(total_reward):.2f}, Epsilon: {agent.epsilon:.2f}, "


Episode: 0/5000, Total Reward: 814.73, Epsilon: 0.86, Time: 19.52s, Steps: 376, Avg Speed: 24.43, Progress: 0.00
Episode: 1/5000, Total Reward: 175.34, Epsilon: 0.80, Time: 7.12s, Steps: 128, Avg Speed: 21.51, Progress: 0.00
Episode: 2/5000, Total Reward: 80.85, Epsilon: 0.76, Time: 5.62s, Steps: 98, Avg Speed: 18.46, Progress: 0.00
Episode: 3/5000, Total Reward: 797.11, Epsilon: 0.62, Time: 20.87s, Steps: 403, Avg Speed: 22.33, Progress: 0.00
Episode: 4/5000, Total Reward: -24.47, Epsilon: 0.60, Time: 4.77s, Steps: 81, Avg Speed: 9.33, Progress: 0.00
Episode: 5/5000, Total Reward: 593.98, Epsilon: 0.50, Time: 18.52s, Steps: 356, Avg Speed: 19.50, Progress: 0.00
Episode: 6/5000, Total Reward: 60.27, Epsilon: 0.48, Time: 6.07s, Steps: 107, Avg Speed: 14.99, Progress: 0.00
Episode: 7/5000, Total Reward: 206.90, Epsilon: 0.43, Time: 9.82s, Steps: 182, Avg Speed: 16.87, Progress: 0.00




Episode: 8/5000, Total Reward: 205.03, Epsilon: 0.39, Time: 11.77s, Steps: 210, Avg Speed: 14.53, Progress: 0.00
Episode: 9/5000, Total Reward: 426.79, Epsilon: 0.34, Time: 14.07s, Steps: 267, Avg Speed: 19.73, Progress: 0.00
Episode: 10/5000, Total Reward: 804.07, Epsilon: 0.29, Time: 18.47s, Steps: 355, Avg Speed: 25.55, Progress: 0.00
Episode: 11/5000, Total Reward: -43.22, Epsilon: 0.28, Time: 4.77s, Steps: 81, Avg Speed: 7.01, Progress: 0.00
Episode: 12/5000, Total Reward: -51.49, Epsilon: 0.26, Time: 6.17s, Steps: 109, Avg Speed: 4.46, Progress: 0.00
Episode: 13/5000, Total Reward: 168.71, Epsilon: 0.25, Time: 6.07s, Steps: 107, Avg Speed: 25.13, Progress: 0.00
Episode: 14/5000, Total Reward: 317.68, Epsilon: 0.23, Time: 9.67s, Steps: 179, Avg Speed: 23.33, Progress: 0.00
Episode: 15/5000, Total Reward: -49.68, Epsilon: 0.22, Time: 4.87s, Steps: 83, Avg Speed: 6.07, Progress: 0.00
Episode: 16/5000, Total Reward: 10.77, Epsilon: 0.20, Time: 7.17s, Steps: 129, Avg Speed: 8.59, Prog