In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# DQN Agent class
class DQNAgent:
    def __init__(self, state_shape, num_actions):
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.model = self.build_model()

    def build_model(self):
        model = Sequential([
            Dense(64, activation='relu', input_shape=(self.state_shape,)),
            Dense(64, activation='relu'),
            Dense(self.num_actions)  # Output layer with one node for each action
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      loss='mean_squared_error')
        return model

    def select_action(self, state, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.randint(self.num_actions)
        else:
            state = np.expand_dims(state, axis=0)
            q_values = self.model.predict(state)
            return np.argmax(q_values)

    def train(self, states, target_Q):
        self.model.fit(states, target_Q, verbose=0)

# Helper function to sample a mini-batch from the replay buffer
def sample_mini_batch_from_replay_buffer(replay_buffer, batch_size):
    mini_batch_indices = np.random.randint(len(replay_buffer), size=batch_size)
    return [replay_buffer[i] for i in mini_batch_indices]

# Environment setup
def get_reward(current_state, goal_state):
    # Implement your reward function based on the current and goal states
    # For example, you can use negative distance as the reward
    distance = np.linalg.norm(current_state - goal_state)
    return -distance

# DQN hyperparameters
state_shape = 4  # (x_current, y_current, x_goal, y_goal)
num_actions = 4  # For example, 4 actions: up, down, left, right
batch_size = 32
num_episodes = 1000
discount_factor = 0.99
epsilon_start = 1.0
epsilon_end = 0.1
epsilon_decay_steps = 1000

# Initialize the DQN agent
agent = DQNAgent(state_shape, num_actions)

# Replay buffer to store experiences
replay_buffer = []

# Training loop
epsilon = epsilon_start
for episode in range(num_episodes):
    current_state = np.random.rand(2)  # Initialize the current state randomly
    goal_state = np.random.rand(2)     # Initialize the goal state randomly

    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy action selection
        action = agent.select_action(np.concatenate([current_state, goal_state]), epsilon)

        # Simulate the action and observe the next state and reward
        next_state = current_state.copy()  # For simplicity, we assume the agent moves by one step
        if action == 0:
            next_state[1] += 1  # Move up
        elif action == 1:
            next_state[1] -= 1  # Move down
        elif action == 2:
            next_state[0] -= 1  # Move left
        else:
            next_state[0] += 1  # Move right

        reward = get_reward(current_state, goal_state)

        # Store the experience in the replay buffer
        replay_buffer.append({
            'state': np.concatenate([current_state, goal_state]),
            'action': action,
            'reward': reward,
            'next_state': np.concatenate([next_state, goal_state]),
            'done': done
        })

        total_reward += reward
        current_state = next_state

        if len(replay_buffer) >= batch_size:
            # Sample a mini-batch from the replay buffer
            mini_batch = sample_mini_batch_from_replay_buffer(replay_buffer, batch_size)

            states = np.array([experience['state'] for experience in mini_batch])
            actions = np.array([experience['action'] for experience in mini_batch])
            rewards = np.array([experience['reward'] for experience in mini_batch])
            next_states = np.array([experience['next_state'] for experience in mini_batch])
            dones = np.array([experience['done'] for experience in mini_batch])

            # Compute the target Q-values based on the Bellman equation
            target_Q = rewards + discount_factor * np.amax(agent.model.predict(next_states), axis=1) * (1 - dones)

            # Create a mask to zero out the Q-values for terminal states
            target_Q *= (1 - dones)

            # Train the DQN agent with the mini-batch
            agent.train(states, target_Q)

    # Decay epsilon for epsilon-greedy exploration
    epsilon = max(epsilon_end, epsilon_start - episode / epsilon_decay_steps)

    print(f"Episode {episode}: Total Reward = {total_reward}")

# After training, you can use the trained model for making predictions:
# goal_state = np.array([goal_x, goal_y])  # Set the goal state for testing
# current_state = np.array([start_x, start_y])  # Set the current state for testing
# action = agent.select_action(np.concatenate([current_state, goal_state]), epsilon=0.0)
# print(f"Selected Action: {action}")


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2023-07-20 11:52:02.685156: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-20 11:52:03.088855: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-20 11:52:03.093037: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# DQN Agent class
class DQNAgent:
    def __init__(self, state_shape, num_actions):
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.model = self.build_model()

    def build_model(self):
        model = Sequential([
            Dense(64, activation='relu', input_shape=(self.state_shape,)),
            Dense(64, activation='relu'),
            Dense(self.num_actions)  # Output layer with one node for each action
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      loss='mean_squared_error')
        return model

    def select_action(self, state, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.randint(self.num_actions)
        else:
            state = np.expand_dims(state, axis=0)
            q_values = self.model.predict(state)
            return np.argmax(q_values)

    def train(self, states, target_Q):
        self.model.fit(states, target_Q, verbose=0)

In [3]:
# Helper function to sample a mini-batch from the replay buffer
def sample_mini_batch_from_replay_buffer(replay_buffer, batch_size):
    mini_batch_indices = np.random.randint(len(replay_buffer), size=batch_size)
    return [replay_buffer[i] for i in mini_batch_indices]

# Environment setup
def get_reward(current_state, goal_state):
    # Implement your reward function based on the current and goal states
    # For example, you can use negative distance as the reward
    distance = np.linalg.norm(current_state - goal_state)
    return -distance

In [4]:
# DQN hyperparameters
state_shape = 4  # (x_current, y_current, x_goal, y_goal)
num_actions = 4  # For example, 4 actions: up, down, left, right
batch_size = 32
num_episodes = 10
discount_factor = 0.99
epsilon_start = 1.0
epsilon_end = 0.1
epsilon_decay_steps = 1000

# Initialize the DQN agent
agent = DQNAgent(state_shape, num_actions)

# Replay buffer to store experiences
replay_buffer = []

In [6]:
# Training loop
epsilon = epsilon_start
for episode in range(num_episodes):
    current_state = np.random.rand(2)  # Initialize the current state randomly
    goal_state = np.random.rand(2)     # Initialize the goal state randomly

    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy action selection
        action = agent.select_action(np.concatenate([current_state, goal_state]), epsilon)

        # Simulate the action and observe the next state and reward
        next_state = current_state.copy()  # For simplicity, we assume the agent moves by one step
        if action == 0:
            next_state[1] += 1  # Move up
        elif action == 1:
            next_state[1] -= 1  # Move down
        elif action == 2:
            next_state[0] -= 1  # Move left
        else:
            next_state[0] += 1  # Move right

        reward = get_reward(current_state, goal_state)

        # Store the experience in the replay buffer
        replay_buffer.append({
            'state': np.concatenate([current_state, goal_state]),
            'action': action,
            'reward': reward,
            'next_state': np.concatenate([next_state, goal_state]),
            'done': done
        })

        total_reward += reward
        current_state = next_state

        if len(replay_buffer) >= batch_size:
            # Sample a mini-batch from the replay buffer
            mini_batch = sample_mini_batch_from_replay_buffer(replay_buffer, batch_size)

            states = np.array([experience['state'] for experience in mini_batch])
            actions = np.array([experience['action'] for experience in mini_batch])
            rewards = np.array([experience['reward'] for experience in mini_batch])
            next_states = np.array([experience['next_state'] for experience in mini_batch])
            dones = np.array([experience['done'] for experience in mini_batch])
            print("DONES: ",dones)

            # Compute the target Q-values based on the Bellman equation
            target_Q = rewards + discount_factor * np.amax(agent.model.predict(next_states), axis=1) * (1 - dones)

            # Create a mask to zero out the Q-values for terminal states
            target_Q *= (1 - dones)
            print("1-dones: ", target_Q)

    # Decay epsilon for epsilon-greedy exploration
    epsilon = max(epsilon_end, epsilon_start - episode / epsilon_decay_steps)

    print(f"Episode {episode}: Total Reward = {total_reward}")

# After training, you can use the trained model for making predictions:
# goal_state = np.array([goal_x, goal_y])  # Set the goal state for testing
# current_state = np.array([start_x, start_y])  # Set the current state for testing
# action = agent.select_action(np.concatenate([current_state, goal_state]), epsilon=0.0)
# print(f"Selected Action: {action}")

DONES:  [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False]
1-dones:  [ -4.35691161  -5.74569641  -3.78709321  -4.82001159  -1.71241617
  -6.39225821  -6.26684928  -8.83537026 -10.83245268  -4.82001159
  -6.26684928 -12.44371289  -8.83537026  -6.43353659 -12.44371289
  -9.41819356  -9.3382988   -9.41819356  -9.3382988   -4.82001159
 -12.19023078  -4.35691161  -6.43353659 -10.70312753  -6.43353659
 -11.96068226  -8.01967294 -11.5831928  -11.07157232  -4.1011998
  -9.25412193  -8.92970202]
DONES:  [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False]
1-dones:  [ -4.1011998  -10.78017974  -3.28533841 -10.02754388  -9.41819356
 -11.96068226 -12.48619483  -6.91730974 -12.44371289  -1.71241617
  -9.10481059  -4

KeyboardInterrupt: 

In [8]:
print(1-True)

0
