<a href="https://colab.research.google.com/github/Perfect-Cube/Volkswagon-imobilothon-4.0/blob/main/DDPG_QLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tensorflow gym numpy




In [3]:
import gym
from gym import spaces
import numpy as np
import tensorflow as tf
import random
from collections import deque

# Define the Environment
class BatteryOptimizationEnv(gym.Env):
    def __init__(self):
        super(BatteryOptimizationEnv, self).__init__()
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)  # Acceleration/deceleration
        self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)  # Battery, speed, etc.
        self.state = np.array([1.0, 0.5, 0.0, 0.5, 0.5])  # Arbitrary initial state
        self.battery_level = 1.0

    def reset(self):
        self.state = np.array([1.0, 0.5, 0.0, 0.5, 0.5])
        self.battery_level = 1.0
        return self.state

    def step(self, action):
        battery_usage = np.abs(action[0]) * 0.01  # Simplified battery consumption model
        self.battery_level -= battery_usage
        speed = self.state[1] + action[0] * 0.1
        self.state = np.clip(np.array([self.battery_level, speed, self.state[2], self.state[3], self.state[4]]), 0, 1)
        reward = -battery_usage
        done = self.battery_level <= 0.1
        return self.state, reward, done, {}

# Define the DDPG Agent
class DDPGAgent:
    def __init__(self, env):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.gamma = 0.99
        self.tau = 0.005
        self.buffer_capacity = 50000
        self.batch_size = 64
        self.noise_std_dev = 0.2
        self.buffer = deque(maxlen=self.buffer_capacity)
        self.actor_model = self.build_actor()
        self.critic_model = self.build_critic()
        self.target_actor = self.build_actor()
        self.target_critic = self.build_critic()
        self.update_target_networks(tau=1.0)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

    def build_actor(self):
        inputs = tf.keras.layers.Input(shape=(self.state_size,)) # The shape should be a tuple
        out = tf.keras.layers.Dense(256, activation="relu")(inputs)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(self.action_size, activation="tanh")(out)
        return tf.keras.Model(inputs, outputs)

    def build_critic(self):
        state_input = tf.keras.layers.Input(shape=(self.state_size,)) # The shape should be a tuple
        action_input = tf.keras.layers.Input(shape=(self.action_size,)) # The shape should be a tuple
        concat = tf.keras.layers.Concatenate()([state_input, action_input])
        out = tf.keras.layers.Dense(256, activation="relu")(concat)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(1)(out)
        return tf.keras.Model([state_input, action_input], outputs)

    def update_target_networks(self, tau=None):
        tau = self.tau if tau is None else tau
        for target_weights, main_weights in zip(self.target_actor.weights, self.actor_model.weights):
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)
        for target_weights, main_weights in zip(self.target_critic.weights, self.critic_model.weights):
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)

    def add_to_buffer(self, experience):
        self.buffer.append(experience)

    def sample_from_buffer(self):
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def train(self):
        if len(self.buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.sample_from_buffer()
        next_actions = self.target_actor(next_states)
        target_qs = rewards + self.gamma * (1 - dones) * tf.squeeze(self.target_critic([next_states, next_actions]))
        with tf.GradientTape() as tape:
            critic_qs = tf.squeeze(self.critic_model([states, actions]))
            critic_loss = tf.keras.losses.MSE(critic_qs, target_qs)
        grads = tape.gradient(critic_loss, self.critic_model.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(grads, self.critic_model.trainable_variables))
        with tf.GradientTape() as tape:
            actions_pred = self.actor_model(states)
            actor_loss = -tf.reduce_mean(self.critic_model([states, actions_pred]))
        grads = tape.gradient(actor_loss, self.actor_model.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(grads, self.actor_model.trainable_variables))
        self.update_target_networks()

    def act(self, state, noise=0.1):
        state = np.expand_dims(state, axis=0)
        action = tf.squeeze(self.actor_model(state)).numpy()
        action += noise * np.random.normal(scale=self.noise_std_dev)
        return np.clip(action, -1, 1)

    def train_agent(self, episodes=1000):
        for episode in range(episodes):
            state = self.env.reset()
            episode_reward = 0
            done = False
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                self.add_to_buffer((state, action, reward, next_state, done))
                self.train()
                episode_reward += reward
                state = next_state
            print(f"Episode {episode+1}: Reward: {episode_reward}")

# Train the agent
env = BatteryOptimizationEnv()
agent = DDPGAgent(env)
agent.train_agent(episodes=100)


IndexError: invalid index to scalar variable.

In [8]:
import gym
from gym import spaces
import numpy as np
import tensorflow as tf
import random
from collections import deque

# Define the Environment
class BatteryOptimizationEnv(gym.Env):
    def __init__(self):
        super(BatteryOptimizationEnv, self).__init__()
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)  # Acceleration/deceleration
        self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)  # Battery, speed, etc.
        self.state = np.array([1.0, 0.5, 0.0, 0.5, 0.5])  # Arbitrary initial state
        self.battery_level = 1.0

    def reset(self):
        self.state = np.array([1.0, 0.5, 0.0, 0.5, 0.5])
        self.battery_level = 1.0
        return self.state

    def step(self, action):
        battery_usage = np.abs(action[0]) * 0.01  # Simplified battery consumption model
        self.battery_level -= battery_usage
        speed = self.state[1] + action[0] * 0.1
        self.state = np.clip(np.array([self.battery_level, speed, self.state[2], self.state[3], self.state[4]]), 0, 1)
        reward = -battery_usage
        done = self.battery_level <= 0.1
        return self.state, reward, done, {}

# Define the DDPG Agent
class DDPGAgent:
    def __init__(self, env):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.gamma = 0.99
        self.tau = 0.005
        self.buffer_capacity = 50000
        self.batch_size = 64
        self.noise_std_dev = 0.2
        self.buffer = deque(maxlen=self.buffer_capacity)
        self.actor_model = self.build_actor()
        self.critic_model = self.build_critic()
        self.target_actor = self.build_actor()
        self.target_critic = self.build_critic()
        self.update_target_networks(tau=1.0)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

    def build_actor(self):
        inputs = tf.keras.layers.Input(shape=(self.state_size,))
        out = tf.keras.layers.Dense(256, activation="relu")(inputs)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(self.action_size, activation="tanh")(out)
        return tf.keras.Model(inputs, outputs)

    def build_critic(self):
        state_input = tf.keras.layers.Input(shape=(self.state_size,))  # Corrected shape here
        action_input = tf.keras.layers.Input(shape=(self.action_size,))
        concat = tf.keras.layers.Concatenate()([state_input, action_input])
        out = tf.keras.layers.Dense(256, activation="relu")(concat)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(1)(out)
        return tf.keras.Model([state_input, action_input], outputs)

    def update_target_networks(self, tau=None):
        tau = self.tau if tau is None else tau
        for target_weights, main_weights in zip(self.target_actor.weights, self.actor_model.weights):
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)
        for target_weights, main_weights in zip(self.target_critic.weights, self.critic_model.weights):
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)

    def add_to_buffer(self, experience):
        self.buffer.append(experience)

    def sample_from_buffer(self):
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def train(self):
        if len(self.buffer) < self.batch_size:
            return

        # Sample a batch from the buffer
        states, actions, rewards, next_states, dones = self.sample_from_buffer()

        # Convert rewards and dones to tensors with the right shape
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)

        # Compute target Q-values
        next_actions = self.target_actor(next_states)
        target_qs = self.target_critic([next_states, next_actions])
        target_qs = rewards + self.gamma * (1 - dones) * tf.squeeze(target_qs)

        # Update Critic Network
        with tf.GradientTape() as tape:
            critic_qs = self.critic_model([states, actions])
            critic_loss = tf.keras.losses.MSE(target_qs, tf.squeeze(critic_qs))
        critic_grads = tape.gradient(critic_loss, self.critic_model.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic_model.trainable_variables))

        # Update Actor Network
        with tf.GradientTape() as tape:
            actions_pred = self.actor_model(states)
            actor_loss = -tf.reduce_mean(self.critic_model([states, actions_pred]))
        actor_grads = tape.gradient(actor_loss, self.actor_model.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor_model.trainable_variables))

        # Update target networks
        self.update_target_networks()

    def act(self, state, noise=0.1):
      state = np.expand_dims(state, axis=0)
      action = self.actor_model(state).numpy()  # Remove tf.squeeze
      action += noise * np.random.normal(scale=self.noise_std_dev, size=self.action_size)  # Add size to noise
      return np.clip(action, -1, 1)

    def train_agent(self, episodes=1000):
        for episode in range(episodes):
            state = self.env.reset()
            episode_reward = 0
            done = False
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                self.add_to_buffer((state, action, reward, next_state, done))
                self.train()
                episode_reward += reward
                state = next_state
            print(f"Episode {episode+1}: Reward: {episode_reward}")

# Train the agent
env = BatteryOptimizationEnv()
agent = DDPGAgent(env)
agent.train_agent(episodes=100)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

In [15]:
import gym
from gym import spaces
import numpy as np
import random

# Define the Environment
class BatteryOptimizationEnv(gym.Env):
    def __init__(self):
        super(BatteryOptimizationEnv, self).__init__()
        # Discretize state and action spaces for Q-learning
        self.battery_levels = np.linspace(0, 1, 11)  # Battery level from 0.0 to 1.0 in steps of 0.1
        self.speeds = np.linspace(0, 1, 11)  # Speed level from 0.0 to 1.0 in steps of 0.1
        self.action_space = spaces.Discrete(3)  # -1, 0, +1 acceleration values
        self.observation_space = spaces.Discrete(len(self.battery_levels) * len(self.speeds))

        # Define initial state
        self.state = (10, 5)  # Initial battery level and speed index (1.0 battery, 0.5 speed)
        self.done = False

    def reset(self):
        self.state = (10, 5)  # Reset battery to full and moderate speed
        self.done = False
        return self.state

    def step(self, action):
        battery_idx, speed_idx = self.state

        # Define battery consumption based on action
        if action == 0:  # Decelerate
            speed_idx = max(0, speed_idx - 1)
            battery_consumed = 0.01
        elif action == 1:  # Maintain speed
            battery_consumed = 0.02
        else:  # Accelerate
            speed_idx = min(len(self.speeds) - 1, speed_idx + 1)
            battery_consumed = 0.03

        # Update battery level
        battery_idx = max(0, battery_idx - int(battery_consumed * 10))
        self.state = (battery_idx, speed_idx)

        # Calculate reward
        reward = -battery_consumed
        self.done = battery_idx <= 20  # End episode if battery level is too low

        return self.state, reward, self.done, {}

    def get_state_index(self, state):
        battery_idx, speed_idx = state
        return battery_idx * len(self.speeds) + speed_idx

# Define the Q-learning Agent
class QLearningAgent:
    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995):
        self.env = env
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    def choose_action(self, state):
        if random.uniform(0, 1) < self.exploration_rate:
            return self.env.action_space.sample()  # Explore
        state_index = self.env.get_state_index(state)
        return np.argmax(self.q_table[state_index])  # Exploit

    def update_q_table(self, state, action, reward, next_state):
        state_index = self.env.get_state_index(state)
        next_state_index = self.env.get_state_index(next_state)

        # Q-learning update rule
        best_next_action = np.argmax(self.q_table[next_state_index])
        td_target = reward + self.discount_factor * self.q_table[next_state_index, best_next_action]
        self.q_table[state_index, action] += self.learning_rate * (td_target - self.q_table[state_index, action])

        # Print the Q-table after each update
        print(f"Updated Q-table for state {state} and action {action}:\n{self.q_table}")

    def train(self, episodes=10):
        for episode in range(episodes):
            state = self.env.reset()
            total_reward = 0
            done = False

            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)

                # Update Q-table
                self.update_q_table(state, action, reward, next_state)
                total_reward += reward
                state = next_state

            # Decay exploration rate
            self.exploration_rate *= self.exploration_decay
            print(f"Episode {episode + 1}: Total Reward: {total_reward}")

# Train the agent
env = BatteryOptimizationEnv()
agent = QLearningAgent(env)
agent.train(episodes=10)


Updated Q-table for state (10, 5) and action 2:
[[ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.     0.   ]
 [ 0.     0.    