In [4]:
import numpy as np
import tensorflow as tf
# ===== Q-Learning Example =====
# Toy environment: a few discrete states, each with the same set of actions.
n_states = 5  # size of the state space
n_actions = 2  # size of the action space

# Hyperparameters for the tabular update
alpha = 0.1  # step size applied to each temporal-difference error
gamma = 0.9  # weight on the bootstrapped next-state value
epsilon = 0.1  # probability of taking a random action

# Action-value table: one row per state, one column per action, all zeros.
Q = np.zeros((n_states, n_actions))

# Example learning loop
for episode in range(1000):
    # Every episode begins in a uniformly random state.
    state = np.random.randint(0, n_states)

    steps = 0  # step counter that bounds the episode length
    done = False
    while not done and steps < 100:
        # Epsilon-greedy: one uniform draw decides between exploiting and exploring.
        if np.random.uniform(0, 1) >= epsilon:
            action = np.argmax(Q[state, :])  # greedy w.r.t. the current table
        else:
            action = np.random.randint(0, n_actions)  # random action

        # The simulated environment answers with a random successor and reward.
        next_state = np.random.randint(0, n_states)
        reward = np.random.uniform(-1, 1)

        # Move Q[state, action] a fraction alpha toward the TD target
        # (reward plus discounted best value of the next state).
        td_target = reward + gamma * np.max(Q[next_state, :])
        Q[state, action] = Q[state, action] + alpha * (td_target - Q[state, action])

        state = next_state
        steps += 1

        # The episode is over once the 100-step budget is used up.
        done = steps >= 100



In [7]:
# ===== Policy-Gradient REINFORCE Example =====
# Dimensions of the policy network
n_states = 4  # number of input features per state vector
n_actions = 2  # number of actions the policy chooses between

# Assemble the policy network piece by piece: an explicit input spec,
# one hidden ReLU layer, and a softmax head that outputs one probability
# per action.
policy_input = tf.keras.Input(shape=(n_states,))
hidden_layer = tf.keras.layers.Dense(24, activation='relu')
output_layer = tf.keras.layers.Dense(n_actions, activation='softmax')
model = tf.keras.Sequential([policy_input, hidden_layer, output_layer])

# Adam optimizer used to apply the policy-gradient updates
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Sample an action from the distribution the policy network outputs
def get_action(state):
    """Draw one action index at random according to the policy's probabilities.

    Args:
        state: a single state; a leading batch axis is added before it is
            passed to the module-level ``model``.

    Returns:
        An index in ``range(n_actions)`` drawn by ``np.random.choice`` using
        the first (only) row of the model output as the probabilities.
    """
    batched_state = state[None, :]  # None is np.newaxis: batch of size one
    probabilities = model(batched_state).numpy()[0]
    return np.random.choice(n_actions, p=probabilities)

# Trajectory buffers for the episode currently being collected
states = []
actions = []
rewards = []

# Constants for this example run (kept local to this cell so it does not
# depend on values defined by other cells).
n_episodes = 1000
discount = 0.9  # factor applied to future rewards when computing returns
termination_prob = 0.1  # chance that any single step ends the episode
min_prob = 1e-8  # floor applied to probabilities before taking the log

# Example learning loop
for episode in range(n_episodes):
    state = np.random.rand(n_states)  # Example random state

    # Collect one episode. At least one transition is always recorded,
    # because the stopping check happens after the step is stored.
    done = False
    while not done:
        # Sample an action from the policy
        action = get_action(state)
        next_state = np.random.rand(n_states)  # Simulate next state
        reward = np.random.uniform(-1, 1)  # Simulate reward

        # Store trajectory
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

        # Stopping condition (random here for simplicity); the flag itself
        # now ends the loop instead of an unconditional-looking `break`.
        done = np.random.rand() < termination_prob

    # Compute discounted cumulative rewards by walking the episode backwards:
    # G_t = r_t + discount * G_{t+1}, with G past the last step equal to 0.
    cumulative_rewards = np.zeros(len(rewards), dtype=np.float64)
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + discount * running_return
        cumulative_rewards[t] = running_return

    # Explicit cast so the returns are combined with the log-probabilities
    # in float32 rather than relying on an implicit conversion.
    returns = cumulative_rewards.astype(np.float32)

    # Update policy using the REINFORCE algorithm
    with tf.GradientTape() as tape:
        action_probs = model(np.array(states, dtype=np.float32))
        action_masks = tf.one_hot(actions, n_actions)
        # Clip before the log: a probability of exactly 0 would give -inf,
        # and 0 * -inf in the masked sum below would turn the loss into NaN.
        safe_probs = tf.clip_by_value(action_probs, min_prob, 1.0)
        # The one-hot mask keeps only the log-probability of the action taken.
        log_probs = tf.reduce_sum(action_masks * tf.math.log(safe_probs), axis=1)
        # Negated so that minimising the loss increases return-weighted
        # log-probability.
        loss = -tf.reduce_mean(log_probs * returns)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Clear trajectory for next episode
    states, actions, rewards = [], [], []


In [9]:
# Q-learning is a value-based method: it estimates the Q-values of state-action pairs and is effective for discrete action spaces. It often employs the epsilon-greedy strategy for exploration.

In [10]:
# In contrast, policy gradient methods directly learn the policy that maps states to actions, making them suitable for continuous or high-dimensional action spaces. They optimize the policy by gradient ascent on the expected return (equivalently, gradient descent on the negated objective).

In [13]:
# Q learning is typically used for simpler environments with discrete actions, while policy gradients are better suited for more complex tasks with continuous actions, such as robotics.

In [14]:
# Q-learning efficiently learns the best action by updating its action-value function (its Q-function) using rewards obtained from the environment.

In [15]:
#  In contrast, policy gradients are more appropriate for environments with continuous action spaces, such as controlling a robotic arm, where actions involve fine, continuous adjustments like varying joint angles. This approach is advantageous in environments where randomness and exploration are essential for finding optimal solutions.

In [16]:
# Use Q learning when the action space is small and discrete. You need a straightforward approach to estimate state action values. The environment is less complex and can be modeled with a Q table.

In [17]:
# Use policy gradients when the action space is large or continuous. You need a more flexible approach that can handle complex policies. You are working with high dimensional environments, like video games or robotics.