In [3]:
import gym
import numpy as np
import tensorflow as tf

# Environment: CartPole, with network dimensions read from its spaces
env = gym.make('CartPole-v1')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Hyperparameters: discount factor and the optimizer that updates the Q-network
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
gamma = 0.99

# Q-network: two 24-unit ReLU hidden layers followed by a linear output layer
# with one unit per action (no activation, so outputs are unbounded)
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Dense(24, activation='relu', input_shape=(state_size,))
)
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(action_size))

# Q-learning algorithm
def q_learning(state, action, reward, next_state, done):
    """Apply one temporal-difference update to the Q-network for a transition.

    The target is ``reward`` for a terminal transition and
    ``reward + gamma * max(Q(next_state))`` otherwise. The network is then
    moved towards that target by one optimizer step on the squared error of
    the Q-value of the action that was taken.

    Args:
        state: Observation the action was taken from, passed to ``model``
            as-is (the training loop supplies shape ``(1, state_size)``).
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Observation after the action, same layout as ``state``.
        done: True when the transition ended the episode.

    Returns:
        ``reward`` unchanged, so the caller can accumulate it.
    """
    if done:
        # A terminal state has no future return, so do not bootstrap from it.
        target = reward
    else:
        # Evaluated outside the tape so the bootstrap value is a constant
        # with respect to the gradient. Calling the model directly avoids
        # the per-call overhead of predict() for a single-row input.
        next_q_values = model(next_state, training=False)
        target = reward + gamma * float(tf.reduce_max(next_q_values))
    with tf.GradientTape() as tape:
        q_values = model(state, training=True)
        # The one-hot mask keeps only the Q-value of the action taken.
        action_mask = tf.one_hot(action, action_size)
        selected_q_value = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = tf.reduce_mean(tf.square(target - selected_q_value))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return reward

# Training loop: run a fixed number of episodes, updating the Q-network after
# every environment step, then report the mean undiscounted episode return.
num_episodes = 10
total_reward = 0
for episode in range(num_episodes):
    # Gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. Accept both.
    reset_result = env.reset()
    if isinstance(reset_result, tuple):
        reset_result = reset_result[0]
    state = np.reshape(reset_result, [1, state_size])
    done = False
    episode_reward = 0

    while not done:
        # Greedy action from the current Q-estimates. A direct model call is
        # used because predict() has significant overhead for one input row.
        # NOTE(review): there is no exploration (e.g. epsilon-greedy), so the
        # agent only ever sees the actions its initial weights prefer.
        action = int(np.argmax(model(state, training=False).numpy()))

        # Gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older releases return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
            terminated = done

        next_state = np.reshape(next_state, [1, state_size])
        # Only true termination is passed on: a time-limit truncation ends
        # the episode here but is not a terminal state of the task.
        episode_reward += q_learning(
            state, action, reward, next_state, terminated
        )
        state = next_state

    total_reward += episode_reward

average_reward = total_reward / num_episodes
print(f"Average Reward: {average_reward}")


Average Reward: 9.3
