In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import random
import gym
import matplotlib.pyplot as plt  # Import for plotting

EPISODES = 60
BATCH_SIZE = 64
GAMMA = 0.99 
EPSILON_DECAY = 0.995 
EPSILON_MIN = 0.01 
LEARNING_RATE = 0.001
TAU = 0.125  

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []
        self.epsilon = 1.0  
        self.model = self.build_model() 
        self.target_model = self.build_model() 
        self.update_target_model()  
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    def build_model(self):
        model = models.Sequential()
        model.add(layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))  
        return model

    def update_target_model(self):
        target_weights = self.target_model.get_weights()
        model_weights = self.model.get_weights()
        new_weights = [TAU * model_w + (1 - TAU) * target_w for model_w, target_w in zip(model_weights, target_weights)]
        self.target_model.set_weights(new_weights)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def epsilon_greedy(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(np.expand_dims(state, axis=0))  
        return np.argmax(q_values[0]) 
    def replay(self):
        if len(self.memory) < BATCH_SIZE:
            return

        minibatch = random.sample(self.memory, BATCH_SIZE)
        states = np.array([item[0] for item in minibatch])
        actions = np.array([item[1] for item in minibatch])
        rewards = np.array([item[2] for item in minibatch])
        next_states = np.array([item[3] for item in minibatch])
        dones = np.array([item[4] for item in minibatch])

        next_q_values = self.target_model(next_states) 
        max_next_q_values = np.max(next_q_values, axis=1)  
        target = rewards + GAMMA * max_next_q_values * (1 - dones)

        with tf.GradientTape() as tape:
            q_values = self.model(states)  
            q_action = tf.reduce_sum(q_values * tf.one_hot(actions, self.action_size), axis=1)  
            loss = tf.reduce_mean(tf.square(target - q_action))  

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        self.update_target_model()

        if self.epsilon > EPSILON_MIN:
            self.epsilon *= EPSILON_DECAY


def train_dqn():
    env = gym.make('CartPole-v1', render_mode="human")
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    rewards_per_episode = []  # List to store total rewards for each episode

    for episode in range(EPISODES):
        state = env.reset()
        if isinstance(state, tuple):  
            state = state[0]
        state = np.array(state) 
        done = False
        total_reward = 0

        while not done:
            action = agent.epsilon_greedy(state)  
            next_state, reward, done, truncated, _ = env.step(action) 
            if isinstance(next_state, tuple):  
                next_state = next_state[0]
            next_state = np.array(next_state)  

            agent.remember(state, action, reward, next_state, done)  
            agent.replay() 

            state = next_state 
            total_reward += reward

        rewards_per_episode.append(total_reward)  # Append total reward to the list
        print(f"Episode: {episode+1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.4f}")

    env.close()

    # Plot the rewards per episode
    plt.plot(rewards_per_episode)
    plt.title("Total Rewards Per Episode")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.show()

if __name__ == "__main__":
    train_dqn()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1, Total Reward: 26.0, Epsilon: 1.0000
Episode: 2, Total Reward: 44.0, Epsilon: 0.9655
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
Episode: 3, Total Reward: 16.0, Epsilon: 0.8911
Episode: 4, Total Reward: 19.0, Epsilon: 0.8102
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


KeyboardInterrupt: 