In [1]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Define Deep Q-Network (DQN) model
class DQN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [3]:
# Define replay buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def add(self, state, action, reward, next_state, done):
        experience = (state, action, reward, next_state, done)
        self.buffer.append(experience)
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)


In [4]:
# Define Deep Q-Learning agent
class DQNAgent:
    def __init__(self, input_size, output_size, hidden_size=64, learning_rate=0.001,
                 gamma=0.99, epsilon_start=1.0, epsilon_decay=0.995, epsilon_min=0.01,
                 replay_buffer_capacity=10000, batch_size=64):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.replay_buffer = ReplayBuffer(replay_buffer_capacity)
        self.batch_size = batch_size
        self.q_network = DQN(input_size, hidden_size, output_size)
        self.target_network = DQN(input_size, hidden_size, output_size)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.loss_function = nn.MSELoss()

    def epsilon_greedy_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(range(self.output_size))
        else:
            with torch.no_grad():
                q_values = self.q_network(torch.FloatTensor(state))
            return torch.argmax(q_values).item()

    def train(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer.buffer) > self.batch_size:
            batch = self.replay_buffer.sample(self.batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)
            states = torch.FloatTensor(states)
            actions = torch.LongTensor(actions)
            rewards = torch.FloatTensor(rewards)
            next_states = torch.FloatTensor(next_states)
            dones = torch.FloatTensor(dones)
            q_values = self.q_network(states)
            next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
            q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
            loss = self.loss_function(q_values, target_q_values.detach())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())


In [5]:
# Initialize environment and agent
env = gym.make('CartPole-v1')
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
agent = DQNAgent(input_size, output_size)

In [6]:
# Training
num_episodes = 1000  # Define the number of episodes
max_steps_per_episode = 500  # Maximum number of steps per episode

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    step = 0  # Initialize step counter for the current episode

    while not done and step < max_steps_per_episode:
        action = agent.epsilon_greedy_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        step += 1  # Increment step counter

    if episode % 100 == 0:
        agent.update_target_network()

    print(f"Episode {episode}, Total Reward: {total_reward}, Steps: {step}")

# Close the environment
env.close()

ValueError: too many values to unpack (expected 4)

In [None]:
#OnlyRuns ON GOOGLE COLAB