In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random


# Define the network architecture
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Define the replay buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.index = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.index] = (state, action, reward, next_state, done)
        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        batch = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in batch:
            state, action, reward, next_state, done = self.buffer[i]
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
        return (
            torch.tensor(np.array(states)).float(),
            torch.tensor(np.array(actions)).long(),
            torch.tensor(np.array(rewards)).unsqueeze(1).float(),
            torch.tensor(np.array(next_states)).float(),
            torch.tensor(np.array(dones)).unsqueeze(1).int()
        )

    def __len__(self):
        return len(self.buffer)


# Define the Double DQN agent
class DDQNAgent:
    def __init__(self, state_size, action_size, seed, learning_rate=1e-3, capacity=1000000,
                 discount_factor=0.99, tau=1e-3, update_every=4, batch_size=64):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.tau = tau
        self.update_every = update_every
        self.batch_size = batch_size
        self.steps = 0

        self.qnetwork_local = QNetwork(state_size, action_size)
        self.qnetwork_target = QNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)
        self.replay_buffer = ReplayBuffer(capacity)
        self.update_target_network()

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay buffer
        self.replay_buffer.push(state, action, reward, next_state, done)

        # Learn every update_every steps
        self.steps += 1
        if self.steps % self.update_every == 0:
            if len(self.replay_buffer) > self.batch_size:
                experiences = self.replay_buffer.sample(self.batch_size)
                self.learn(experiences)

    def act(self, state, eps=0.0):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + self.discount_factor * (Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions.view(-1, 1))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def update_target_network(self):
        # Update target network parameters with polyak averaging
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

In [None]:
import gym
import numpy as np
from ddqn import DDQNAgent
import matplotlib.pyplot as plt

# Create the environment
env = gym.make('CartPole-v1',render_mode="human")

# Get the state and action sizes
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Set the random seed
seed = 0

# Create the DDQN agent
agent = DDQNAgent(state_size, action_size, seed)

# Set the number of episodes and the maximum number of steps per episode
num_episodes = 1500
max_steps = 1500

# Set the exploration rate
eps = eps_start = 1.0
eps_end = 0.01
eps_decay = 0.995

# Set the rewards and scores lists
rewards = []
scores = []

# Run the training loop
for i_episode in range(num_episodes):
    print(f'Episode: {i_episode}')
    # Initialize the environment and the state
    state = env.reset()[0]
    score = 0
    # eps = eps_end + (eps_start - eps_end) * np.exp(-i_episode / eps_decay)
    # Update the exploration rate
    eps = max(eps_end, eps_decay * eps)
    
    # Run the episode
    for t in range(max_steps):
        # Select an action and take a step in the environment
        action = agent.act(state, eps)
        next_state, reward, done, trunc, _ = env.step(action)
        # Store the experience in the replay buffer and learn from it
        agent.step(state, action, reward, next_state, done)
        # Update the state and the score
        state = next_state
        score += reward
        # Break the loop if the episode is done or truncated
        if done or trunc:
            break
        
    print(f"\tScore: {score}, Epsilon: {eps}")
    # Save the rewards and scores
    rewards.append(score)
    scores.append(np.mean(rewards[-100:]))

# Close the environment
env.close()

plt.ylabel("Score")
plt.xlabel("Episode")
plt.plot(range(len(rewards)), rewards)
plt.plot(range(len(rewards)), scores)
plt.legend(['Reward', "Score"])
plt.show()

Episode: 0
	Score: 13.0, Epsilon: 0.995
Episode: 1
	Score: 13.0, Epsilon: 0.990025
Episode: 2
	Score: 12.0, Epsilon: 0.985074875
Episode: 3
	Score: 37.0, Epsilon: 0.9801495006250001
Episode: 4
	Score: 46.0, Epsilon: 0.9752487531218751
Episode: 5
	Score: 19.0, Epsilon: 0.9703725093562657
Episode: 6
	Score: 20.0, Epsilon: 0.9655206468094844
Episode: 7
	Score: 45.0, Epsilon: 0.960693043575437
Episode: 8
	Score: 26.0, Epsilon: 0.9558895783575597
Episode: 9
	Score: 9.0, Epsilon: 0.9511101304657719
Episode: 10
	Score: 26.0, Epsilon: 0.946354579813443
Episode: 11
	Score: 21.0, Epsilon: 0.9416228069143757
Episode: 12
	Score: 24.0, Epsilon: 0.9369146928798039
Episode: 13
	Score: 36.0, Epsilon: 0.9322301194154049
Episode: 14
	Score: 14.0, Epsilon: 0.9275689688183278
Episode: 15
	Score: 23.0, Epsilon: 0.9229311239742362
Episode: 16
	Score: 10.0, Epsilon: 0.918316468354365
Episode: 17
	Score: 25.0, Epsilon: 0.9137248860125932
Episode: 18
	Score: 13.0, Epsilon: 0.9091562615825302
Episode: 19
	Score

	Score: 9.0, Epsilon: 0.46444185833082485
Episode: 153
	Score: 9.0, Epsilon: 0.46211964903917074
Episode: 154
	Score: 19.0, Epsilon: 0.4598090507939749
Episode: 155
	Score: 14.0, Epsilon: 0.457510005540005
Episode: 156
	Score: 10.0, Epsilon: 0.45522245551230495
Episode: 157
	Score: 12.0, Epsilon: 0.4529463432347434
Episode: 158
	Score: 12.0, Epsilon: 0.4506816115185697
Episode: 159
	Score: 11.0, Epsilon: 0.4484282034609769
Episode: 160
	Score: 9.0, Epsilon: 0.446186062443672
Episode: 161
	Score: 11.0, Epsilon: 0.4439551321314536
Episode: 162
	Score: 10.0, Epsilon: 0.4417353564707963
Episode: 163
	Score: 11.0, Epsilon: 0.43952667968844233
Episode: 164
	Score: 9.0, Epsilon: 0.43732904629000013
Episode: 165
	Score: 10.0, Epsilon: 0.4351424010585501
Episode: 166
	Score: 12.0, Epsilon: 0.43296668905325736
Episode: 167
	Score: 12.0, Epsilon: 0.43080185560799106
Episode: 168
	Score: 10.0, Epsilon: 0.4286478463299511
Episode: 169
	Score: 12.0, Epsilon: 0.42650460709830135
Episode: 170
	Score: 

	Score: 13.0, Epsilon: 0.22007483514733558
Episode: 302
	Score: 10.0, Epsilon: 0.2189744609715989
Episode: 303
	Score: 9.0, Epsilon: 0.2178795886667409
Episode: 304
	Score: 10.0, Epsilon: 0.2167901907234072
Episode: 305
	Score: 9.0, Epsilon: 0.21570623976979014
Episode: 306
	Score: 12.0, Epsilon: 0.21462770857094118
Episode: 307
	Score: 11.0, Epsilon: 0.21355457002808648
Episode: 308
	Score: 13.0, Epsilon: 0.21248679717794605
Episode: 309
	Score: 9.0, Epsilon: 0.21142436319205632
Episode: 310
	Score: 11.0, Epsilon: 0.21036724137609603
Episode: 311
	Score: 11.0, Epsilon: 0.20931540516921554
Episode: 312
	Score: 9.0, Epsilon: 0.20826882814336947
Episode: 313
	Score: 10.0, Epsilon: 0.20722748400265262
Episode: 314
	Score: 16.0, Epsilon: 0.20619134658263935
Episode: 315
	Score: 11.0, Epsilon: 0.20516038984972615
Episode: 316
	Score: 11.0, Epsilon: 0.2041345879004775
Episode: 317
	Score: 12.0, Epsilon: 0.2031139149609751
Episode: 318
	Score: 10.0, Epsilon: 0.20209834538617025
Episode: 319
	

	Score: 9.0, Epsilon: 0.1042820154910064
Episode: 451
	Score: 10.0, Epsilon: 0.10376060541355137
Episode: 452
	Score: 10.0, Epsilon: 0.1032418023864836
Episode: 453
	Score: 9.0, Epsilon: 0.10272559337455119
Episode: 454
	Score: 10.0, Epsilon: 0.10221196540767843
Episode: 455
	Score: 11.0, Epsilon: 0.10170090558064004
Episode: 456
	Score: 8.0, Epsilon: 0.10119240105273684
Episode: 457
	Score: 12.0, Epsilon: 0.10068643904747315
Episode: 458
	Score: 14.0, Epsilon: 0.10018300685223579
Episode: 459
	Score: 10.0, Epsilon: 0.0996820918179746
Episode: 460
	Score: 24.0, Epsilon: 0.09918368135888474
Episode: 461
	Score: 9.0, Epsilon: 0.09868776295209031
Episode: 462
	Score: 10.0, Epsilon: 0.09819432413732986
Episode: 463
	Score: 10.0, Epsilon: 0.09770335251664321
Episode: 464
	Score: 9.0, Epsilon: 0.09721483575406
Episode: 465
	Score: 10.0, Epsilon: 0.09672876157528969
Episode: 466
	Score: 8.0, Epsilon: 0.09624511776741324
Episode: 467
	Score: 11.0, Epsilon: 0.09576389217857617
Episode: 468
	Sco

	Score: 10.0, Epsilon: 0.04966213277390804
Episode: 599
	Score: 10.0, Epsilon: 0.0494138221100385
Episode: 600
	Score: 12.0, Epsilon: 0.04916675299948831
Episode: 601
	Score: 22.0, Epsilon: 0.04892091923449087
Episode: 602
	Score: 11.0, Epsilon: 0.04867631463831842
Episode: 603
	Score: 10.0, Epsilon: 0.048432933065126825
Episode: 604
	Score: 13.0, Epsilon: 0.048190768399801194
Episode: 605
	Score: 10.0, Epsilon: 0.04794981455780219
Episode: 606
	Score: 10.0, Epsilon: 0.04771006548501318
Episode: 607
	Score: 9.0, Epsilon: 0.047471515157588115
Episode: 608
	Score: 15.0, Epsilon: 0.047234157581800176
Episode: 609
	Score: 15.0, Epsilon: 0.046997986793891174
Episode: 610
	Score: 18.0, Epsilon: 0.04676299685992172
Episode: 611
	Score: 14.0, Epsilon: 0.04652918187562211
Episode: 612
	Score: 10.0, Epsilon: 0.046296535966244
Episode: 613
	Score: 14.0, Epsilon: 0.046065053286412784
Episode: 614
	Score: 16.0, Epsilon: 0.04583472801998072
Episode: 615
	Score: 12.0, Epsilon: 0.045605554379880814
Ep

	Score: 19.0, Epsilon: 0.023888845163905856
Episode: 745
	Score: 17.0, Epsilon: 0.023769400938086327
Episode: 746
	Score: 29.0, Epsilon: 0.023650553933395897
Episode: 747
	Score: 10.0, Epsilon: 0.023532301163728918
Episode: 748
	Score: 11.0, Epsilon: 0.023414639657910272
Episode: 749
	Score: 21.0, Epsilon: 0.023297566459620722
Episode: 750
	Score: 14.0, Epsilon: 0.023181078627322618
Episode: 751
	Score: 28.0, Epsilon: 0.023065173234186005
Episode: 752
	Score: 10.0, Epsilon: 0.022949847368015076
Episode: 753
	Score: 10.0, Epsilon: 0.022835098131175
Episode: 754
	Score: 9.0, Epsilon: 0.022720922640519125
Episode: 755
	Score: 9.0, Epsilon: 0.02260731802731653
Episode: 756
	Score: 20.0, Epsilon: 0.022494281437179946
Episode: 757
	Score: 29.0, Epsilon: 0.022381810029994047
Episode: 758
	Score: 20.0, Epsilon: 0.022269900979844076
Episode: 759
	Score: 10.0, Epsilon: 0.022158551474944856
Episode: 760
	Score: 12.0, Epsilon: 0.022047758717570132
Episode: 761
	Score: 52.0, Epsilon: 0.021937519923

	Score: 11.0, Epsilon: 0.01154893304942575
Episode: 890
	Score: 25.0, Epsilon: 0.011491188384178622
Episode: 891
	Score: 195.0, Epsilon: 0.011433732442257729
Episode: 892
	Score: 119.0, Epsilon: 0.01137656378004644
Episode: 893
	Score: 16.0, Epsilon: 0.011319680961146208
Episode: 894
	Score: 43.0, Epsilon: 0.011263082556340478
Episode: 895
	Score: 12.0, Epsilon: 0.011206767143558775
Episode: 896
	Score: 110.0, Epsilon: 0.011150733307840981
Episode: 897
	Score: 12.0, Epsilon: 0.011094979641301777
Episode: 898
	Score: 9.0, Epsilon: 0.011039504743095268
Episode: 899
	Score: 101.0, Epsilon: 0.01098430721937979
Episode: 900
	Score: 227.0, Epsilon: 0.010929385683282892
Episode: 901
	Score: 124.0, Epsilon: 0.010874738754866477
Episode: 902
	Score: 30.0, Epsilon: 0.010820365061092144
Episode: 903
	Score: 145.0, Epsilon: 0.010766263235786683
Episode: 904
	Score: 165.0, Epsilon: 0.01071243191960775
Episode: 905
	Score: 206.0, Epsilon: 0.010658869760009713
Episode: 906
	Score: 71.0, Epsilon: 0.01