In [None]:
# pip3 install gym==0.17.1

In [14]:
import random
from collections import deque
from itertools import count

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [15]:
env_name = "CartPole"

# Define hyperparameters
lr = 1e-4
GAMMA = 0.99
EXPLORE = 20000
INITIAL_EPSILON = 0.1
FINAL_EPSILON = 0.0001
REPLAY_MEMORY = 50000
BATCH = 16
UPDATE_STEPS = 4

# Initialize exploration parameter and other variables
epsilon = INITIAL_EPSILON
learn_steps = 0
begin_learn = False
episode_reward = 0

In [16]:
class QNetwork(nn.Module):
    def __init__(self, n_state=4, n_action=2, network_type=1):
        super(QNetwork, self).__init__()

        # Define network architecture
        self.fc1 = nn.Linear(n_state, 64)
        self.relu = nn.ReLU()
        self.fc_value = nn.Linear(64, 256)
        self.fc_adv = nn.Linear(64, 256)
        self.value = nn.Linear(256, 1)
        self.adv = nn.Linear(256, n_action)

        # Store the type of network (type 1 or type 2)
        self.network_type = network_type

    def forward(self, state):
        # Forward pass through the network
        y = self.relu(self.fc1(state))
        value = self.relu(self.fc_value(y))
        adv = self.relu(self.fc_adv(y))

        value = self.value(value)
        adv = self.adv(adv)

        adv_average = torch.mean(adv, dim=1, keepdim=True)
        adv_max = torch.max(adv, dim=1, keepdim=True)

        # Compute Q-values based on the network type
        if self.network_type == 1:
            Q = value + adv - adv_average
        elif self.network_type == 2:
            Q = value + adv - adv_max

        return Q
    
    def select_action(self, state):
        # Select action based on Q-values
        with torch.no_grad():
            Q = self.forward(state)
            action_index = torch.argmax(Q, dim=1)
        return action_index.item()


In [17]:
class Memory:
    def __init__(self, memory_size: int):
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience):
        self.buffer.append(experience)

    def size(self) -> int:
        return len(self.buffer)

    def sample(self, batch_size: int, continuous: bool = True):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)

        if continuous:
            start_index = random.randint(0, len(self.buffer) - batch_size)
            return [self.buffer[i] for i in range(start_index, start_index + batch_size)]
        else:
            indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
            return [self.buffer[i] for i in indexes]

    def clear(self):
        self.buffer.clear()

In [20]:
if env_name == "Acrobot":
    env = gym.make("Acrobot-v1")
elif env_name == "CartPole":
    env = gym.make("CartPole-v1")
    
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
print(n_state, n_action)
# Initialize Q-networks
onlineQNetwork = QNetwork(n_state=n_state, n_action=n_action).to(device)
targetQNetwork = QNetwork(n_state=n_state, n_action=n_action).to(device)
targetQNetwork.load_state_dict(onlineQNetwork.state_dict())

# Initialize optimizer
optimizer = torch.optim.Adam(onlineQNetwork.parameters(), lr=lr)

# Initialize memory replay buffer
memory_replay = Memory(REPLAY_MEMORY)

4 2


In [21]:
for epoch in range(600):

    state = env.reset()
    episode_reward = 0
    for time_steps in range(200):
        p = random.random()
        if p < epsilon:
            action = random.randint(0, n_action-1)
        else:
            tensor_state = torch.FloatTensor(state).unsqueeze(0).to(device)
            action = onlineQNetwork.select_action(tensor_state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        memory_replay.add((state, next_state, action, reward, done))
        if memory_replay.size() > 128:
            if begin_learn is False:
                print('learn begin!')
                begin_learn = True
            learn_steps += 1
            if learn_steps % UPDATE_STEPS == 0:
                targetQNetwork.load_state_dict(onlineQNetwork.state_dict())
            batch = memory_replay.sample(BATCH, False)
            batch_state, batch_next_state, batch_action, batch_reward, batch_done = zip(*batch)

            batch_state = torch.FloatTensor(batch_state).to(device)
            batch_next_state = torch.FloatTensor(batch_next_state).to(device)
            batch_action = torch.FloatTensor(batch_action).unsqueeze(1).to(device)
            batch_reward = torch.FloatTensor(batch_reward).unsqueeze(1).to(device)
            batch_done = torch.FloatTensor(batch_done).unsqueeze(1).to(device)

            with torch.no_grad():
                onlineQ_next = onlineQNetwork(batch_next_state)
                targetQ_next = targetQNetwork(batch_next_state)
                online_max_action = torch.argmax(onlineQ_next, dim=1, keepdim=True)
                y = batch_reward + (1 - batch_done) * GAMMA * targetQ_next.gather(1, online_max_action.long())

            loss = F.mse_loss(onlineQNetwork(batch_state).gather(1, batch_action.long()), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if epsilon > FINAL_EPSILON:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        if done:
            break
        state = next_state

    if epoch % 10 == 0:
        # torch.save(onlineQNetwork.state_dict(), 'ddqn-policy.para')
        print('Ep {}\tMoving average score: {:.2f}\t'.format(epoch, episode_reward))

Ep 0	Moving average score: 9.00	
Ep 10	Moving average score: 9.00	
learn begin!
Ep 20	Moving average score: 9.00	


  batch_state = torch.FloatTensor(batch_state).to(device)


Ep 30	Moving average score: 10.00	
Ep 40	Moving average score: 10.00	
Ep 50	Moving average score: 9.00	
Ep 60	Moving average score: 9.00	
Ep 70	Moving average score: 13.00	
Ep 80	Moving average score: 9.00	
Ep 90	Moving average score: 11.00	
Ep 100	Moving average score: 9.00	
Ep 110	Moving average score: 8.00	
Ep 120	Moving average score: 10.00	
Ep 130	Moving average score: 10.00	
Ep 140	Moving average score: 8.00	
Ep 150	Moving average score: 12.00	
Ep 160	Moving average score: 8.00	
Ep 170	Moving average score: 10.00	
Ep 180	Moving average score: 11.00	
Ep 190	Moving average score: 12.00	
Ep 200	Moving average score: 12.00	
Ep 210	Moving average score: 11.00	
Ep 220	Moving average score: 10.00	
Ep 230	Moving average score: 13.00	
Ep 240	Moving average score: 14.00	
Ep 250	Moving average score: 11.00	
Ep 260	Moving average score: 10.00	
Ep 270	Moving average score: 10.00	
Ep 280	Moving average score: 12.00	
Ep 290	Moving average score: 13.00	
Ep 300	Moving average score: 9.00	
Ep 310	

### Trying after Training

In [32]:
done = False
state = env.reset()
scores = []

for _ in range(100):
    state = env.reset()
    done = False
    score = 0
    while not done:
        # _ = env.render()
        # print(state)
        action = onlineQNetwork.select_action(torch.FloatTensor(state).unsqueeze(0).to(device))
        next_state, reward, done, _ = env.step(action)
        score += reward
        state = next_state
    scores.append(score)