# Importing dependencies

In [1]:
import gym

import numpy as np
import random
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [2]:
# Run on the GPU when PyTorch can see a CUDA device, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


# Model definition

We can feed the environment's state variables (cart position, cart velocity, pole angle and pole angular velocity) directly into the network as its input.

In [3]:
class DQN(nn.Module):
    """Small fully connected network that estimates one Q-value per action.

    Args:
        input_size: Number of features in a single state vector.
        output_size: Number of actions; the network emits one value for each.
    """

    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        self.fn1 = nn.Linear(input_size, 64)
        self.fn2 = nn.Linear(64, 64)
        self.fn3 = nn.Linear(64, output_size)

    def forward(self, x):
        """Return the Q-value estimates for the state(s) in ``x``.

        Args:
            x: Tensor whose last dimension has ``input_size`` features.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size``.
        """
        x = F.relu(self.fn1(x))
        x = F.relu(self.fn2(x))
        # The output layer is deliberately left linear: Q-values are
        # unbounded regression targets, and a ReLU here would clamp them to
        # be non-negative and kill the gradient of any output stuck at zero.
        return self.fn3(x)


# Memory
The agent stores its past experiences (state, action, next state, reward, done flag) in a replay memory so that it can sample them later and learn from them.

In [4]:
# One stored transition: the state acted in, the action taken, the state that
# followed, the reward received and whether the episode ended.
Experience = namedtuple(
    'Experience',
    ['curr_state', 'action', 'next_state', 'reward', 'is_done'],
)


class Memory(object):
    """Bounded replay buffer of ``Experience`` records.

    Once ``capacity`` records are stored, appending a new one drops the
    oldest.
    """

    def __init__(self, capacity):
        # A deque with maxlen evicts from the left automatically when full.
        self.memory = deque(maxlen=capacity)

    def remember(self, *args):
        """Store one transition; ``args`` are the ``Experience`` fields in order."""
        transition = Experience(*args)
        self.memory.append(transition)

    def recall(self, batch_size):
        """Sample ``batch_size`` distinct transitions uniformly at random.

        Returns a single ``Experience`` whose fields are tuples holding the
        corresponding field of every sampled transition.
        """
        sampled = random.sample(self.memory, batch_size)
        # Transpose a list of records into a record of per-field tuples.
        columns = zip(*sampled)
        return Experience(*columns)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Creating the agent

In [9]:
class DQNAgent:
    """Epsilon-greedy deep Q-learning agent with an experience replay memory.

    The agent owns a ``DQN`` model, an Adam optimizer and a ``Memory`` buffer.
    ``train`` runs episodes against an environment, stores every transition
    and performs one optimisation step per environment step.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` and ``step(action)``; ``step`` is
            unpacked as ``(next_state, reward, is_done, info)``.
    """

    def __init__(self, env):

        self.state_size = env.observation_space.shape[0] # input size
        self.action_size = env.action_space.n # output size

        # Keep the device on the agent so every tensor built later lands on
        # the same device as the model.
        self.device = device
        self.model = DQN(self.state_size, self.action_size).to(self.device)

        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        self.exploration_rate = 1 # initial exploration rate, always leave at 1
        self.exploration_rate_decay = 0.9999 # factor applied to the exploration rate on every call to act()
        self.exploration_rate_min = 0.1 # minimum exploration rate

        self.gamma = 0.99 # discount factor applied to the next state's Q-value

        self.batch_size = 64
        self.num_epochs = 200

        self.memory = Memory(10000) # capacity of the replay memory

    def act(self, state):
        """Pick an action epsilon-greedily and decay the exploration rate.

        Args:
            state: Batched state tensor, as accepted by ``act_ideal``.

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() < self.exploration_rate:
            action = random.randrange(self.action_size) # act randomly
        else:
            action = self.act_ideal(state) # act ideally

        # Decay on every call, whichever branch was taken, but never below
        # the configured floor.
        self.exploration_rate = max(
            self.exploration_rate_min,
            self.exploration_rate * self.exploration_rate_decay,
        )

        return action

    def act_ideal(self, state):
        """Return the index of the action with the highest predicted Q-value.

        Args:
            state: Tensor holding exactly one state with a leading batch
                dimension (the maximum is taken over dim 1 and converted
                with ``.item()``).
        """
        # Action selection never needs gradients.
        with torch.no_grad():
            return self.model(state).max(1)[1].item()

    def train_step(self):
        """Run one optimisation step on a random batch from the replay memory.

        Does nothing until the memory holds at least ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = self.memory.recall(self.batch_size)

        # States are stored as tensors, so stack them into a batch. The
        # squeeze only has an effect when dim 1 has size 1.
        curr_states = torch.stack(batch.curr_state).squeeze(1)
        next_states = torch.stack(batch.next_state).squeeze(1)
        actions = torch.Tensor(batch.action).to(self.device)  # one-hot rows
        rewards = torch.Tensor(batch.reward).to(self.device)
        is_dones = torch.Tensor(batch.is_done).to(self.device)  # bools become 0./1.

        # Q(s, a) for the action actually taken: the one-hot mask zeroes the
        # other actions' values before summing over the action dimension.
        curr_Q = self.model(curr_states).mul(actions).sum(1)

        # The bootstrap target is treated as a constant. Without no_grad the
        # loss would also back-propagate through the target, pulling the
        # target towards the prediction instead of only the reverse.
        with torch.no_grad():
            next_Q = self.model(next_states).max(1)[0]
            # Terminal transitions (is_done == 1) contribute only the reward.
            expected_Q = rewards + (1 - is_dones) * self.gamma * next_Q

        loss = self.loss_fn(curr_Q, expected_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, env, print_epochs=False):
        """Play ``num_epochs`` episodes, learning after every environment step.

        Args:
            env: Environment to interact with (see the class docstring).
            print_epochs: When true, print the number of steps each episode
                lasted.
        """
        for epoch in range(self.num_epochs):
            is_done = False
            curr_state = torch.Tensor(env.reset()).to(self.device)
            score = 0  # number of steps survived in this episode

            while not is_done:

                action = self.act(curr_state.unsqueeze(0))
                next_state, reward, is_done, _ = env.step(action)
                next_state = torch.Tensor(next_state).to(self.device)
                # One-hot encode over the real number of actions rather than
                # a hard-coded 2, so larger action spaces work too.
                action_encode = np.eye(self.action_size)[action].tolist()
                self.memory.remember(curr_state, action_encode, next_state, reward, is_done)

                self.train_step()

                # Advance to the new observation. Without this the agent
                # keeps acting on, and storing, the reset state for the
                # whole episode.
                curr_state = next_state
                score += 1

            if print_epochs:
                print("Epoch: " + str(epoch + 1) + ". Score is: " + str(score))


In [10]:
# Create the CartPole environment, build a fresh agent for it and train,
# printing the score reached in every episode.
env = gym.make("CartPole-v0")
agent = DQNAgent(env)
agent.train(env=env, print_epochs=True)

Epoch: 1. Score is: 30
Epoch: 2. Score is: 16
Epoch: 3. Score is: 18
Epoch: 4. Score is: 18
Epoch: 5. Score is: 11
Epoch: 6. Score is: 18
Epoch: 7. Score is: 10
Epoch: 8. Score is: 13
Epoch: 9. Score is: 16
Epoch: 10. Score is: 17
Epoch: 11. Score is: 27
Epoch: 12. Score is: 23
Epoch: 13. Score is: 14
Epoch: 14. Score is: 19
Epoch: 15. Score is: 45
Epoch: 16. Score is: 37
Epoch: 17. Score is: 16
Epoch: 18. Score is: 32
Epoch: 19. Score is: 19
Epoch: 20. Score is: 28
Epoch: 21. Score is: 23
Epoch: 22. Score is: 19
Epoch: 23. Score is: 29
Epoch: 24. Score is: 15
Epoch: 25. Score is: 15
Epoch: 26. Score is: 28
Epoch: 27. Score is: 13
Epoch: 28. Score is: 12
Epoch: 29. Score is: 19
Epoch: 30. Score is: 68
Epoch: 31. Score is: 18
Epoch: 32. Score is: 16
Epoch: 33. Score is: 40
Epoch: 34. Score is: 14
Epoch: 35. Score is: 20
Epoch: 36. Score is: 27
Epoch: 37. Score is: 13
Epoch: 38. Score is: 11
Epoch: 39. Score is: 29
Epoch: 40. Score is: 21
Epoch: 41. Score is: 35
Epoch: 42. Score is: 19
E