# Solving CartPole-v1 with DQN

In [15]:
import random

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Create the CartPole-v1 environment that the cells below interact with.
env = gym.make("CartPole-v1")

In [9]:
# Sanity-check rollout: take 100 random steps, printing every observation
# and restarting the episode whenever the environment reports it is done.
observation = env.reset()

for step in range(100):
    env.render()
    # Placeholder policy: a randomly sampled action (swap in the agent here).
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)

    print(observation)

    if not done:
        continue

    # Episode ended: report it and start a fresh one.
    print("Episode finished")
    observation = env.reset()
env.close()

[-0.0228608  -0.18481121 -0.00115526  0.3098936 ]
[-0.02655703  0.01032718  0.00504261  0.01684656]
[-0.02635048  0.20537645  0.00537954 -0.27424111]
[-2.22429525e-02  4.00421238e-01 -1.05280207e-04 -5.65222474e-01]
[-0.01423453  0.59554467 -0.01140973 -0.85793857]
[-0.00232363  0.79082018 -0.0285685  -1.15418716]
[ 0.01349277  0.59608222 -0.05165224 -0.87059742]
[ 0.02541441  0.40169946 -0.06906419 -0.59459115]
[ 0.0334484   0.20760868 -0.08095602 -0.32443777]
[ 0.03760058  0.40378441 -0.08744477 -0.64151269]
[ 0.04567626  0.20998326 -0.10027502 -0.3775977 ]
[ 0.04987593  0.01641779 -0.10782698 -0.11813949]
[ 0.05020429  0.2129062  -0.11018977 -0.44280013]
[ 0.05446241  0.40940077 -0.11904577 -0.76808472]
[ 0.06265042  0.60594272 -0.13440747 -1.09572794]
[ 0.07476928  0.80255385 -0.15632202 -1.42738167]
[ 0.09082036  0.60967006 -0.18486966 -1.18735559]
[ 0.10301376  0.8066435  -0.20861677 -1.53182377]
[ 0.11914663  1.00357906 -0.23925324 -1.88171169]
Episode finished
[ 0.01659897  0.1

# Section 1 - Solving CartPole using DQN with an $\epsilon$-greedy policy

Checklist:
1. Objective function
2. Preprocess data
3. Samples generation

In [11]:
# Environment understanding: print the observation and action spaces.
for label, space in (
    ("State space", env.observation_space),
    ("Action space", env.action_space),
):
    print(label, space)

State space Box(4,)
Action space Discrete(2)


## 1. The policy network
The network takes in the state of the game and decides what we should do. 

For simplicity, we use a simple 2-layer NN that takes in the observations and then produces a single number indicating the probability of pushing LEFT or RIGHT. It is standard to use a stochastic policy, meaning that the NN will only produce a probability for each action. 

We are going to train our model with a single experience:
1. Let the model estimate Q values of the old state
2. Let the model estimate Q values of the new state
3. Calculate the new target Q value for the action, using the known reward
4. Train the model with input = (old state), output = (target Q values)

In [20]:
# Hyperparameters
# Number of experiences DQNAgent.learn samples from replay memory per call.
BATCH_SIZE = 32
# NOTE(review): presumably the optimizer step size; not referenced by the
# visible code (DQNAgent takes its own `learning_rate` argument) - confirm.
LEARNING_RATE = 0.01
# NOTE(review): presumably the epsilon-greedy parameter; DQNAgent takes its
# own `epsilon` argument, so confirm which value is actually used.
EPSILON = 0.9
# NOTE(review): presumably the reward discount factor; DQNAgent takes its
# own `gamma` argument - confirm.
GAMMA = 0.9
C = 100  # Update the network parameters every C iterations
MEMORY_CAPACITY = 2000  # Capacity (maxlen) of the experience replay memory

In [26]:
from collections import namedtuple, deque
from random import sample

In [32]:
# One replay-memory record with fields s, a, r, s_, done
# (presumably state, action, reward, next state, episode-finished flag - confirm).
Experience = namedtuple("Experience", "s a r s_ done")

In [33]:
# Scratch deque capped at 5 items; once full, each append drops the oldest item.
test_list = deque(maxlen=5)

In [34]:
# Push eight experiences in order: five identical records followed by three
# distinct ones. extend() appends them one at a time, so the deque's maxlen
# eviction behaves exactly as it would with individual append() calls.
test_list.extend(
    [Experience(1, 2, 1, 3, False)] * 5
    + [
        Experience(1, 2, 2, 4, False),
        Experience(1, 3, 1, 5, False),
        Experience(1, 4, 1, 6, False),
    ]
)

In [46]:
# Draw 2 records from test_list (without replacement) and transpose them into
# one tuple per Experience field.
s_ls, a_ls, r_ls, s__ls, done_list = zip(*sample(test_list, 2))

In [47]:
# Display the `s` fields of the two sampled records.
s_ls

(1, 1)

https://morvanzhou.github.io/tutorials/machine-learning/torch/4-05-DQN/

In [None]:
Experience = namedtuple("Experience", "s a r s_ done")


class DQNAgent:
    """
    A Deep Q-learning agent with experience replay and a target network.

    Two networks are built from the same factory: ``eval_net`` is trained on
    every call to :meth:`learn`, while ``target_net`` is a periodically
    refreshed copy used to compute the bootstrap targets.

    References:
        https://towardsdatascience.com/reinforcement-learning-tutorial-part-3-basic-deep-q-learning-186164c3bf4
        https://morvanzhou.github.io/tutorials/machine-learning/torch/4-05-DQN/
    """

    def __init__(self, env, network, learning_rate=0.1, gamma=0.95, epsilon=1.0,
                 memory_capacity=None, batch_size=None, target_update_every=None):
        """
        Build the networks, the optimizer and the replay memory.

        Args:
            env: Gym-style environment; only ``action_space`` (with ``n`` and
                ``sample()``) and ``observation_space.shape`` are read.
            network: Zero-argument callable returning an ``nn.Module`` that
                maps a batch of states to one Q value per action.
            learning_rate (float): Step size of the Adam optimizer.
            gamma (float): Discount applied to the next state's best Q value.
            epsilon (float): Probability of taking a random action.
            memory_capacity (int | None): Replay memory size; ``None`` falls
                back to the module-level ``MEMORY_CAPACITY``.
            batch_size (int | None): Experiences per learning step; ``None``
                falls back to the module-level ``BATCH_SIZE``.
            target_update_every (int | None): Learning steps between target
                network refreshes; ``None`` falls back to the module-level ``C``.
        """
        # Environment parameters
        self.action_space = env.action_space
        self.n_actions = env.action_space.n
        self.n_state = env.observation_space.shape[0]

        # NN - Q functions. The target network starts as an exact copy of the
        # evaluation network so the first targets are consistent with it.
        self.eval_net, self.target_net = network(), network()
        self.target_net.load_state_dict(self.eval_net.state_dict())
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)

        # Experience replay. Module constants are resolved lazily here (not in
        # the signature) so the class can be defined before they exist.
        if memory_capacity is None:
            memory_capacity = MEMORY_CAPACITY
        self.experience_memory = deque(maxlen=memory_capacity)
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.target_update_every = C if target_update_every is None else target_update_every

        # Other parameters
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.learn_step_counter = 0

    def choose_action(self, x):
        """
        Epsilon Greedy Policy
        if eps = 0 -> Greedy policy

        Args:
            x: State features, either a single state of shape ``(n_state,)``
                or a batch of shape ``(1, n_state)``; anything accepted by
                ``torch.as_tensor`` works.

        Returns:
            A random action from the action space with probability
            ``epsilon``; otherwise the index (int) of the action with the
            highest Q value for the first state in ``x``.
        """
        if random.uniform(0, 1) < self.epsilon:
            return self.action_space.sample()

        state = torch.as_tensor(x, dtype=torch.float32)
        if state.dim() == 1:
            # The network expects a batch dimension.
            state = state.unsqueeze(0)
        # Action selection needs no gradients.
        with torch.no_grad():
            actions_value = self.eval_net(state)
        return int(actions_value.argmax(dim=1)[0].item())

    def store_experience(self, s, a, r, s_pi, done=False):
        """
        Append one transition to the replay memory.

        Args:
            s: State before the action.
            a: Action taken.
            r: Reward received.
            s_pi: State after the action.
            done (bool): Whether the episode ended with this transition.
                Defaults to False so existing four-argument calls keep working.
        """
        self.experience_memory.append(Experience(s, a, r, s_pi, done))

    def learn(self):
        """
        Run one gradient step on a random mini-batch from the replay memory.

        Returns:
            The mini-batch loss as a float, or ``None`` when the memory does
            not yet hold ``batch_size`` experiences (nothing is updated then).
        """
        # random.sample raises ValueError when asked for more items than exist.
        if len(self.experience_memory) < self.batch_size:
            return None

        # Sample the data from experience memory
        sample_values = sample(self.experience_memory, self.batch_size)
        b_s, b_a, b_r, b_s_pi, b_done = zip(*sample_values)

        b_s_torch = torch.from_numpy(np.asarray(b_s, dtype=np.float32))
        # gather() needs int64 indices shaped (batch, 1).
        b_a_torch = torch.from_numpy(np.asarray(b_a, dtype=np.int64)).unsqueeze(1)
        b_r_torch = torch.from_numpy(np.asarray(b_r, dtype=np.float32)).unsqueeze(1)
        b_s_pi_torch = torch.from_numpy(np.asarray(b_s_pi, dtype=np.float32))
        b_done_torch = torch.from_numpy(np.asarray(b_done, dtype=np.float32)).unsqueeze(1)

        # Q value the evaluation network assigns to the action actually taken.
        q_eval = self.eval_net(b_s_torch).gather(1, b_a_torch)

        # Calculate the new Q value: reward plus the discounted best Q value of
        # the next state, with no bootstrap term for terminal transitions.
        with torch.no_grad():
            q_next = self.target_net(b_s_pi_torch).max(dim=1, keepdim=True)[0]
        q_target = b_r_torch + self.gamma * (1.0 - b_done_torch) * q_next

        loss = F.mse_loss(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Refresh the target network every `target_update_every` learning steps.
        self.learn_step_counter += 1
        if self.learn_step_counter % self.target_update_every == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        return loss.item()
        
    

In [18]:
def epsilon_greedy_policy(qs, env, eps):
    """
    Epsilon Greedy Policy
    if eps = 0 -> Greedy policy

    Args:
        qs: Sequence or array of Q values, one per action.
        env: Environment whose ``action_space.sample()`` supplies the random
            action; only touched on the exploration branch.
        eps (float): Probability of taking a random action.

    Returns:
        A sampled action with probability ``eps``; otherwise the index of the
        largest entry of ``qs`` as a plain ``int``.
    """
    if random.uniform(0, 1) < eps:
        return env.action_space.sample()
    # np.argmax returns a NumPy integer; convert so callers get a builtin int.
    return int(np.argmax(qs))

In [19]:
class Network(nn.Module):
    """
    Q-network: a two-layer fully connected net mapping a state to one
    Q value per action.

    Args:
        n_state (int): Number of state features (input width).
        n_hidden (int): Width of the hidden layer.
        n_actions (int): Number of actions (output width).
    """

    def __init__(self, n_state=4, n_hidden=12, n_actions=2):
        # Zero-argument super() avoids naming the class, which is what broke
        # the previous super(PolicyNetwork, self) call.
        super().__init__()

        self.fc1 = nn.Linear(n_state, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """
        Forward pass
        Essentially, the forward pass returns the Q values

        Args:
            x (torch.Tensor): States with ``n_state`` features in the last
                dimension.

        Returns:
            torch.Tensor: Raw (unbounded) Q values, ``n_actions`` per state.
        """
        x = F.relu(self.fc1(x))
        # No activation on the output layer: Q values are unbounded.
        x = self.fc2(x)

        return x
    
def loss_fn(output, labels):
    """
    Placeholder for the loss computation; not implemented yet.

    Args:
        output (Variable): model output
        labels (Variable): target values

    Returns:
        None: nothing is computed at the moment.
    """
    return None

def some_measurement(outputs, labels):
    """
    Placeholder for a performance measurement over outputs and labels;
    not implemented yet.

    Returns:
        None: nothing is computed at the moment.
    """
    return None

In [None]:
def train(old_state, action, reward, new_state):
    