# Approximate q-learning

In this notebook you will teach a PyTorch neural network to do Q-learning.

__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend that you stick to theano/lasagne unless you're certain about your skills in the framework of your choice. (This particular notebook is implemented in PyTorch.)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [2]:
import gym
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Connect to a Crayon server on localhost and create the experiment object
# that generate_session() logs its per-episode scalars to.
from pycrayon import CrayonClient
client = CrayonClient(hostname='localhost')
crayon = client.create_experiment('pytorch-3')

In [5]:
# Create the CartPole environment. `.env` takes the inner environment of the
# object returned by gym.make -- presumably to drop gym's episode step limit
# (TODO confirm for the installed gym version).
env = gym.make("CartPole-v0").env
env.reset()
# Sizes the Q-network is built from: number of discrete actions and the
# shape of a single observation.
n_actions = env.action_space.n
state_dim = env.observation_space.shape

#plt.imshow(env.render("rgb_array"))

[2017-04-21 16:49:07,142] Making new env: CartPole-v0


In [6]:
# Use the GPU only when one is actually available, so the `.cuda()` calls
# guarded by this flag do not crash the notebook on CPU-only machines.
CUDA = torch.cuda.is_available()
batch_size = 4  # NOTE(review): not referenced by any code visible in this notebook
L1_size = 50    # width of the hidden layer of the Q-network

gamma = 0.99    # discount factor applied to the next state's best Q-value

In [7]:
# Sanity check: show the action count and observation shape the network is sized from.
print("n_actions={}, state_dim={}".format(n_actions, state_dim))

n_actions=2, state_dim=(4,)


# Approximate (deep) Q-learning: building the network

In this section we will build and train naive Q-learning with PyTorch.

The first step is defining the network that maps a state to one Q-value per action.

In [8]:
class Net(nn.Module):
    """Fully connected Q-network: flattened state -> L1_size ReLU units -> one output per action."""

    def __init__(self):
        super(Net, self).__init__()
        # state_dim is a shape tuple; flatten it into a single input width
        n_inputs = int(np.prod(state_dim))
        self.fc_1 = nn.Linear(n_inputs, L1_size)
        self.fc_out = nn.Linear(L1_size, n_actions)

    def forward(self, x):
        """Return the raw (un-activated) output of the final layer for input batch `x`."""
        hidden = F.relu(self.fc_1(x))
        return self.fc_out(hidden)

In [9]:
# Instantiate the Q-network; when CUDA is set, move its parameters to the GPU
# (this happens before the optimizer is built from model.parameters()).
model = Net()
if CUDA:
    model.cuda()
print(model)

Net (
  (fc_1): Linear (4 -> 50)
  (fc_out): Linear (50 -> 2)
)


In [10]:
def states_to_var(s):
    """Convert a batch of states into a float32 autograd Variable.

    s: array-like accepted by np.array (e.g. a list of observations).
    Returns a Variable wrapping a tensor built from the numpy array; callers
    are responsible for moving it to the GPU if needed.
    """
    batch = np.array(s, dtype=np.float32)
    tensor = torch.from_numpy(batch)
    return Variable(tensor, volatile=False)

In [11]:
# Adam over all parameters of the Q-network; generate_session() steps it once per environment step.
optimizer = optim.Adam(model.parameters(), lr=0.0005)

### Playing the game

In [12]:
def choose_action(q_values, epsilon):
    """Epsilon-greedy action selection.

    q_values: network output for a single state, indexed as (batch, action).
    epsilon: probability of picking a uniformly random action instead of the
        greedy one.
    Returns a tensor-like holding the action index; the caller extracts the
    plain integer from it.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # max over the action dimension returns (values, indices); keep the indices
        _, best_action = q_values.max(dim=1)
        return best_action.data[0]
    return torch.LongTensor([np.random.choice(n_actions)])

In [17]:
epsilon = 0.25  # initial exploration rate; decayed by the training loop


def generate_session(t_max=1000):
    """Play one episode with the epsilon-greedy Q-network, training it after every step.

    t_max: maximum number of environment steps before the episode is cut off.
    Returns the sum of rewards collected during the episode.

    Side effects: performs one optimizer step per environment step and logs the
    episode's mean Q-value, total reward and mean loss to the crayon experiment.
    """

    def _as_model_input(state):
        # wrap a single state into a batch of one, on the device the model uses
        batch = states_to_var([state])
        return batch.cuda() if CUDA else batch

    episode_return = 0
    state = env.reset()
    step_losses = []
    step_q_means = []

    for _ in range(t_max):
        q = model(_as_model_input(state))
        action = choose_action(q, epsilon).cpu()[0]
        next_state, reward, done, _info = env.step(action)

        optimizer.zero_grad()

        # the target is treated as a constant: no gradient flows through the next-state Q-values
        next_q = model(_as_model_input(next_state)).detach()
        q_taken = q[:, action]
        # terminal transitions have no future value to bootstrap from
        target = reward if done else reward + gamma * next_q.max(dim=1)[0]
        loss = (target - q_taken) ** 2

        loss.backward()
        optimizer.step()
        step_losses.append(loss.data.cpu().numpy()[0])
        step_q_means.append(q.mean().data.cpu()[0])

        episode_return += reward
        state = next_state
        if done:
            break

    crayon.add_scalar_value("q_mean", float(np.mean(step_q_means)))
    crayon.add_scalar_value("reward", episode_return)
    crayon.add_scalar_value("loss", float(np.mean(step_losses)))

    return episode_return
        

In [18]:
# Train for up to 100 epochs of 100 sessions each, decaying exploration by 5% per epoch.
for i in range(100):
    rewards = [generate_session() for _ in range(100)]  # generate new sessions
    mean_reward = np.mean(rewards)

    epsilon *= 0.95

    print("%d: mean reward:%.3f\tepsilon:%.5f" % (i, mean_reward, epsilon))

    # stop early once the agent is good enough
    if mean_reward > 300:
        print("You Win!")
        break

    assert epsilon != 0, "Please explore environment"

0: mean reward:11.070	epsilon:0.23750
1: mean reward:10.490	epsilon:0.22562
2: mean reward:10.960	epsilon:0.21434
3: mean reward:12.970	epsilon:0.20363
4: mean reward:15.120	epsilon:0.19345
5: mean reward:12.600	epsilon:0.18377
6: mean reward:12.360	epsilon:0.17458
7: mean reward:17.460	epsilon:0.16586
8: mean reward:11.340	epsilon:0.15756
9: mean reward:20.260	epsilon:0.14968
10: mean reward:17.560	epsilon:0.14220
11: mean reward:21.050	epsilon:0.13509
12: mean reward:20.510	epsilon:0.12834
13: mean reward:21.100	epsilon:0.12192
14: mean reward:27.650	epsilon:0.11582
15: mean reward:37.940	epsilon:0.11003
16: mean reward:42.640	epsilon:0.10453
17: mean reward:57.300	epsilon:0.09930
18: mean reward:81.880	epsilon:0.09434
19: mean reward:96.530	epsilon:0.08962
20: mean reward:71.350	epsilon:0.08514
21: mean reward:44.630	epsilon:0.08088
22: mean reward:135.510	epsilon:0.07684
23: mean reward:113.680	epsilon:0.07300
24: mean reward:112.680	epsilon:0.06935
25: mean reward:104.170	epsilon:

### Video

In [None]:
# Act greedily from here on (used for the recorded sessions below).
# Don't forget to reset epsilon back to its initial value if you want to go on
# training: the training loop asserts epsilon != 0.
epsilon=0

In [None]:
# record sessions
import gym.wrappers

# Keep a handle on the un-monitored environment so it can be restored
# afterwards without guessing how many wrapper layers Monitor added
# (a hard-coded `env.env.env` depends on the gym version).
base_env = env
# generate_session() reads the global `env`, so rebinding it routes the
# sessions below through the recorder.
env = gym.wrappers.Monitor(base_env, directory="videos", force=True)
try:
    sessions = [generate_session() for _ in range(100)]
finally:
    # always close the recorder so video files are finalized, even on failure
    env.close()
    # unwrap: go back to the environment used before recording
    env = base_env
# upload to gym
#gym.upload("./videos/",api_key="<your_api_key>") #you'll need me later

# Warning! If you keep seeing an error that reads something like "DoubleWrapError",
# run env=gym.make("CartPole-v0");env.reset();

In [None]:
# show video
from IPython.display import HTML
import os

# os.listdir returns names in arbitrary order; sort them so that [-1] is a
# deterministic choice (the lexicographically last recording).
video_names = sorted(
    name for name in os.listdir("./videos/") if name.endswith(".mp4")
)
assert video_names, "No .mp4 files found in ./videos/ - run the recording cell first"

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) # try other indices to view other recordings