In [1]:
import os
import tqdm

In [2]:
import gym
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
env = gym.make("LunarLander-v2").env
env.reset()
n_actions = env.action_space.n
state_dim = env.observation_space.shape

In [4]:
import theano
import theano.tensor as T
current_states = T.matrix("states[batch,units]")
actions = T.ivector("action_ids[batch]")
rewards = T.vector("rewards[batch]")
next_states = T.matrix("next states[batch,units]")
is_end = T.ivector("vector[batch] where 1 means that session just ended")

In [5]:
import lasagne
from lasagne.layers import *
l_states = InputLayer((None,)+state_dim)
nn = DenseLayer(l_states, 100, nonlinearity=lasagne.nonlinearities.rectify)
nn = DenseLayer(nn, 50, nonlinearity=lasagne.nonlinearities.rectify)
l_qvalues = DenseLayer(nn,num_units=n_actions,nonlinearity=None)

Предскажем qvalue для текущих состояний

In [6]:
predicted_qvalues = get_output(l_qvalues,{l_states:current_states})
get_qvalues = theano.function([current_states], T.argmax(predicted_qvalues, axis=1))
predicted_qvalues_for_actions = predicted_qvalues[T.arange(actions.shape[0]),actions]

loss function and update

In [7]:
predicted_next_qvalues = get_output(l_qvalues,{l_states:next_states})
gamma = 0.99
target_qvalues_for_actions = rewards + gamma * T.max(predicted_next_qvalues, axis=1)
target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions
target_qvalues_for_actions = theano.gradient.disconnected_grad(target_qvalues_for_actions)

In [8]:
loss = lasagne.objectives.squared_error(predicted_qvalues_for_actions, target_qvalues_for_actions)

In [9]:
all_weights = get_all_params(l_qvalues,trainable=True)
updates = lasagne.updates.adam(loss.mean(),all_weights,learning_rate=1e-3)

In [10]:
train_step = theano.function([current_states, actions, rewards, next_states, is_end], updates=updates, allow_input_downcast=True)

Playing the game

In [11]:
epsilon = 0.25 
def generate_session(t_max=1000):
    total_reward = 0
    s = env.reset()
    for t in range(t_max):
        q_values = get_qvalues(np.array([s],dtype=np.float32))[0] 
        rnd = np.random.uniform()
        if rnd < epsilon:
            a = np.random.choice(np.arange(n_actions))
        else:
            a = q_values
        new_s,r,done,info = env.step(a)
        train_step(np.array([s],dtype=np.float32),[a],[r],
                   np.array([new_s],dtype=np.float32),[done])
        total_reward+=r
        s = new_s
        if done: break
    return total_reward

In [12]:
from tqdm import tqdm

In [13]:
for i in tqdm(range(20)):
    rewards = [generate_session() for _ in range(30)] 
    epsilon*=0.95
    print ("reward:%.3f"%(np.mean(rewards)))
    if np.mean(rewards) > 300:
        print ("You Win!")
        break 
    assert epsilon!=0, "Please explore environment"

  5%|▌         | 1/20 [00:01<00:36,  1.91s/it]

reward:-229.088


 10%|█         | 2/20 [00:10<01:34,  5.22s/it]

reward:-218.565


 15%|█▌        | 3/20 [00:18<01:45,  6.23s/it]

reward:-182.731


 20%|██        | 4/20 [00:34<02:16,  8.51s/it]

reward:-124.675


 25%|██▌       | 5/20 [01:01<03:03, 12.25s/it]

reward:-91.655


 30%|███       | 6/20 [01:23<03:15, 13.99s/it]

reward:-78.438


 35%|███▌      | 7/20 [01:50<03:24, 15.73s/it]

reward:-30.160


 40%|████      | 8/20 [02:12<03:18, 16.51s/it]

reward:23.632


 45%|████▌     | 9/20 [02:31<03:05, 16.83s/it]

reward:49.311


 50%|█████     | 10/20 [02:52<02:52, 17.26s/it]

reward:22.680


 55%|█████▌    | 11/20 [03:12<02:37, 17.54s/it]

reward:49.426


 60%|██████    | 12/20 [03:33<02:22, 17.78s/it]

reward:71.335


 65%|██████▌   | 13/20 [03:49<02:03, 17.62s/it]

reward:52.868


 70%|███████   | 14/20 [04:18<01:50, 18.48s/it]

reward:91.157


 75%|███████▌  | 15/20 [04:43<01:34, 18.89s/it]

reward:15.034


 80%|████████  | 16/20 [05:04<01:16, 19.02s/it]

reward:136.494


 85%|████████▌ | 17/20 [05:19<00:56, 18.79s/it]

reward:93.796


 90%|█████████ | 18/20 [05:35<00:37, 18.61s/it]

reward:111.209


 95%|█████████▌| 19/20 [05:51<00:18, 18.48s/it]

reward:115.516


100%|██████████| 20/20 [06:14<00:00, 18.73s/it]

reward:136.124





In [18]:
for i in tqdm(range(20)):
    rewards = [generate_session() for _ in range(30)] 
    epsilon*=0.95
    print ("reward:%.3f"%(np.mean(rewards)))
    if np.mean(rewards) > 300:
        print ("You Win!")
        break 
    assert epsilon!=0, "Please explore environment"

  5%|▌         | 1/20 [00:13<04:18, 13.59s/it]

reward:119.263


 10%|█         | 2/20 [00:29<04:29, 14.95s/it]

reward:97.891


 15%|█▌        | 3/20 [00:41<03:53, 13.74s/it]

reward:147.143


 20%|██        | 4/20 [00:52<03:30, 13.17s/it]

reward:111.504


 25%|██▌       | 5/20 [01:10<03:31, 14.07s/it]

reward:129.228


 30%|███       | 6/20 [01:26<03:20, 14.34s/it]

reward:150.288


 35%|███▌      | 7/20 [01:41<03:07, 14.44s/it]

reward:156.783


 40%|████      | 8/20 [01:53<02:50, 14.21s/it]

reward:128.498


 45%|████▌     | 9/20 [02:09<02:37, 14.35s/it]

reward:139.925


 50%|█████     | 10/20 [02:25<02:25, 14.56s/it]

reward:181.754


 55%|█████▌    | 11/20 [02:39<02:10, 14.46s/it]

reward:160.153


 60%|██████    | 12/20 [02:47<01:51, 13.99s/it]

reward:113.205


 65%|██████▌   | 13/20 [02:59<01:36, 13.79s/it]

reward:146.414


 70%|███████   | 14/20 [03:10<01:21, 13.61s/it]

reward:146.545


 75%|███████▌  | 15/20 [03:22<01:07, 13.50s/it]

reward:164.601


 80%|████████  | 16/20 [03:32<00:53, 13.26s/it]

reward:151.680


 85%|████████▌ | 17/20 [03:43<00:39, 13.17s/it]

reward:133.900


 90%|█████████ | 18/20 [03:53<00:25, 12.99s/it]

reward:122.855


 95%|█████████▌| 19/20 [04:06<00:12, 12.95s/it]

reward:101.556


100%|██████████| 20/20 [04:16<00:00, 12.83s/it]

reward:167.580





In [19]:
import pickle
def save_network(filename,param_values):
    f = open(filename, 'wb')
    pickle.dump(param_values, f)
    f.close()
save_network("model.npz", get_all_param_values(l_qvalues))

In [21]:
def load_network(filename):
    f = open(filename, 'rb')
    param_values = pickle.load(f)
    f.close()
    return param_values

saved_params = load_network("model.npz")
lasagne.layers.set_all_param_values(nn, saved_params)

updates = lasagne.updates.adam(loss.mean(), network, learning_rate=1e-3)
train_step = theano.function([current_states, actions, rewards, next_states, is_end],
                             updates=updates, allow_input_downcast=True)