In [1]:
import gym
import numpy as np

In [2]:
env = gym.make("Taxi-v2")

n_actions = env.action_space.n

[2017-10-22 01:10:49,983] Making new env: Taxi-v2


In [3]:
#start new game
env.reset();

# display the game state
env.render()

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



rendering:
    - blue: passenger
    - magenta: destination
    - yellow: empty taxi
    - green: full taxi
    - other letters: locations

In [5]:
print("observations:", env.observation_space, 'n =', env.observation_space.n)
print("actions:", env.action_space, 'n =', env.action_space.n)

observations: Discrete(500) n = 500
actions: Discrete(6) n = 6


Можно для начала немного поиграть :)

Список имеющихся состояний: "South", "North", "East", "West", "Pickup", "Dropoff"

In [76]:
#start new game
env.reset();

# display the game state
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [97]:
new_obs, reward, is_done, _ = env.step(5)
print("new observation code:", new_obs)
print("reward:", reward)
print("is game over?:", is_done)
print("printing new state:")
env.render()

new observation code: 97
reward: 20
is game over?: True
printing new state:
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)


Теперь попробуем программу научить делать то же самое...

In [112]:
env.action_space.sample()  # calculate random action

5

In [114]:
n_states = env.observation_space.n
print("n_states: ", n_states)
n_actions = env.action_space.n
print("n_actions: ", n_actions)

n_states:  500
n_actions:  6


In [119]:
# Пусть policy - это будет последовательность шагов
def get_random_policy(max_steps=100):
    """
    Build a numpy array representing agent policy.
    This array must have one element per each of 16 environment states.
    Element must be an integer from 0 to 3, representing action
    to take from that state.
    """
    
    # Для каждой из 16 клеток будем хранить, действие, которое надо совершить, если находимося в этой клетке
    
    return np.random.randint(0, 6, max_steps)

In [120]:
get_random_policy(20)

array([2, 3, 5, 1, 4, 0, 3, 3, 1, 2, 0, 3, 4, 2, 3, 3, 3, 5, 3, 3])

In [167]:
def sample_reward(env, policy, max_steps=100):
    """
    Interact with an environment, return sum of all rewards.
    If game doesn't end on t_max (e.g. agent walks into a wall), 
    force end the game and return whatever reward you got so far.
    Tip: see signature of env.step(...) method above.
    """
    # max_steps - ограничение итераций на игру (ограничение на длительность игры)
    # s - изначальное состояние
    s = env.reset()
    #print(env.render())
    total_reward = 0
    is_done = False
    
    while not is_done and max_steps > 0:
        # возвращает новое состояние, награду и окончена ли игра
        s, r, is_done, _ = env.step(policy[-max_steps])
        total_reward = r
        max_steps -= 1
    
    return total_reward, is_done#, env.render()

In [168]:
sample_reward(env, get_random_policy(100), max_steps=100)

(-1, False)

In [169]:
def evaluate(policy, n_times=100):
    """Run several evaluations and average the score the policy gets."""
    # Смотрим, как часто добирались до выхода)
    rewards = [sample_reward(env, policy)[0] for _ in range(n_times)]
    return float(np.mean(rewards))
        

In [196]:
evaluate(get_random_policy())

-0.79

Попробуем генетический алгоритм.

In [154]:
def crossover(policy1, policy2, p=0.5):
    """
    for each state, with probability p take action from policy1, else policy2
    """
    assert(len(policy1) == len(policy2))
    # Берём 2 разные policy и смешиваем их с вероятностью p
    r = np.random.random_sample(len(policy1))
    return policy1 * (r <= p) + policy2 * (r > p)

In [206]:
def mutation(policy, p=0.1):
    """
    for each state, with probability p replace action with random action
    Tip: mutation can be written as crossover with random policy
    """
    # p значений поменяем на другие случайные
    return crossover(policy, get_random_policy(len(policy)), 1 - p)


In [208]:
n_epochs = 500 #how many cycles to make
pool_size = 100 #how many policies to maintain (храним 100 самых крутых стратегий на данный момент)
n_crossovers = 50 #how many crossovers to make on each step (добавляем 50 corss)
n_mutations = 50 #how many mutations to make on each tick (добавляем 50 мутир)
# обрезаем худшие


In [209]:
print("initializing...")
pool = [get_random_policy(2000) for _ in range(pool_size)] # spawn a list of pool_size random policies
pool_scores = [evaluate(p) for p in pool]# evaluate every policy in the pool, return list of scores

initializing...


In [210]:
#main loop
for epoch in range(n_epochs):
    print("Epoch %s:"%epoch)
    
    crossovered = [crossover(random.choice(pool), random.choice(pool))
                   for _ in range(n_crossovers)]# crossover random guys from pool, n_crossovers total
    mutated = [mutation(random.choice(pool)) 
               for _ in range(n_mutations)]# add several new policies at random, n_mutations total
    
    assert type(crossovered) == type(mutated) == list
    
    #add new policies to the pool
    pool += crossovered + mutated # add up old population with crossovers/mutations
    # evaluate all policies again
    pool_scores += [evaluate(p) for p in crossovered] + [evaluate(p) for p in mutated]
    
    #select pool_size best policies
    selected_indices = np.argsort(pool_scores)[-pool_size:]
    pool = [pool[i] for i in selected_indices]
    pool_scores = [pool_scores[i] for i in selected_indices]

    #print the best policy so far (last in ascending score order)
    print("best score:", pool_scores[-1])
    #print_policy(pool[-1])

Epoch 0:
best score: 1.31
Epoch 1:
best score: 2.78
Epoch 2:
best score: 4.88
Epoch 3:
best score: 4.88
Epoch 4:
best score: 4.88
Epoch 5:
best score: 4.88
Epoch 6:
best score: 4.88
Epoch 7:
best score: 4.88
Epoch 8:
best score: 4.88
Epoch 9:
best score: 4.88
Epoch 10:
best score: 4.88
Epoch 11:
best score: 4.88
Epoch 12:
best score: 5.3
Epoch 13:
best score: 5.3
Epoch 14:
best score: 5.3
Epoch 15:
best score: 5.3
Epoch 16:
best score: 5.3
Epoch 17:
best score: 5.3
Epoch 18:
best score: 5.3
Epoch 19:
best score: 5.3
Epoch 20:
best score: 5.51
Epoch 21:
best score: 5.51
Epoch 22:
best score: 5.51
Epoch 23:
best score: 5.51
Epoch 24:
best score: 5.51
Epoch 25:
best score: 5.51
Epoch 26:
best score: 5.51
Epoch 27:
best score: 5.51
Epoch 28:
best score: 5.51
Epoch 29:
best score: 5.51
Epoch 30:
best score: 5.51
Epoch 31:
best score: 5.51
Epoch 32:
best score: 5.51
Epoch 33:
best score: 6.35
Epoch 34:
best score: 6.35
Epoch 35:
best score: 6.35
Epoch 36:
best score: 6.35
Epoch 37:
best scor

In [203]:
import random