In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tensorboardX import SummaryWriter


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.animation as manimation
import numpy as np
import copy

In [3]:
from helpers.replay_buffer import ReplayBuffer
from helpers.chain_environment import SimpleChain
from helpers.shedules import LinearSchedule

In [4]:
def convert_to_var(arr, astype='float32', add_dim=False):
    """Wrap array data in a torch ``Variable`` after casting it to ``astype``.

    Args:
        arr: data to convert. Without ``add_dim`` it must provide ``.astype``
            (e.g. a numpy array).
        astype: numpy dtype name used for the cast.
        add_dim: when true, ``arr`` is first nested in a one-element list so
            the result gains a leading axis of length 1.

    Returns:
        A ``Variable`` built from the cast numpy data.
    """
    source = np.array([arr]) if add_dim else arr
    return Variable(torch.from_numpy(source.astype(astype)))

In [5]:
class DQNnet(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        num_actions: width of the output layer.
        input_dim: width of the input layer.
        hidden_size: width of the single hidden layer.
    """

    def __init__(self, num_actions, input_dim, hidden_size=20):
        super().__init__()
        # Same layer order as before, so parameters keep the names net.0.* / net.2.*.
        layers = [
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input ``x``."""
        return self.net(x)

In [6]:
def check_model_copying():
    """Sanity-check that a deep-copied target network is decoupled from the model.

    Builds a ``DQNnet`` and a ``copy.deepcopy`` of it, verifies both give the
    same output on one environment state, performs a single RMSprop step on
    the original model only, and then verifies that the model's outputs moved
    while the copy's outputs did not.

    Reads the notebook-level globals ``num_actions``, ``input_dim`` and
    ``chain_env``.

    Raises:
        AssertionError: if any of the three checks fails.
    """
    # define models
    model = DQNnet(num_actions=num_actions, input_dim=input_dim)
    target_model = copy.deepcopy(model)
    # create input variable
    s = chain_env.reset()
    s_var = convert_to_var(s, add_dim=True)
    # eval outputs of models on this variable
    model_output = model(s_var)
    target_model_output = target_model(s_var).detach()  # detach stops gradients
    # check that both modules give the same outputs
    np.testing.assert_array_equal(model_output.data.numpy(),
                                  target_model_output.data.numpy())
    # define everything necessary for an optimization step and take the step
    mse_loss_func = nn.MSELoss()
    loss = mse_loss_func(model_output, target_model_output + 10)
    optimizer = torch.optim.RMSprop(model.parameters())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # eval outputs of models on the same variable after the step
    upd_model_output = model(s_var)
    upd_target_model_output = target_model(s_var).detach()
    # check that parameters of model have been updated: no output may be unchanged.
    # (Previously this compared against the bound method `.numpy` without calling
    # it, so the check could never fail.)
    assert not np.any(model_output.data.numpy() == upd_model_output.data.numpy())
    # check that parameters of target model have not been updated
    np.testing.assert_array_equal(upd_target_model_output.data.numpy(),
                                  target_model_output.data.numpy())

In [7]:
def optimize_dqn_loss(optimizer, model, target_model, batch, gamma,
                      target_type='standard', tau=None):
    """Run one optimisation step on the one-step TD (DQN) loss.

    Args:
        optimizer: torch optimizer holding ``model``'s parameters.
        model: network being trained; called on the batch of states.
        target_model: network used (without gradients) to evaluate next states.
        batch: tuple ``(states, actions, rewards, next_states, dones)`` of
            numpy arrays sharing the same first (batch) dimension; ``actions``
            must be 1-D.
        gamma: discount factor applied to the bootstrapped next-state value.
        target_type: currently unused.
        tau: currently unused.

    Returns:
        The scalar loss value as a Python float.
    """
    states_batch, actions_batch, rewards_batch, next_states_batch, dones_batch = batch
    states_batch_var = convert_to_var(states_batch)
    actions_batch_var = convert_to_var(actions_batch[:, np.newaxis], astype='int64')
    rewards_batch_var = convert_to_var(rewards_batch)
    next_states_batch_var = convert_to_var(next_states_batch)
    dones_batch_var = convert_to_var(dones_batch)

    # gather() yields shape (B, 1); squeeze to (B,) so it lines up with the targets.
    # Without this MSELoss broadcasts (B, 1) against (B,) into a (B, B) matrix and
    # every prediction is regressed towards every target in the batch.
    q_values = model(states_batch_var).gather(1, actions_batch_var).squeeze(1)

    next_q_values = target_model(next_states_batch_var).detach().max(dim=1)[0]
    # Terminal transitions do not bootstrap from the next state.
    next_q_values = next_q_values.masked_fill(dones_batch_var > 0, 0.0)
    q_values_targets = rewards_batch_var + gamma * next_q_values

    mse_loss_func = nn.MSELoss()
    loss = mse_loss_func(q_values, q_values_targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # `loss` is a 0-dim tensor; `.data[0]` raises on PyTorch >= 0.4.
    return loss.item()

In [8]:
import os, shutil

def clear_folder(folder):
    """Delete every regular file directly inside ``folder``.

    Sub-directories and their contents are left untouched. If ``folder`` does
    not exist (or is not a directory) there is nothing to clear and the
    function returns without raising, so it is safe to call on a first run
    before the directory has been created.

    Args:
        folder: path of the directory to empty.
    """
    if not os.path.isdir(folder):
        return
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        try:
            if os.path.isfile(entry_path):
                os.unlink(entry_path)
        except OSError as err:
            # Best effort: report the file that could not be removed and carry on.
            print(err)

In [9]:
def plot_amd_log_images(state_action_count, all_q_values, t, folder):
    """Save a two-panel snapshot (visitation frequencies, Q values) as ``<folder>/<t>.png``.

    Args:
        state_action_count: 2-D array of visit counts; it is divided by its
            total so the left panel shows fractions on a fixed 0..1 colour scale.
        all_q_values: 2-D array shown as an image in the right panel.
        t: step index; used as the file name. When ``t == 0`` the files already
            in ``folder`` are removed first.
        folder: output directory; created if it does not exist yet.
    """
    # Make sure the target directory exists before clearing it or saving into it.
    os.makedirs(folder, exist_ok=True)
    if t == 0:
        clear_folder(folder)

    fig, (visits_ax, q_ax) = plt.subplots(1, 2, figsize=(10, 6))

    visits_img = visits_ax.imshow(state_action_count / state_action_count.sum(),
                                  vmin=0, vmax=1)
    fig.colorbar(visits_img, ax=visits_ax)
    visits_ax.set_title('S-A Visitations')

    q_img = q_ax.imshow(all_q_values)
    fig.colorbar(q_img, ax=q_ax)
    q_ax.set_title('Q values')

    fig.savefig(os.path.join(folder, '{}.png'.format(t)))
    # Close explicitly: this is called repeatedly during training.
    plt.close(fig)

In [10]:
def eval_agent(env, model, agent_type='simple_dqn'):
    """Play one episode with epsilon fixed at 0 and return the summed reward.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step``
            returns a 4-tuple whose first three items are next state, reward
            and done flag.
        model: passed through to ``epsilon_greedy_act``.
        agent_type: accepted but not used.

    Returns:
        Sum of the rewards collected until ``step`` reports done. The
        environment is reset once more before returning.
    """
    total = 0
    obs = env.reset()
    finished = False
    while not finished:
        greedy_action = epsilon_greedy_act(obs, model, 0)
        obs, reward, finished, _ = env.step(greedy_action)
        total += reward
    env.reset()
    return total

In [11]:
def epsilon_greedy_act(state, model, eps_t):
    """Choose an action epsilon-greedily from ``model``'s output for ``state``.

    Args:
        state: a single state; converted with ``convert_to_var(..., add_dim=True)``
            so the model receives a batch of one.
        model: network whose output row for ``state`` holds one value per action.
        eps_t: probability of picking a uniformly random action instead of the
            arg-max one.

    Returns:
        The index of the chosen action.
    """
    state_var = convert_to_var(state, add_dim=True)
    q_values = model.forward(state_var).data.numpy()[0]
    if np.random.rand() < eps_t:
        # Take the number of actions from the model output rather than from a
        # notebook-level global, so the function works for whatever model it is
        # given and does not depend on cell execution order.
        action = np.random.randint(len(q_values))
    else:
        action = q_values.argmax()
    return action

In [1]:
# Number of transitions requested from the replay buffer per optimisation step;
# train() below reads this as a module-level global.
batch_size = 32

def train(env,
          eps_greedy_exploration_params,
          gamma=0.99,
          max_steps=100,
          learning_starts_in_steps=100,
          train_freq_in_steps=1,
          update_freq_in_steps=10,
          plot_freq_in_steps=10,
          eval_freq_in_episodes=5,
          seed=None
          ):
    """Train a DQN agent on ``env`` with epsilon-greedy exploration.

    Scalars are logged with a ``SummaryWriter`` into ``mylogs`` and image
    snapshots are written to ``logs/images_logs/images``. The mini-batch size
    is read from the module-level ``batch_size``.

    Args:
        env: environment providing ``action_space.n``,
            ``observation_space.shape``, ``get_all_states()``, ``cur_state_id``,
            ``reset()`` and ``step(action)``.
        eps_greedy_exploration_params: dict with ``'exploration_fraction'``
            (fraction of ``max_steps`` over which epsilon is annealed from 1.0)
            and ``'exploration_final_eps'`` (final epsilon).
        gamma: discount factor.
        max_steps: total number of environment steps.
        learning_starts_in_steps: optimisation and target updates only happen
            for steps strictly greater than this.
        train_freq_in_steps: run an optimisation step every this many steps.
        update_freq_in_steps: refresh the target network every this many steps.
        plot_freq_in_steps: save an image snapshot every this many steps.
        eval_freq_in_episodes: currently unused (the evaluation / early-stop
            logic is disabled, see the commented block in the loop).
        seed: if not ``None``, seeds numpy and torch; also passed to the
            replay buffer.

    Returns:
        ``(state_action_count, num_episodes)``: the visit-count array of shape
        ``(number of states, number of actions)`` and the number of finished
        episodes.
    """
    # `is not None` rather than truthiness, so that seed=0 is honoured too.
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    num_actions = env.action_space.n
    dim_states = env.observation_space.shape[0]
    n_all_states = env.get_all_states().shape[0]
    # define models
    model = DQNnet(num_actions, dim_states)
    target_model = copy.deepcopy(model)
    # define optimizer
    optimizer = torch.optim.RMSprop(model.parameters())
    # define schedule of epsilon in epsilon-greedy exploration
    schedule_timesteps = int(eps_greedy_exploration_params['exploration_fraction'] * max_steps)
    eps_shedule = LinearSchedule(schedule_timesteps=schedule_timesteps,
                                 initial_p=1.0,
                                 final_p=eps_greedy_exploration_params['exploration_final_eps'])
    folder = 'mylogs'
    images_folder = 'logs/images_logs/images'
    # Create the output directories up front so a first run on a clean
    # checkout does not fail while clearing or writing into them.
    os.makedirs(folder, exist_ok=True)
    os.makedirs(images_folder, exist_ok=True)
    clear_folder(folder)
    writer = SummaryWriter(folder)

    replay_buffer = ReplayBuffer(1000, seed=seed)
    num_episodes = 0
    episode_reward = 0
    state_action_count = np.zeros((n_all_states, num_actions))
    count_good_rewards = 0
    state = env.reset()
    try:
        for t in range(max_steps):
            eps_t = eps_shedule.value(t)
            action = epsilon_greedy_act(state, model, eps_t)
            # Count the visit before stepping, while env still reports the current state.
            state_action_count[env.cur_state_id][action] += 1

            next_state, rew, done, _ = env.step(action)
            replay_buffer.add(state, action, rew, next_state, done)
            state = next_state

            if rew == 1:
                count_good_rewards += 1
            episode_reward += rew
            if done:
                num_episodes += 1
                print('Episodes:', num_episodes, episode_reward)
                episode_reward = 0
                state = env.reset()

            learning_started = t > learning_starts_in_steps
            if learning_started and t % train_freq_in_steps == 0:
                batch = replay_buffer.sample(batch_size)
                loss = optimize_dqn_loss(optimizer, model, target_model, batch, gamma)
                writer.add_scalar('dqn/loss', loss, t)

            if learning_started and t % update_freq_in_steps == 0:
                target_model = copy.deepcopy(model)

            # Disabled: periodic greedy evaluation with early stopping.
            # if done and eval_freq_in_episodes is not None and num_episodes % eval_freq_in_episodes == 0:
            #     test_episode_reward = eval_agent(env, model)
            #     if test_episode_reward == 10:
            #         print('Successfully solved environment in {} episodes'.format(num_episodes))
            #         break

            all_states_var = convert_to_var(env.get_all_states())
            all_q_values = model.forward(all_states_var).data.numpy()
            for i in range(n_all_states):
                # With more than 10 states only the first two and last two are logged.
                if (2 <= i < n_all_states - 2) and n_all_states > 10:
                    continue
                writer.add_scalars('dqn/q_values/state_{}'.format(i+1),
                                   {'action_right': all_q_values[i][1],
                                    'action_left': all_q_values[i][0]},
                                   t)
            writer.add_scalar('dqn/count_good_reward', count_good_rewards, t)
            writer.add_scalar('dqn/eps_t', eps_t, t)

            if t % plot_freq_in_steps == 0:
                plot_amd_log_images(state_action_count, all_q_values, t, images_folder)
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
    return state_action_count, num_episodes

In [2]:
# Build the chain environment and read its sizes into notebook-level globals.
# Other cells look these names up as globals (e.g. check_model_copying uses
# num_actions, input_dim and chain_env).
# Requires the cell importing SimpleChain from helpers.chain_environment to have
# been run in the current kernel.
input_dim=5
chain_env=SimpleChain(input_dim)
num_actions = chain_env.action_space.n
dim_states = chain_env.observation_space.shape[0]

NameError: name 'SimpleChain' is not defined

In [3]:
# Epsilon-greedy settings passed to train(): epsilon is annealed from 1.0 to
# 'exploration_final_eps' over 'exploration_fraction' * max_steps steps.
# NOTE(review): 'flag' is not read by train() as defined in this notebook.
eps_greedy_exploration_params = {'exploration_fraction': 0.5,
                                 'exploration_final_eps': 0.05,
                                 'flag': True}

In [4]:
# Run training on the chain environment. All step-based settings are expressed
# as multiples of (input_dim + 9). Needs chain_env and input_dim from the
# environment-setup cell.
state_action_count, num_episodes = train(chain_env,
                                          eps_greedy_exploration_params,
                                          max_steps=500*(input_dim+9),
                                          learning_starts_in_steps=5*(input_dim+9),
                                          update_freq_in_steps=1*(input_dim+9),
                                          plot_freq_in_steps=1*(input_dim+9),
                                          seed=12
                                          )

NameError: name 'chain_env' is not defined

In [None]:
# Shape of the visit-count array returned by train(): (number of states, number of actions).
state_action_count.shape

In [None]:
# Raw visit counts per (state, action) pair collected during training.
state_action_count