### Using CNN with 25 outputs

In [None]:
import gym
import gym_warehouse
import numpy as np
import copy

env_id = "warehouse-v0"


class NoopResetEnv(gym.Wrapper):
    """Gym wrapper that randomises the start state with no-op steps on reset."""

    def __init__(self, env, noop_max=30):
        """Wrap ``env`` so that each reset is followed by a run of no-ops.

        Action 0 is used as the no-op; the constructor asserts that the
        underlying environment reports action 0 as ``'NOOP'``.

        Args:
            env: Environment to wrap.
            noop_max: Upper bound (inclusive) on the number of no-op steps.
        """
        super().__init__(env)
        self.noop_action = 0
        self.override_num_noops = None
        self.noop_max = noop_max
        first_action_name = env.unwrapped.get_action_meanings()[0]
        assert first_action_name == 'NOOP'

    def reset(self, **kwargs):
        """Reset the env, take between 1 and ``noop_max`` no-op steps.

        Returns:
            ``info['image']`` from the last no-op step (not the observation
            returned by the environment itself).
        """
        self.env.reset(**kwargs)

        # A fixed count can be forced via ``override_num_noops``; otherwise
        # the count is drawn from the base environment's own RNG.
        noops = self.override_num_noops
        if noops is None:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)  # pylint: disable=E1101
        assert noops > 0

        for _ in range(noops):
            _, _, done, info = self.env.step(self.noop_action)
            if done:
                # NOTE(review): if this happens on the final no-op, ``info``
                # still belongs to the terminated episode -- confirm whether
                # that is acceptable for the warehouse env.
                self.env.reset(**kwargs)
        return info['image']

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        result = self.env.step(ac)
        return result
class MaxAndSkipEnv(gym.Wrapper):
    """Gym wrapper that repeats each action ``skip`` times.

    Rewards of the repeated steps are summed and the returned observation is
    the element-wise maximum of the two-slot frame buffer.
    """

    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame.

        Args:
            env: Environment to wrap; its ``observation_space.shape`` sizes
                the frame buffer.
            skip: Number of times each action is repeated.
        """
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8)
        self._skip = skip

    # The original class defined ``reset`` twice; only the later definition
    # (the one accepting **kwargs) was ever effective, so it is the one kept.
    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding any keyword arguments."""
        return self.env.reset(**kwargs)

    def step(self, action):
        """Repeat action, sum reward, and max over last observations.

        Returns:
            ``(max_frame, total_reward, done, info)`` where ``done`` and
            ``info`` come from the last step actually executed.
        """
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            # Only the last two frames of a full repeat are stored. When the
            # episode ends earlier, the buffer keeps whatever it held before.
            if i == self._skip - 2:
                self._obs_buffer[0] = obs
            if i == self._skip - 1:
                self._obs_buffer[1] = obs
            # np.sum collapses the per-step reward to a scalar, so the
            # accumulated reward is scalar even if ``reward`` is an array.
            total_reward += np.sum(reward)
            if done:
                break
        # Note that the observation on the done=True frame
        # doesn't matter
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, done, info



    
def make_warehouse_cnn(env_id):
    """Build the warehouse environment wrapped for CNN training.

    The base environment is wrapped first with ``NoopResetEnv`` (up to 30
    no-ops on reset) and then with ``MaxAndSkipEnv`` (each action repeated
    4 times).
    """
    base_env = gym.make(env_id)
    with_noops = NoopResetEnv(base_env, noop_max=30)
    return MaxAndSkipEnv(with_noops, skip=4)


# Create the wrapped environment and fetch the first frame. The frame's axes
# are reversed with transpose (2, 1, 0) before it is used as the network input.
env = make_warehouse_cnn(env_id)
state = np.transpose(env.reset(), (2, 1, 0))

In [None]:
# env = gym.make('PongNoFrameskip-v0')
# from wrappers import make_atari, wrap_deepmind, wrap_pytorch
# env    = make_atari('PongNoFrameskip-v0')
# env    = wrap_deepmind(env)
# env    = wrap_pytorch(env)

# env.render()

In [None]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

out_dir = 'outdir_cnn_1/'


import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import matplotlib.pyplot as plt

# True when a CUDA device is available to torch.
USE_CUDA = torch.cuda.is_available()
# Rebinds the imported ``Variable`` name to a factory that builds an
# ``autograd.Variable`` and moves it to the GPU whenever USE_CUDA is set.
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

from collections import deque

gamma      = 0.99

class ReplayBuffer(object):
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        # A bounded deque silently drops the oldest transition once full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition.

        ``state`` and ``next_state`` get a leading axis of length 1 so that
        sampled states can later be concatenated into a batch.
        """
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)``; the two state
            entries are arrays stacked along axis 0, the rest are tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        stacked_states = np.concatenate(states)
        stacked_next_states = np.concatenate(next_states)
        return stacked_states, actions, rewards, stacked_next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


def update_target(current_model, target_model):
    """Copy every parameter and buffer of ``current_model`` into ``target_model``."""
    online_weights = current_model.state_dict()
    target_model.load_state_dict(online_weights)

# estimated_next_q_state_values=[]
# estimated_next_q_value=[]

class CnnDQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: Shape of one observation, channels first. Its first
            entry sets the number of input channels of the first conv layer.
        num_actions: Number of Q-values (one per action) produced per input.
    """

    def __init__(self, input_shape, num_actions):
        super(CnnDQN, self).__init__()

        self.input_shape = input_shape
        self.num_actions = num_actions

        self.features = nn.Sequential(
            # Channel count is taken from the input shape instead of being
            # hard-coded to 3, so non-RGB inputs work too.
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions)
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)`` for a batch ``x``."""
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    def feature_size(self):
        """Return the flattened size of the conv stack's output for one input."""
        probe_device = next(self.features.parameters()).device
        # Only the output shape is needed, so no autograd graph is built.
        with torch.no_grad():
            probe = torch.zeros(1, *self.input_shape, device=probe_device)
            return self.features(probe).view(1, -1).size(1)

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: A single observation (no batch axis), array-like.
            epsilon: Probability-like threshold; a uniform random action is
                taken unless ``random.random() > epsilon``.

        Returns:
            The chosen action index as a Python int in ``[0, num_actions)``.
        """
        if random.random() > epsilon:
            model_device = next(self.parameters()).device
            # Inference only: torch.no_grad() replaces the removed
            # ``volatile=True`` flag.
            with torch.no_grad():
                batch = torch.as_tensor(
                    np.asarray(state, dtype=np.float32), device=model_device
                ).unsqueeze(0)
                q_value = self.forward(batch)
            action = q_value.max(1)[1].item()
        else:
            # Explore over the network's own action set rather than a
            # module-level ``env``, so greedy and random actions always share
            # the same index range.
            action = random.randrange(self.num_actions)
        return action

def plot(frame_idx, rewards, losses):
    """Redraw the reward and loss curves and save the figure to ``out_dir``.

    The figure is saved once after the reward panel is drawn
    (``rewards.png``) and again after the loss panel is added
    (``losses.png``), then shown inline.
    """
    clear_output(True)
    plt.figure(figsize=(20, 5))

    reward_title = 'frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:]))
    panels = (
        (131, reward_title, rewards, 'rewards.png'),
        (132, 'loss', losses, 'losses.png'),
    )
    for position, title, series, filename in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(series)
        plt.savefig(out_dir + filename)

    plt.show()
    
    

def compute_td_loss(batch_size):
    """Run one Double-DQN optimisation step on a sampled minibatch.

    Samples ``batch_size`` transitions from the module-level
    ``replay_buffer``, builds the target
    ``reward + gamma * Q_target(s', argmax_a Q_current(s', a)) * (1 - done)``
    and takes one ``optimizer`` step on the mean squared TD error of
    ``current_model``.

    Args:
        batch_size: Number of transitions to sample.

    Returns:
        The scalar loss tensor for this step.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Put the batch on whatever device the online network lives on.
    model_device = next(current_model.parameters()).device
    state = torch.as_tensor(np.asarray(state, dtype=np.float32), device=model_device)
    next_state = torch.as_tensor(np.asarray(next_state, dtype=np.float32), device=model_device)
    action = torch.as_tensor(np.asarray(action, dtype=np.int64), device=model_device)
    reward = torch.as_tensor(np.asarray(reward, dtype=np.float32), device=model_device)
    done = torch.as_tensor(np.asarray(done, dtype=np.float32), device=model_device)

    # Per-transition rewards may be vectors (summed here) or already scalars;
    # summing over dim 1 unconditionally raises IndexError for scalar rewards.
    if reward.dim() > 1:
        reward = reward.sum(dim=1)

    q_values = current_model(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant w.r.t. the parameters, so it is computed
    # without building an autograd graph.
    with torch.no_grad():
        next_q_cur_values = current_model(next_state)
        next_q_tar_values = target_model(next_state)
        # Double DQN: the online net selects the action, the target net scores it.
        best_next_action = next_q_cur_values.max(1)[1].unsqueeze(1)
        next_q_value = next_q_tar_values.gather(1, best_next_action).squeeze(1)
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [None]:
# type(state)

# print(state.shape)

# print(np.transpose(state,(2,1,0)).shape)

In [None]:
# current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
# target_model  = CnnDQN(env.observation_space.shape, env.action_space.n)


# Online and target networks, both with 25 outputs.
NUM_ACTIONS = 25
current_model = CnnDQN(state.shape, NUM_ACTIONS)
target_model = CnnDQN(state.shape, NUM_ACTIONS)

# To resume a previous run instead, load the saved weights, e.g.:
#   current_model.load_state_dict(torch.load('outdir/current.ckpt'))
#   target_model.load_state_dict(torch.load('outdir/target.ckpt'))

if USE_CUDA:
    current_model = current_model.cuda()
    target_model = target_model.cuda()

optimizer = optim.Adam(current_model.parameters(), lr=1e-5)

# Learning starts only once the buffer holds more than `replay_initial` items.
replay_initial = 10000
replay_buffer = ReplayBuffer(100000)

# Start with both networks holding identical weights.
update_target(current_model, target_model)

# Exponential epsilon schedule from epsilon_start towards epsilon_final.
epsilon_start = 1.0
epsilon_final = 0.001
epsilon_decay = 10000000


def epsilon_by_frame(frame_idx):
    """Return the exploration rate to use at frame ``frame_idx``."""
    decay = math.exp(-1. * frame_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay


num_frames = 10000000
batch_size = 32

# Training history. To resume, these could be restored from
# 'outdir/losses.npy' and 'outdir/rewards.npy' via np.load(...).tolist().
losses = []
all_rewards = []
episode_reward = 0

# state = env.reset()

# state = env.reset()

In [None]:

# env.render()
# Main training loop: act epsilon-greedily, store the transition, and
# periodically train, checkpoint, plot and sync the target network.
for frame_idx in range(1, num_frames + 1):
    epsilon = epsilon_by_frame(frame_idx)
    action = current_model.act(state, epsilon)

    # The observation returned by step() is ignored; the frame fed to the
    # network is info["image"] with its axes reversed.
    _, reward, done, info = env.step(action)
    next_state = np.transpose(info["image"], (2, 1, 0))

    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if frame_idx % 100000 == 0:
        # Save loss and reward history.
        np.save(out_dir + 'losses.npy', np.array(losses))
        np.save(out_dir + 'rewards.npy', np.array(all_rewards))

        # Save checkpoints (state dicts). The file names are fixed, so each
        # save overwrites the previous one.
        torch.save(current_model.state_dict(), out_dir + 'current.ckpt')
        torch.save(target_model.state_dict(), out_dir + 'target.ckpt')

        # Save the full model objects.
        torch.save(current_model, out_dir + 'current.model')
        torch.save(target_model, out_dir + 'target.model')

    if frame_idx % 10000 == 0:
        print('Frame: ', frame_idx)
        print("Episode Reward: {}".format(episode_reward))

    if done:
        print("done")
        # The frame returned by reset() must get the same axis reversal as
        # every other state; otherwise the next act()/push() receives an
        # array laid out differently from the rest of the replay buffer.
        state = np.transpose(env.reset(), (2, 1, 0))
        all_rewards.append(episode_reward)
        episode_reward = 0

    # Train every 4th frame once the buffer holds enough transitions.
    if (len(replay_buffer) > replay_initial) and (frame_idx % 4 == 0):
        loss = compute_td_loss(batch_size)
        losses.append(loss.item())

    if frame_idx % 100000 == 0:
        plot(frame_idx, all_rewards, losses)
        update_target(current_model, target_model)