In [1]:
# Example 8-3: Deep Q Network 구현
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
import numpy as np

### Reference: http://www.modulabs.co.kr/RL4RWS/18828
import gym
from gym.envs.registration import register
register(
    id='FrozenLake-v3',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={
        'map_name': '4x4',
        'is_slippery': False
    }
)
env = gym.make("FrozenLake-v3")

In [2]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.fc1 = nn.Linear(16,64)
        self.fc2 = nn.Linear(64,64)
        self.fc3 = nn.Linear(64,96)
        self.fc4 = nn.Linear(96,96)
        self.fc5 = nn.Linear(96,64)
        self.fc6 = nn.Linear(64,64)
        self.fc7 = nn.Linear(64,4)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = self.fc7(x)
        return x
    
model = Net()      
    

In [3]:
def onehot2tensor(state):
    tmp = np.zeros(16)
    tmp[state] = 1
    vector = np.array(tmp, dtype='float32')
    tensor = torch.from_numpy(vector).float()
    return tensor

def applymodel(tensor):
    output_tensor = model(tensor)
    output_vector = output_tensor.data.numpy()
    return output_tensor, output_vector

In [14]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr=0.01)

n_episode = 10000
dr = 0.9  ## discount rate
er = 0.1  ## exploration & exploitation rate

total_reward = 0.0
for i_episode in range(n_episode):
    observation = env.reset()
    episode_reward = 0.0
    done = None
    total_loss = 0.0
    for t in range(100):
        current_state = observation
        optimizer.zero_grad()
        current_tensor = onehot2tensor(current_state)
        current_output_tensor, current_output_vector = applymodel(current_tensor)
        
        if np.random.rand() < er:
            action = env.action_space.sample()
        else:
            action = np.argmax(current_output_vector) 
        
        observation, reward, done, info = env.step(action)
        observation_tensor = onehot2tensor(observation)
        observation_output_tensor, observation_output_vector = applymodel(observation_tensor)
        
        q = reward + dr*np.max(observation_output_vector)
        q_vector = np.copy(current_output_vector)
        q_vector[action] = q
        q_variable = torch.Tensor(q_vector)
        
        loss = criterion(current_output_tensor, q_variable)
        loss.backward()
        optimizer.step()
        total_loss += loss.data.item()
        if done:
            episode_reward += reward
    
    total_reward += episode_reward
    if (i_episode+1)%(n_episode/10) == 0:
        print(i_episode+1, total_loss, total_reward)


1000 0.0010226326660500717 18.0
2000 9.340774026115895e-05 34.0
3000 3.0625657686494244e-06 49.0
4000 2.1685012717391565e-26 64.0
5000 2.7041020714003942e-21 75.0
6000 0.00010881005086811331 86.0
7000 0.0003568643608056905 102.0
8000 9.863590718747606e-06 117.0
9000 0.2696659995341668 142.0
10000 9.814388510285925e-08 154.0


In [15]:
print("Total reward:", total_reward)
print("Average reward:", total_reward/n_episode)

Total reward: 154.0
Average reward: 0.0154


In [17]:
## play using learned Q values
n_episode = 1000
total_reward = 0.0
for i_episode in range(n_episode):
    observation = env.reset()
    episode_reward = 0.0
    for t in range(100):
        current_state = observation    
        current_tensor = onehot2tensor(current_state)
        current_output_tensor, current_output_vector = applymodel(current_tensor)
        action = np.argmax(current_output_vector) 
        observation, reward, done, info = env.step(action)
        if done:
            episode_reward += reward
    total_reward += episode_reward
    #print(i_episode, total_reward)

print(current_output_vector)
print("Total reward:", total_reward)
print("Average reward:", total_reward/n_episode)

[-4.2458702e-04  8.5975182e-05  1.1103050e-04 -5.5245910e-06]
Total reward: 0.0
Average reward: 0.0
