In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class DeepQNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to one value per action.

    The module also owns its Adam optimizer, its MSE loss object and the
    device it lives on, so callers reach them through ``optimizer``,
    ``loss`` and ``device``.

    Args:
        lr: learning rate handed to the Adam optimizer.
        input_dims: sequence that is unpacked into the first layer's input
            size (e.g. ``[8]``).
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
        n_actions: number of outputs of the final layer.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super().__init__()

        # Keep the architecture hyper-parameters around for inspection.
        self.input_dims, self.fc1_dims = input_dims, fc1_dims
        self.fc2_dims, self.n_actions = fc2_dims, n_actions

        # Layers are built input -> output.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        # The optimizer must be created after the layers so it sees their parameters.
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        # Use the first CUDA device when one is available, otherwise the CPU.
        device_name = 'cpu'
        if T.cuda.is_available():
            device_name = 'cuda:0'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) output of the last layer for ``state``."""
        hidden1 = F.relu(self.fc1(state))
        hidden2 = F.relu(self.fc2(hidden1))
        return self.fc3(hidden2)

class Agent():
    """Epsilon-greedy Q-learning agent with a fixed-size circular replay memory.

    The agent owns a single ``DeepQNetwork`` (``Q_eval``) that is used both to
    pick actions and to compute the bootstrap target during ``learn``.

    Args:
        gamma: discount factor applied to the next-state value.
        epsilon: initial probability of taking a random action.
        lr: learning rate forwarded to the network's optimizer.
        input_dims: sequence describing the shape of one observation.
        batch_size: number of transitions sampled per ``learn`` call.
        n_actions: number of discrete actions.
        max_mem_size: capacity of the replay memory.
        eps_end: lower bound for ``epsilon``.
        eps_dec: amount subtracted from ``epsilon`` after each learning step.
    """

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0  # total number of transitions ever stored

        self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions, input_dims=input_dims,
                                  fc1_dims=256, fc2_dims=256)

        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        # Actions are later used as tensor indices, so they must be stored as integers.
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # The `np.bool` alias was removed from NumPy; the builtin `bool` works everywhere.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the replay memory.

        The memory is finite: once ``mem_size`` transitions have been stored,
        the oldest slots are overwritten (the write position wraps around).
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action for ``observation`` with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest network output is returned.

        Returns:
            The chosen action as a Python ``int``.
        """
        if np.random.random() > self.epsilon:
            # Cast to float32 explicitly so observations of another dtype
            # (e.g. float64) match the network weights, and add the leading
            # batch dimension the network's linear layers are fed with.
            state = T.as_tensor(np.asarray(observation), dtype=T.float32,
                                device=self.Q_eval.device).unsqueeze(0)
            # No gradient is needed for action selection.
            with T.no_grad():
                actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()  # .item() yields a Python int
        else:
            # int(...) keeps the return type identical to the greedy branch.
            action = int(np.random.choice(self.action_space))

        return action

    def learn(self):
        """Run one gradient step on a random minibatch from the replay memory.

        Does nothing until at least ``batch_size`` transitions have been
        stored. After the update, ``epsilon`` is decreased by ``eps_dec`` but
        never below ``eps_min``.
        """
        if self.mem_cntr < self.batch_size:
            return

        # Only the filled part of the memory may be sampled.
        max_mem = min(self.mem_cntr, self.mem_size)
        # Sample batch_size distinct slots (no transition is picked twice).
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        device = self.Q_eval.device
        state_batch = T.as_tensor(self.state_memory[batch], device=device)
        new_state_batch = T.as_tensor(self.new_state_memory[batch], device=device)
        reward_batch = T.as_tensor(self.reward_memory[batch], device=device)
        terminal_batch = T.as_tensor(self.terminal_memory[batch], device=device)
        # Index tensors must be integer typed; cast defensively.
        action_batch = T.as_tensor(self.action_memory[batch], dtype=T.long, device=device)
        # Row selector [0, 1, ..., batch_size - 1] paired with action_batch.
        batch_index = T.arange(self.batch_size, device=device)

        # Network output for the action that was actually taken in each sample.
        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]

        # The bootstrap target is treated as a constant: it is computed
        # without autograd so the loss only back-propagates through q_eval.
        with T.no_grad():
            q_next = self.Q_eval.forward(new_state_batch)
            q_next[terminal_batch] = 0.0  # no future value after a terminal step
            q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        self.Q_eval.optimizer.zero_grad()
        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay, clamped so epsilon never drops below eps_min.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)
        

In [2]:
import gym
import numpy as np
from utils import plot_learning_curve

# --- Setup: environment, agent and bookkeeping containers ---
env = gym.make('LunarLander-v2')
agent = Agent(
    gamma=0.99,
    epsilon=1.0,
    lr=0.003,
    input_dims=[8],
    batch_size=64,
    n_actions=4,
    eps_end=0.01,
)
scores = []
eps_history = []
n_games = 500

# --- Training: one pass of the outer loop per episode ---
for i in range(n_games):
    observation = env.reset()
    score, done = 0, False
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        # Store the transition, then learn from the memory after every step.
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        score += reward
        observation = observation_

    scores.append(score)
    eps_history.append(agent.epsilon)

    # Mean over the most recent 100 episodes (or fewer early on).
    avg_score = np.mean(scores[-100:])

    print('episode ', i,
          'score %.2f' % score,
          'average score %.2f' % avg_score,
          'epsilon %.2f ' % agent.epsilon)

# --- Plot: 1-based episode numbers on the x axis ---
x = list(range(1, n_games + 1))
filename = 'lunar_lander_2020.png'
plot_learning_curve(x, scores, eps_history, filename)
    

episode  0 score -207.90 average score -207.90 epsilon 0.99 
episode  1 score 4.32 average score -101.79 epsilon 0.95 
episode  2 score -169.46 average score -124.34 epsilon 0.91 
episode  3 score -298.81 average score -167.96 epsilon 0.86 
episode  4 score -226.16 average score -179.60 epsilon 0.82 
episode  5 score -73.62 average score -161.94 epsilon 0.76 
episode  6 score -102.26 average score -153.41 epsilon 0.70 
episode  7 score -59.79 average score -141.71 epsilon 0.64 
episode  8 score -77.22 average score -134.54 epsilon 0.59 
episode  9 score -269.75 average score -148.06 epsilon 0.49 
episode  10 score -12.17 average score -135.71 epsilon 0.42 
episode  11 score -103.38 average score -133.01 epsilon 0.28 
episode  12 score -59.69 average score -127.37 epsilon 0.18 
episode  13 score -161.55 average score -129.82 epsilon 0.10 
episode  14 score -82.82 average score -126.68 epsilon 0.01 
episode  15 score -53.73 average score -122.12 epsilon 0.01 
episode  16 score 33.31 aver

episode  136 score 253.98 average score 27.51 epsilon 0.01 
episode  137 score 258.92 average score 30.36 epsilon 0.01 
episode  138 score -27.06 average score 30.86 epsilon 0.01 
episode  139 score -112.31 average score 29.91 epsilon 0.01 
episode  140 score 279.61 average score 33.47 epsilon 0.01 
episode  141 score -37.16 average score 33.65 epsilon 0.01 
episode  142 score -4.81 average score 33.81 epsilon 0.01 
episode  143 score 125.01 average score 35.96 epsilon 0.01 
episode  144 score 197.17 average score 38.12 epsilon 0.01 
episode  145 score 279.16 average score 43.97 epsilon 0.01 
episode  146 score 257.09 average score 46.99 epsilon 0.01 
episode  147 score 196.53 average score 49.58 epsilon 0.01 
episode  148 score 173.32 average score 51.56 epsilon 0.01 
episode  149 score 210.80 average score 53.73 epsilon 0.01 
episode  150 score 224.63 average score 56.42 epsilon 0.01 
episode  151 score -12.95 average score 56.98 epsilon 0.01 
episode  152 score 241.50 average score 

episode  271 score 224.12 average score 67.92 epsilon 0.01 
episode  272 score 16.16 average score 65.95 epsilon 0.01 
episode  273 score -94.95 average score 64.21 epsilon 0.01 
episode  274 score -126.19 average score 61.09 epsilon 0.01 
episode  275 score 211.42 average score 61.21 epsilon 0.01 
episode  276 score -20.33 average score 60.72 epsilon 0.01 
episode  277 score 226.65 average score 60.99 epsilon 0.01 
episode  278 score -40.17 average score 60.47 epsilon 0.01 
episode  279 score -278.38 average score 59.90 epsilon 0.01 
episode  280 score 268.19 average score 62.37 epsilon 0.01 
episode  281 score -64.38 average score 58.61 epsilon 0.01 
episode  282 score -47.71 average score 58.24 epsilon 0.01 
episode  283 score -30.24 average score 54.92 epsilon 0.01 
episode  284 score 46.28 average score 53.08 epsilon 0.01 
episode  285 score -204.46 average score 48.37 epsilon 0.01 
episode  286 score 199.00 average score 50.58 epsilon 0.01 
episode  287 score 242.06 average score

episode  407 score 213.13 average score 148.79 epsilon 0.01 
episode  408 score 233.34 average score 152.96 epsilon 0.01 
episode  409 score 127.94 average score 154.84 epsilon 0.01 
episode  410 score 257.43 average score 154.71 epsilon 0.01 
episode  411 score 239.68 average score 157.14 epsilon 0.01 
episode  412 score -12.31 average score 155.26 epsilon 0.01 
episode  413 score 233.52 average score 157.91 epsilon 0.01 
episode  414 score 162.73 average score 157.04 epsilon 0.01 
episode  415 score 257.48 average score 156.77 epsilon 0.01 
episode  416 score 269.11 average score 160.11 epsilon 0.01 
episode  417 score 264.69 average score 160.00 epsilon 0.01 
episode  418 score 237.06 average score 159.61 epsilon 0.01 
episode  419 score 272.55 average score 160.06 epsilon 0.01 
episode  420 score 255.50 average score 160.51 epsilon 0.01 
episode  421 score 253.36 average score 162.50 epsilon 0.01 
episode  422 score 230.85 average score 164.68 epsilon 0.01 
episode  423 score -15.8

NameError: name 'socres' is not defined