In [1]:
import gym
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import base64, io
import numpy as np
from collections import deque, namedtuple
import time

In [2]:
# Gym environment id used throughout the notebook. Table of available environments:
# https://github.com/openai/gym/wiki/Table-of-environments
ENV_NAME = 'LunarLander-v2'

In [3]:
# Smoke test: create the environment and play one episode with uniformly random
# actions, rendering every step.
env = gym.make(ENV_NAME)
env.reset()

# Length of the observation vector and number of discrete actions.
STATE_SIZE = len(env.observation_space.low)
ACTION_SIZE = env.action_space.n

done = False

time.sleep(1)  # one-second pause before stepping starts (kept from the original cell)
try:
    while not done:
        # random.randint is inclusive on both ends, hence ACTION_SIZE - 1.
        action = random.randint(0, ACTION_SIZE - 1)
        # Only the termination flag is used here; the other step() results are discarded.
        _, _, done, _ = env.step(action)
        env.render()
finally:
    # Close the environment even when step()/render() raises or the cell is interrupted.
    env.close()

In [4]:
# Display the observation space of the environment created above.
env.observation_space

Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)

In [5]:
# Display the action space of the environment created above.
env.action_space

Discrete(4)

In [6]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one value per action.

    Layout: ``state_size -> 64 -> 64 -> action_size`` with a ReLU after each of
    the two hidden layers and no activation on the output layer.
    """

    def __init__(self, state_size, action_size, seed):
        """Create the three linear layers.

        Args:
            state_size: Number of input features.
            action_size: Number of outputs (one per action).
            seed: Value passed to ``torch.manual_seed`` before the layers are
                created, so that weight initialisation is repeatable.
        """
        # The constructor has to be the dunder ``__init__`` and has to chain to
        # ``nn.Module.__init__``; otherwise ``QNetwork(state_size, action_size, seed)``
        # raises TypeError and no sub-modules are registered.
        super(QNetwork, self).__init__()
        # torch.manual_seed seeds the global generator and returns it.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        """Return the network output for ``state`` (last dimension ``state_size``)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [7]:
def calculate_bucket_sizes(ob_space_box,memory):
    """Distribute a total bucket budget over dimensions in proportion to their ranges.

    Every dimension gets ``relative_range * base`` buckets, where
    ``relative_range`` is that dimension's ``high - low`` divided by the first
    dimension's ``high - low``, and ``base`` is chosen so that the product of
    all (untruncated) counts equals ``memory``. Counts are truncated with ``int``.

    Args:
        ob_space_box: Object with ``low`` and ``high`` attributes that support
            indexing and slicing (presumably a gym ``Box`` - verify against callers).
        memory: Total number of buckets to distribute.

    Returns:
        A list of ints: the first dimension's count followed by one count per
        remaining dimension.
    """
    lows, highs = ob_space_box.low, ob_space_box.high
    first_range = highs[0] - lows[0]

    # Range of each remaining dimension, expressed relative to the first one.
    relative_ranges = [(hi - lo) / first_range for lo, hi in zip(lows[1:], highs[1:])]

    scale = 1
    for ratio in relative_ranges:
        scale *= ratio

    # Bucket count of the first dimension; the others are multiples of it.
    base = (memory / scale) ** (1 / len(lows))
    return [int(base)] + [int(ratio * base) for ratio in relative_ranges]

In [8]:
# Hyperparameters shared by the agent, the replay buffer and the training loop.
BUFFER_SIZE = 10 ** 5   # capacity of the replay buffer (number of stored experiences)
BATCH_SIZE = 64         # experiences drawn per learning step
GAMMA = 0.99            # discount factor applied to the next-state value
TAU = 1e-3              # interpolation weight of the soft target-network update
LR = 1e-3               # learning rate handed to the Adam optimizer
UPDATE_EVERY = 4        # a learning step is attempted once every this many agent steps
RENDER_EVERY = 50       # every this many training episodes, the episode is rendered

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
class Agent():
    """DQN agent with a local (trained) network, a target network and a replay buffer."""

    def __init__(self, state_size, action_size, seed):
        """Build both Q-networks, the optimizer and the replay memory.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of discrete actions.
            seed: Seed for Python's ``random`` module; also forwarded to the
                networks and the replay buffer.
        """
        # This must be the dunder ``__init__``: with a plain method called
        # ``init``, ``Agent(state_size=..., action_size=..., seed=...)`` raises
        # TypeError and none of the attributes below exist.
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and store the value itself.
        random.seed(seed)
        self.seed = seed
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        # Only the local network is optimised; the target network follows it
        # through soft_update().
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Step counter modulo UPDATE_EVERY; a learning step is attempted when it wraps to 0.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``UPDATE_EVERY`` calls, learn from a sampled batch."""
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn only once the buffer holds more than one batch worth of experiences.
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Choose an action epsilon-greedily from the local network.

        Args:
            state: Current state as a numpy array (it is passed to ``torch.from_numpy``).
            eps: Probability-like threshold; a random action is returned when
                ``random.random() <= eps``, otherwise the argmax action.

        Returns:
            The index of the chosen action.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients in eval mode, then switch back to train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch and soft-update the target network.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors, as returned by ``ReplayBuffer.sample``.
            gamma: Discount factor.
        """
        states, actions, rewards, next_states, dones = experiences
        # Highest target-network value per next state, kept as a column and
        # detached so no gradient flows into the target network.
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # (1 - dones) removes the bootstrap term for terminal transitions.
        q_targets = rewards + gamma * q_targets_next * (1 - dones)
        # Local-network value of the action that was actually taken.
        q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend parameters in place: ``target = tau * local + (1 - tau) * target``."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [10]:
class ReplayBuffer:
    """Bounded store of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Args:
            action_size: Number of discrete actions (stored, not otherwise used here).
            buffer_size: Maximum number of experiences kept; once full, the
                oldest entries are dropped first (``deque(maxlen=...)``).
            batch_size: Number of experiences returned by ``sample``.
            seed: Seed for Python's ``random`` module.
        """
        # This must be the dunder ``__init__``: with a plain method called
        # ``init``, ``ReplayBuffer(action_size, ...)`` raises TypeError.
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so seed first and store the value itself.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Append one experience to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on ``device``, each built by stacking one row per experience.
            ``actions`` is a long tensor; the others are float tensors, with
            ``dones`` converted to 0.0 / 1.0.
        """
        experiences = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def column(field):
            # One row per sampled experience for the requested field.
            return np.vstack([getattr(e, field) for e in experiences])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        # Booleans go through uint8 first because torch.from_numpy is fed a numeric array here.
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

In [12]:
def dqn(episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level ``agent`` on the module-level ``env``.

    Runs up to ``episodes`` episodes with an epsilon-greedy policy, prints the
    running average score, and stops early (saving the local network's weights
    to ``checkpoint.pth``) once the average over the last 100 episodes reaches 200.

    Args:
        episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        eps_start: Initial epsilon.
        eps_end: Lower bound for epsilon.
        eps_decay: Factor epsilon is multiplied by after every episode.

    Returns:
        List with the total reward of every episode that was played.
    """
    scores = []
    scores_window = deque(maxlen=100)  # most recent scores, used for the running average
    eps = eps_start
    try:
        for i_episode in range(1, episodes + 1):
            render_episode = i_episode % RENDER_EVERY == 0
            state = env.reset()
            score = 0
            for t in range(max_t):
                action = agent.act(state, eps)
                next_state, reward, done, _ = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if render_episode:
                    env.render()
                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            eps = max(eps_end, eps_decay * eps)

            # Computed once per episode instead of once per print/comparison.
            avg_score = np.mean(scores_window)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))
            # Require a full window so that a few lucky early episodes cannot
            # end training before 100 episodes have been averaged.
            window_full = len(scores_window) == scores_window.maxlen
            if window_full and avg_score >= 200.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_score))
                torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
            # Kept from the original loop: close after every unsolved episode
            # (presumably to dismiss the render window - TODO confirm).
            env.close()
    finally:
        # Also close on the "solved" path and when training is interrupted or fails.
        env.close()
    return scores

# Size the agent from the values probed from the environment (STATE_SIZE,
# ACTION_SIZE) rather than repeating the literals 8 and 4, so they cannot
# drift apart if ENV_NAME changes.
agent = Agent(state_size=STATE_SIZE, action_size=ACTION_SIZE, seed=42)
scores = dqn()

Episode 100	Average Score: -153.30
Episode 200	Average Score: -76.425
Episode 300	Average Score: 3.8785
Episode 400	Average Score: 66.70
Episode 500	Average Score: 129.22
Episode 600	Average Score: 176.05
Episode 700	Average Score: 180.65
Episode 800	Average Score: 183.21
Episode 875	Average Score: 201.76
Environment solved in 875 episodes!	Average Score: 201.76


In [13]:
# Print the per-episode scores returned by dqn().
# NOTE(review): this prints the whole list, one entry per episode; a slice such
# as scores[-10:] or a summary statistic may be easier to read.
print(scores)

[-67.40681434678442, -147.04335270227904, -93.35642094433472, -237.53897114439368, -128.50168698274024, -71.07253885741804, -272.0517110130799, -103.86621680235301, -135.23279313936123, -106.39074024638303, -148.11371976176747, -90.96117209405877, -34.05879067551106, -362.7630523310892, -333.0431441020667, -60.442844308252205, -112.34816037668699, -155.08318047726104, -168.7177897017663, -209.61276782513949, -25.23902710437177, -368.87632700234303, -281.52416774847063, -134.6876318950338, -184.28442528394316, -255.58491728994403, -69.98163730879797, -105.29104197302694, -99.5793065718472, -79.20542894871394, -31.234493982495223, -123.16788326435301, -181.9671919021605, -164.9270281623543, -59.05688085704969, -233.32901573526667, -131.4712602617549, -250.5949229622769, -90.71346324729868, -118.36101088087139, -108.03997758452228, -58.86677125738018, -242.68649014830618, -315.42875238673713, -113.54625983575306, -76.95944659952069, -234.67158655275728, -199.93741460044617, -136.959398447

In [None]:
# Learning curve: score of every training episode, in the order they were played.
fig = plt.figure()
episode_index = np.arange(len(scores))
plt.plot(episode_index, scores)
plt.xlabel('Trials')
plt.ylabel('Score')
plt.show()