In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.001
EPSILON = 0.1
GAMMA = 0.99
TARGET_UPDATE = 10
REPLAY_CAPACITY = 10000
ALPHA = 0.6
BETA = 0.4
BETA_INCREMENT = 0.001
UPDATE_EVERY = 4  
BUFFER_SIZE = int(1e5) 

In [6]:
pip install 'gym[box2d]'

Collecting box2d-py==2.3.5
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25ldone
[?25h  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp38-cp38-macosx_10_9_x86_64.whl size=498234 sha256=16be6af9aed10345c81ee93dacebe4ada591889f977bc5ad6141a72084feddf3
  Stored in directory: /Users/kamsingh/Library/Caches/pip/wheels/8b/95/16/1dc99ff9a3f316ff245fdb5c9086cd13c35dad630809909075
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.5
Note: you may need to restart the kernel to use updated packages.


In [7]:
class PrioritizedReplayBuffer():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.pos = 0
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def push(self, state, action, reward, next_state, done):
        transition = (state, action, reward, next_state, done)
        max_priority = self.priorities.max() if self.memory else 1.0
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.pos] = transition
        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta):
        if len(self.memory) == self.capacity:
            priorities = self.priorities
        else:
            priorities = self.priorities[:self.pos]
        probs = priorities ** ALPHA
        probs /= probs.sum()
        indices = np.random.choice(len(self.memory), batch_size, p=probs)
        experiences = [self.memory[i] for i in indices]
        total = len(self.memory)
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences])).long()
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences])).float().
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones, indices, weights)

    def update_priorities(self, indices, priorities):
        for i, priority in zip(indices, priorities):
            self.priorities[i] = priority

    def __len__(self):
        return len(self.memory)


In [8]:
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed):
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [94]:
class DQNAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed)
        self.qnetwork_target = QNetwork(state_size, action_size, seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LEARNING_RATE)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(BUFFER_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        
    def steps(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.push(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE, BETA)
                self.learn(experiences, GAMMA)
                
                
#     def act(self, state, eps=0.):
#         """Returns actions for given state as per current policy.

#         Params
#         ======
#             state (array_like): current state
#             eps (float): epsilon, for epsilon-greedy action selection
#         """
# #         print(state)
# #         state_array = np.array(state)
# #         print(f'state {state_array} is of type {type(state_array)} and shape {state.shape}')
# #         state_array, _ = state
#         state_array = state[0]
#         state_tensor = torch.from_numpy(state_array).float().unsqueeze(0)
        
#         state = torch.from_numpy(state_array).float().unsqueeze(0)
#         self.qnetwork_local.eval()
#         with torch.no_grad():
#             action_values = self.qnetwork_local(state)
#         self.qnetwork_local.train()

#         # Epsilon-greedy action selection
#         if random.random() > eps:
#             return np.argmax(action_values.cpu().data.numpy())
#         else:
#             return random.choice(np.arange(self.action_size))

## THERE IS SOMETHING WRONG HERE THATS GIVING ME SHAPE PROBLEMS =(
    def act(self, state, eps=0.):
        # Convert state to tensor
        print(f'state position 0 is {state[0:-1]}, and state position 1 is , {state[-1]}')
        state_array = state
        print(np.array(state[0:-1]))
        print(f'state position 0 is {state[0:-1]}, and state position 1 is , {state[-1]}')
        
        print(len(state))
        state_array = np.empty([1,len(state[0:-1])])
        state_array[0:-1], _ = state
        state_tensor = torch.from_numpy(state_array).float().unsqueeze(0)

        # Get action values
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_tensor)

        # Choose epsilon-greedy action
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def learn(self, experiences, gamma):
            states, actions, rewards, next_states, dones, indices, weights = experiences

            # Compute Q targets for next states
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)

            # Compute loss
            td_errors = Q_targets - Q_expected
            loss = (weights * td_errors ** 2).mean()

            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.memory.update_priorities(indices, td_errors.abs().detach().cpu().numpy())
            self.optimizer.step()

            # Update target network
            self.soft_update(self.qnetwork_local, self.qnetwork_target)

            # Update beta parameter
            global BETA
            BETA = min(1.0, BETA + BETA_INCREMENT)
            
            
    def soft_update(self, local_model, target_model, tau=0.001):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


In [95]:
import gym

# now theres a problem with the environment
env = gym.make('ALE/seaquest-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size=state_size, action_size=action_size, seed=0)

n_episodes = 1000
max_t = 1000
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.995

for i_episode in range(1, n_episodes+1):
    state = env.reset()
    eps = eps_start
    for t in range(max_t):
        # Select action
        action = agent.act(state, eps)
        print(f'taking action {action}')
        # Take action
        next_state, reward, done, info,_ = env.step(action)  # why does this return 5 values not 4???
        # Store experience
        agent.steps(state, action, reward, next_state, done)
        # Update state
        state = next_state
        # Update epsilon
        eps = max(eps_end, eps_decay*eps)
        # If episode is done, exit loop
        if done:
            break
    # Print episode score
    print(f"Episode {i_episode} score: {t+1}")
    
    # Update beta parameter
    agent.memory.beta = min(agent.memory.beta + BETA_INCREMENT, 1)

    # Update target network
    if i_episode % TARGET_UPDATE == 0:
        agent.qnetwork_target.load_state_dict(agent.qnetwork_local.state_dict())


state position 0 is (array([ 1.1356354e-03,  1.3998280e+00,  1.1501759e-01, -4.9298069e-01,
       -1.3091781e-03, -2.6053220e-02,  0.0000000e+00,  0.0000000e+00],
      dtype=float32),), and state position 1 is , {}
2


ValueError: could not broadcast input array from shape (8,) into shape (0,1)

In [102]:
pip install 'gym[atari]'

Collecting ale-py~=0.8.0
  Downloading ale_py-0.8.1-cp38-cp38-macosx_10_15_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 3.3 MB/s eta 0:00:01
[?25hCollecting importlib-resources
  Using cached importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Installing collected packages: importlib-resources, ale-py
Successfully installed ale-py-0.8.1 importlib-resources-5.12.0
Note: you may need to restart the kernel to use updated packages.


In [108]:
pip install gym --upgrade

Note: you may need to restart the kernel to use updated packages.


In [109]:
import gym

# Create an instance of the Seaquest environment
env = gym.make('Seaquest-v0')


NameNotFound: Environment Seaquest doesn't exist. 

In [111]:
env_names = [spec.id for spec in gym.envs.registry.values()]
print('Seaquest-v0' in env_names)

False


In [113]:
!pip install atari_py

Collecting atari_py
  Downloading atari_py-0.2.9-cp37-cp37m-macosx_10_12_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: atari_py
Successfully installed atari_py-0.2.9


In [138]:
import gym
from gym.envs.atari import AtariEnv
from gym import error, spaces

def _make_seaquest_env(full_action_space=True):
    env = AtariEnv(game='seaquest', obs_type='image', frameskip=1)
    if not full_action_space:
        env = AtariEnv(game='seaquest', obs_type='image', frameskip=1, full_action_space=False)
    return env

# Register the Seaquest environment
try:
    gym.envs.register(
        id='Seaquest-v0',
        entry_point=_make_seaquest_env,
        max_episode_steps=100000,
        reward_threshold=250000.0,
    )
except error.Error:
    pass


ModuleNotFoundError: No module named 'gym.envs.atari'