In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import gym

import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

env_name = "LunarLander-v2"

# Single seed shared by every random number generator this notebook relies on,
# so that "Restart Kernel -> Run All" gives comparable training runs.
SEED = 42

env = gym.make(env_name)

# Seed each source of randomness explicitly:
#   random.seed            -> replay-memory sampling and the epsilon-greedy draw
#   torch.manual_seed      -> network weight initialisation
#   env.action_space.seed  -> env.action_space.sample() (random actions)
#   env.reset(seed=...)    -> the environment's own generator; later calls to
#                             env.reset() without a seed keep using it
random.seed(SEED)
torch.manual_seed(SEED)
env.action_space.seed(SEED)
env.reset(seed=SEED)

# Reported for information. NOTE(review): nothing visible in this notebook
# moves the networks or tensors onto `device`, so training runs on the
# default (CPU) device regardless of what is printed here.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

LEARNING_RATE = 5e-4  # step size handed to the optimizer
GAMMA = 0.99          # discount factor applied to the bootstrapped next-state value

In [None]:
class NeuralNetwork(nn.Module):
    """Three-layer fully connected network mapping an observation to one value per action."""

    def __init__(self, env):
        """Size the layers from ``env``.

        The input width is ``env.observation_space.shape[0]`` and the output
        width is ``env.action_space.n``; both hidden layers have 64 units.
        """
        super().__init__()

        n_inputs = env.observation_space.shape[0]
        n_actions = env.action_space.n
        n_hidden = 64

        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.linear1 = nn.Linear(n_inputs, n_hidden)
        self.linear2 = nn.Linear(n_hidden, n_hidden)
        self.linear3 = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return the raw (un-activated) output of the last layer for input ``x``."""
        first_hidden = F.relu(self.linear1(x))
        second_hidden = F.relu(self.linear2(first_hidden))
        return self.linear3(second_hidden)

    def predict(self, x):
        """Return, as a Python int, the index of the largest entry of ``x``.

        Note that ``x`` is the tensor of network outputs, not an observation:
        this method does not run the network itself.
        """
        best_index = torch.argmax(x)
        return best_index.detach().item()

# Build a throwaway network instance and print it to inspect the layer layout.
print(NeuralNetwork(env))

In [None]:
from collections import deque
import numpy as np
BATCH_SIZE = 32  # default number of transitions drawn per call to retrieve_transitions


class Agent():
    """Replay memory plus epsilon-greedy action selection."""

    def __init__(self, N):
        """Create an empty replay memory holding at most ``N`` transitions.

        The memory is a ``deque(maxlen=N)``, so once it is full each new
        transition pushes out the oldest one.
        """
        self.D = deque(maxlen=N)

    def store_transition(self, state, action, reward, done, new_state):
        """Append one ``(state, action, reward, done, new_state)`` tuple to memory.

        ``done`` is stored alongside the rest so the training step can tell
        whether the transition ended the episode when it builds its target.
        Returns ``None``.
        """
        self.D.append((state, action, reward, done, new_state))

    def retrieve_transitions(self, batch_size=None):
        """Draw a random batch of stored transitions and return it as tensors.

        Args:
            batch_size: number of transitions to sample without replacement.
                ``None`` (the default) uses the module-level ``BATCH_SIZE``,
                looked up at call time.

        Returns:
            ``(states, actions, rewards, dones, new_states)`` where
            ``states``/``new_states`` are float32 tensors with one row per
            sampled transition, ``actions`` is an int64 column tensor and
            ``rewards``/``dones`` are float32 column tensors.

        Raises:
            ValueError: if the memory holds fewer than ``batch_size``
                transitions (raised by ``random.sample``).
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        transitions = random.sample(self.D, batch_size)

        def column(index):
            # Collect field `index` of every sampled tuple into one array.
            return np.array([transition[index] for transition in transitions])

        states_t = torch.as_tensor(column(0), dtype=torch.float32)
        # unsqueeze(-1) turns the per-transition scalars into column tensors
        actions_t = torch.as_tensor(column(1), dtype=torch.int64).unsqueeze(-1)
        rewards_t = torch.as_tensor(column(2), dtype=torch.float32).unsqueeze(-1)
        dones_t = torch.as_tensor(column(3), dtype=torch.float32).unsqueeze(-1)
        new_states_t = torch.as_tensor(column(4), dtype=torch.float32)

        return states_t, actions_t, rewards_t, dones_t, new_states_t

    def epsilon_greedy_action(self, epsilon, greedy_action, action_space):
        """Return ``greedy_action``, or a random action with probability ``epsilon``.

        A single uniform draw from ``[0, 1)`` is compared with ``epsilon``:
        strictly above it the greedy action is kept, otherwise
        ``action_space.sample()`` is returned.
        """
        if random.random() > epsilon:
            return greedy_action
        return action_space.sample()

In [None]:
online_network = NeuralNetwork(env)
target_network = NeuralNetwork(env)
# Two separately constructed networks get different random weights. Copy the
# online weights across so the very first training targets are computed from
# the same parameters that are being trained.
target_network.load_state_dict(online_network.state_dict())

# Only the online network is optimised; the target network is refreshed by
# copying weights, never by gradient steps.
optimizer = torch.optim.RMSprop(params=online_network.parameters(), lr=LEARNING_RATE)
# Smooth L1 loss between the predicted values and the training targets.
criterion = nn.SmoothL1Loss()

In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer; the training loop logs per-episode scalars through it.
writer = SummaryWriter()

# Training hyper-parameters. The single letters are the symbols used in the
# original comments (presumably those of the DQN algorithm listing).
MEMORY_CAPACITY = 10_000   # N: maximum number of transitions kept in replay memory
EPISODES = 500             # M: number of training episodes
FRAMES = 1_000             # T: upper bound on environment steps per episode
UPDATE_FREQUENCY = 1_000   # C: environment steps between target-network refreshes


In [None]:
agent = Agent(MEMORY_CAPACITY)

# Fill the replay memory completely with transitions from a uniformly random
# policy, so the training loop can sample a batch from its very first step.
state, _ = env.reset()
for frame in range(MEMORY_CAPACITY):
    action = env.action_space.sample()

    new_state, reward, done, trunc, _ = env.step(action)

    # Only `done` (true termination) is stored: it is what later switches off
    # bootstrapping in the training target. Truncation is handled below.
    agent.store_transition(state, action, reward, done, new_state)

    # An episode also ends when it is truncated; stepping a finished
    # environment without a reset is not valid, so reset in both cases.
    if done or trunc:
        state, _ = env.reset()
    else:
        state = new_state

In [None]:
# Exploration schedule: epsilon starts at 1.0 and is multiplied by
# EPSILON_DECAY on every environment step until it reaches EPSILON_MIN.
EPSILON_MIN = 0.1
EPSILON_DECAY = 0.99995
WEIGHTS_PATH = 'model_weights.pt'  # file read back by the evaluation cell

epsilon = 1.0
steps = 0  # environment steps taken across all episodes

recent_rewards = deque(maxlen=10)  # window used for the printed running mean
train_rewards = []                 # full per-episode history, used for plotting

for episode in range(EPISODES):
    state, _ = env.reset()

    episode_reward = 0
    episode_loss = []

    for frame in range(FRAMES):
        epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)

        # --- interact with the environment --------------------------------
        state_t = torch.as_tensor(np.array(state), dtype=torch.float32)
        # Action selection needs no gradients; skip building a graph for it.
        with torch.no_grad():
            q_values = online_network(state_t)
        greedy_action = online_network.predict(q_values)

        action = agent.epsilon_greedy_action(epsilon, greedy_action, env.action_space)

        new_state, reward, done, trunc, _ = env.step(action)
        episode_reward += reward
        agent.store_transition(state, action, reward, done, new_state)

        state = new_state

        # --- update the online network ------------------------------------
        states, actions, rewards, dones, new_states = agent.retrieve_transitions()

        # The target is a constant with respect to the optimisation: compute
        # it without gradient tracking so nothing is back-propagated into (or
        # accumulated on) the target network.
        with torch.no_grad():
            max_next_Q = target_network(new_states).max(dim=1, keepdim=True)[0]
            # (1 - dones) removes the bootstrapped term for terminal transitions
            Y = rewards + GAMMA * (1 - dones) * max_next_Q

        # Value predicted by the online network for the action actually taken
        current_Q = online_network(states).gather(dim=1, index=actions)

        # Loss modules take (input, target) in that order.
        loss = criterion(current_Q, Y)
        episode_loss.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Refresh the target network every UPDATE_FREQUENCY steps
        if steps % UPDATE_FREQUENCY == 0:
            target_network.load_state_dict(online_network.state_dict())
        steps += 1

        # The episode is over on termination or truncation
        if done or trunc:
            break

    mean_episode_loss = np.mean(episode_loss)
    writer.add_scalar("Reward/train", episode_reward, episode)
    writer.add_scalar("Loss/train", mean_episode_loss, episode)

    recent_rewards.append(episode_reward)
    train_rewards.append(episode_reward)

    if episode % 10 == 0:
        print(f"Episode {episode}:")
        print(f"\tReward:\t{np.mean(recent_rewards)}")
        print(f"\tLoss:\t{mean_episode_loss}")
        print(f"\tEpsilon:\t{epsilon}")

# Make sure every logged scalar reaches disk.
writer.flush()

# Persist the trained weights: the evaluation cell loads this file, and
# nothing else in the notebook writes it. This overwrites an existing file.
torch.save(online_network.state_dict(), WEIGHTS_PATH)

In [None]:
# Episode reward at which the plot switches from the "failed" to the "solved" band.
SOLVED_THRESHOLD = 200

fig, ax = plt.subplots()

# Raw per-episode rewards plus a Gaussian-smoothed trend line
ax.plot(train_rewards, alpha=0.3, label='raw rewards')
ax.plot(gaussian_filter1d(train_rewards, sigma=5), linewidth=2, label='smooth reward curve')

# Shade the regions below and above the threshold. The band limits are clamped
# at the threshold: without that, a run whose best reward is still below it
# would get a "solved" band drawn *under* the threshold (and likewise for the
# "failed" band when every reward is above it).
band_bottom = min(min(train_rewards), SOLVED_THRESHOLD)
band_top = max(max(train_rewards), SOLVED_THRESHOLD)
ax.axhspan(band_bottom, SOLVED_THRESHOLD, color='red', alpha=0.2, label='failed')
ax.axhspan(SOLVED_THRESHOLD, band_top, color='green', alpha=0.2, label='solved')

ax.set_xlabel('Episode')
ax.set_ylabel('Episode reward')
ax.set_title('Rewards')
ax.legend()

# plt.show() renders with notebook (inline) backends as well as GUI ones,
# whereas Figure.show() requires a GUI backend.
plt.show()

In [None]:
# Number of rendered episodes to play before the cell finishes.
EVAL_EPISODES = 5

# NOTE(review): this cell rebinds the global names `env` and `online_network`.
# The optimizer created earlier still references the parameters of the
# *previous* `online_network`, so re-run the setup cells before training again.
env = gym.make(env_name, render_mode="human")

online_network = NeuralNetwork(env)
# torch.load unpickles the file: only load weight files from a trusted source.
online_network.load_state_dict(torch.load('model_weights.pt'))
online_network.eval()

try:
    for eval_episode in range(EVAL_EPISODES):
        state, _ = env.reset()
        rew = 0
        done = trunc = False

        # Play greedily until the episode terminates or is truncated.
        while not (done or trunc):
            state_t = torch.as_tensor(np.array(state), dtype=torch.float32)
            with torch.no_grad():  # inference only
                q_values = online_network(state_t)
            action = online_network.predict(q_values)

            # With render_mode="human" the environment draws each frame during
            # step(), so no explicit env.render() call is needed.
            state, reward, done, trunc, _ = env.step(action)
            rew += reward

        print(rew)
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()