<a href="https://colab.research.google.com/github/ShokuninSan/deep-q-learning-from-paper-to-code/blob/master/09_frozen_lake_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch as T

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

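# FrozenLake using the naive deep Q-learning algorithm

> **Note:** the training cells below currently create the `CartPole-v0` environment; `FrozenLake-v0` only appears as a commented-out alternative.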

## The `DQN`

In [None]:
class DQN(nn.Module):
    """Two-layer fully connected network bundled with its training utilities.

    Architecture: ``Linear(n_input, n_hidden) -> ReLU -> Linear(n_hidden, n_classes)``.
    The loss, the optimizer and the output activation are stored as attributes;
    ``forward`` returns the raw output of the second layer, so callers apply
    ``self.activation`` themselves where they need it.

    Args:
        n_input: size of the input vector.
        n_hidden: number of units in the hidden layer.
        n_classes: number of output units.
        loss: loss callable, stored as ``self.loss``.
        optimizer: optimizer class, instantiated with this module's parameters and ``lr``.
        activation: callable stored as ``self.activation`` (identity by default).
        lr: learning rate handed to the optimizer.
    """

    def __init__(self, n_input, n_hidden, n_classes,
                 loss=nn.MSELoss(),
                 optimizer=optim.Adam,
                 activation=lambda x: x,
                 lr=0.001):
        super().__init__()

        # Layers first, so that their parameters exist when the optimizer is built.
        self.fc1 = nn.Linear(n_input, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_classes)

        self.loss = loss
        self.activation = activation
        self.optimizer = optimizer(self.parameters(), lr=lr)

        # Prefer the GPU when one is available and move the module there.
        self.device = T.device('cuda') if T.cuda.is_available() else T.device('cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) network output for ``state``."""
        hidden = F.relu(self.fc1(state))
        return self.fc2(hidden)

### Test the DQN and approximate a binary AND operator

In [None]:
# Small network for the AND sanity check: 2 inputs, 4 hidden units, 1 output.
# The sigmoid stored as `activation` squashes the raw output into (0, 1),
# which is the input range BCELoss expects.
net = DQN(
    n_input=2,
    n_hidden=4,
    n_classes=1,
    loss=nn.BCELoss(),
    optimizer=optim.Adam,
    activation=T.sigmoid,
    lr=0.1,
)

In [None]:
# Truth table of the binary AND operator, repeated 10 times (40 samples in total).
# Row i of X is the input pair whose expected output is Y[i].
X = np.tile([[1, 1], [0, 1], [0, 0], [1, 0]], (10, 1))

Y = 10 * [[1], [0], [0], [0]]

In [None]:
# Single pass of per-sample gradient descent over the repeated AND truth table.
for sample, target in zip(X, Y):
    inputs = T.tensor(sample, dtype=T.float).to(net.device)
    labels = T.tensor(target, dtype=T.float).to(net.device)

    # Clear the gradients left over from the previous sample before the new backward pass.
    net.optimizer.zero_grad()

    # `forward` yields the raw output; the stored activation turns it into the prediction.
    prediction = net.activation(net.forward(inputs))
    loss = net.loss(prediction, labels)

    loss.backward()
    net.optimizer.step()

In [None]:
# 1 AND 1 -> expect a value close to 1.
# The input is moved to the network's device; without this the cell fails on a CUDA runtime.
T.sigmoid(net.forward(T.tensor([1, 1], dtype=T.float).to(net.device))).item()

In [None]:
# 0 AND 0 -> expect a value close to 0.
# The input is moved to the network's device; without this the cell fails on a CUDA runtime.
T.sigmoid(net.forward(T.tensor([0, 0], dtype=T.float).to(net.device))).item()

In [None]:
# 0 AND 1 -> expect a value close to 0.
# The input is moved to the network's device; without this the cell fails on a CUDA runtime.
T.sigmoid(net.forward(T.tensor([0, 1], dtype=T.float).to(net.device))).item()

In [None]:
# 1 AND 0 -> expect a value close to 0.
# The input is moved to the network's device; without this the cell fails on a CUDA runtime.
T.sigmoid(net.forward(T.tensor([1, 0], dtype=T.float).to(net.device))).item()

## The `Agent`

Reuse the `Agent` implementation from the Q-learning example, replacing the dict-based Q-table with a `DQN`.

In [None]:
class Agent:
    """Epsilon-greedy Q-learning agent whose action values are approximated by a `DQN`.

    The agent learns online from single transitions (no replay memory, no
    target network): after each step the value predicted for the action that
    was taken is regressed towards ``reward + gamma * max_a' Q(new_state, a')``.

    Args:
        n_observations: size of the state vector fed to the Q-network.
        n_actions: number of discrete actions.
        gamma: discount factor applied to the bootstrapped next-state value.
        n_hidden: number of hidden units of the Q-network.
        lr: learning rate of the Q-network's optimizer.
    """

    def __init__(self, n_observations, n_actions, gamma=0.99, n_hidden=128, lr=0.001):
        self.n_actions = n_actions
        self.n_observations = n_observations
        self.gamma = gamma
        self.Q = DQN(n_observations, n_hidden, n_actions, lr=lr)

    def _q_values(self, state):
        """Return the action values for one state (a 1-D vector of observations)."""
        # Inputs must live on the same device as the network's parameters.
        state = T.tensor(state, dtype=T.float).to(self.Q.device)
        return self.Q.activation(self.Q.forward(state))

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: current observation vector.
            epsilon: probability of choosing a uniformly random action.

        Returns:
            The index of the chosen action as an int.
        """
        if np.random.rand() < epsilon:
            # Explore: uniformly random action.
            return np.random.randint(self.n_actions)

        # Exploit: greedy action; no gradients are needed for action selection.
        with T.no_grad():
            return T.argmax(self._q_values(state)).item()

    def learn(self, state, action, reward, new_state, done=False):
        """Perform one gradient step on a single transition.

        Args:
            state: observation the action was taken in.
            action: index of the action that was taken.
            reward: reward received for the transition.
            new_state: observation reached after the action.
            done: whether ``new_state`` is terminal; if so the target is the
                reward alone, without a bootstrapped next-state value.
        """
        self.Q.optimizer.zero_grad()

        # Only the value of the action actually taken receives a learning signal.
        predicted_value = self._q_values(state)[action]

        # The TD target is treated as a constant, so it is built without gradient tracking.
        with T.no_grad():
            target_value = T.tensor(float(reward), dtype=T.float).to(self.Q.device)
            if not done:
                target_value = target_value + self.gamma * self._q_values(new_state).max()

        loss = self.Q.loss(predicted_value, target_value)
        loss.backward()

        self.Q.optimizer.step()

In [None]:
# Training configuration.
N_EPISODES = 10_000  # number of episodes played by the training loop
ALPHA = 0.001  # NOTE(review): presumably the learning rate (same value as the `DQN` default `lr`) - confirm
GAMMA = 0.99  # NOTE(review): presumably the discount factor (same value as the `Agent` default `gamma`) - confirm

# Epsilon starts at EPSILON_START, is lowered by EPSILON_START / EPSILON_END_TARGET
# after every episode and never drops below EPSILON_END.
EPSILON_START, EPSILON_END = 1.0, 0.01
EPSILON_END_TARGET = 2500

In [None]:
# Per-episode returns and their periodic 100-episode averages; both are filled by the training loop.
rewards, avg_100_rewards = [], []

# The environment in use is CartPole; the FrozenLake alternative would be gym.make('FrozenLake-v0').
env = gym.make('CartPole-v0')

In [None]:
# Display the shape of the observation space and the number of discrete actions.
env.observation_space.shape, env.action_space.n

In [None]:
# Size the agent from the environment and hand it the configured discount factor,
# so that changing GAMMA in the configuration cell actually takes effect.
agent = Agent(
    n_observations=env.observation_space.shape[0],
    n_actions=env.action_space.n,
    gamma=GAMMA,
)

# Exploration rate; lowered once per episode by the training loop.
epsilon = EPSILON_START

In [None]:
for episode in range(N_EPISODES):

    state = env.reset()
    episode_reward = 0

    # Play one episode, learning online from every transition.
    is_done = False
    while not is_done:

        action = agent.select_action(state, epsilon)

        new_state, reward, is_done, _ = env.step(action)

        agent.learn(state, action, reward, new_state)

        episode_reward += reward

        # Advance to the observation just reached; otherwise every action and
        # every update of the episode would be based on the reset observation.
        state = new_state

    rewards.append(episode_reward)

    # Linear epsilon decay with a floor at EPSILON_END.
    epsilon = max(epsilon - EPSILON_START / EPSILON_END_TARGET, EPSILON_END)

    # Every 100 episodes record the mean return of (up to) the last 100 episodes.
    if episode % 100 == 0:
        avg_reward = np.mean(rewards[-100:])
        avg_100_rewards.append(avg_reward)

    if episode % 1000 == 0:
        print(f'Average reward over last episodes was {avg_100_rewards[-1]}, '
              f'epsilon: {epsilon}')


In [None]:
# `avg_100_rewards` holds one value per 100 episodes (recorded at episodes 0, 100, 200, ...),
# so the x-axis is rescaled to episode numbers and the figure is labelled to stand on its own.
fig, ax = plt.subplots()
ax.plot(np.arange(len(avg_100_rewards)) * 100, avg_100_rewards)
ax.set(
    title='Average reward during training',
    xlabel='Episode',
    ylabel='Mean reward of the last 100 episodes',
);

In [None]:
# Display the agent's Q-network (its module repr lists the layers).
agent.Q