In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import random
from IPython.display import clear_output
from collections import deque
from tqdm.notebook import tqdm
from collections import deque

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras.backend as K

tf.compat.v1.disable_eager_execution()


## Noisy Networks

### Main Ideas:

1. **Enhancing Exploration with Parameter Noise**:
   - Traditional exploration strategies in reinforcement learning, such as ε-greedy, involve adding randomness to action selection.
   - **Noisy Networks (Noisy Nets)** introduce stochasticity directly into the network parameters, enabling more sophisticated and efficient exploration by incorporating parameter noise into the action-value function.

2. **Parameter-Based Exploration**:
   - Instead of perturbing actions or rewards, Noisy Nets add noise to the network weights, allowing the agent to explore by varying its policy in a more structured manner.
   - This approach can lead to more consistent exploration patterns and improved sample efficiency.

3. **Learnable Noise Parameters**:
   - Noisy Nets incorporate learnable parameters for noise, allowing the network to adjust the level of stochasticity based on the learning process.
   - This adaptability enables the agent to reduce exploration as it becomes more confident in its policy, leading to a natural annealing of exploration over time.

4. **Integration with Other Rainbow Components**:
   - **Noisy Networks** synergize with other Rainbow DQN components, such as **Dueling Networks**, **Prioritized Experience Replay**, and **Double DQN**, to enhance overall learning efficiency and performance.
   - When combined, these techniques provide a more comprehensive approach to tackling complex reinforcement learning tasks.

### Structure of Noisy Networks:

- **Noisy Linear Layers**:
  - Traditional linear layers in neural networks are augmented with noise, typically using factorized Gaussian noise.
  - The output of a noisy linear layer is the sum of a deterministic part and a noisy part:
    $$
    y = (b + W x) + (b_{\text{noisy}} + W_{\text{noisy}} x)
    $$
    where:
    - $ b $ and $ W $ are the deterministic bias and weight parameters.
    - $ b_{\text{noisy}} $ and $ W_{\text{noisy}} $ are the noisy bias and weight parameters.
    - The noisy parameters are defined as:
      $$
      W_{\text{noisy}} = \sigma_W \odot \epsilon_{\text{out}} \epsilon_{\text{in}}^T
      $$
      $$
      b_{\text{noisy}} = \sigma_b \odot \epsilon_{\text{out}}
      $$
      where:
      - $ \sigma_W $ and $ \sigma_b $ are learnable standard deviation parameters.
      - $ \epsilon_{\text{in}} $ and $ \epsilon_{\text{out}} $ are noise vectors sampled from a standard normal distribution.
      - $ \odot $ denotes element-wise multiplication.
      - $ \epsilon_{\text{out}} \epsilon_{\text{in}}^T $ represents the outer product, ensuring that noise is factorized across input and output dimensions.

- **Factorized Gaussian Noise**:
  - To efficiently generate noise for all weights, Noisy Nets use factorized Gaussian noise.
  - This involves generating separate noise vectors for the input $ \epsilon_{\text{in}} $ and output $ \epsilon_{\text{out}} $ dimensions and combining them to produce the noise for each weight.
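  - Each sampled noise value is passed through a function $ f $ (typically $ f(x) = \text{sign}(x) \sqrt{|x|} $) before the outer product is formed, i.e. $ \epsilon_{\text{in}} $ and $ \epsilon_{\text{out}} $ stand for $ f(\cdot) $ applied to standard-normal samples. This keeps the noise zero-mean and symmetric while compressing large samples; note that the transformed noise does not have exactly unit variance.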

- **Learnable Noise Parameters**:
  - The mean parameters $ W $ and $ b $ (often written $ \mu_W $ and $ \mu_b $) and the noise scales $ \sigma_W $ and $ \sigma_b $ are all learned by gradient descent alongside the rest of the network parameters.
  - This allows the network to adapt the level of noise based on the learning dynamics, facilitating controlled exploration.

### Why Noisy Networks?

1. **Efficient and Adaptive Exploration**:
   - Noisy Nets provide a more efficient exploration strategy by injecting noise directly into the network parameters, allowing for richer and more consistent exploration patterns compared to simple action perturbations.

2. **Reduced Need for External Exploration Strategies**:
   - By incorporating noise into the network itself, the reliance on external exploration mechanisms like ε-greedy is reduced, simplifying the overall architecture.

3. **Learnable Exploration Rate**:
   - The exploration level is not fixed but learned, allowing the agent to automatically adjust the degree of exploration based on its confidence in the policy, leading to a natural annealing process.

4. **Improved Sample Efficiency**:
   - By enabling more effective exploration, Noisy Nets can lead to faster learning and better performance with fewer training samples.

### Advantages of Noisy Networks

1. **Improved Exploration Efficiency**:
   - Provides a structured and efficient way to explore the action space, potentially discovering better policies more quickly than random action perturbations.

2. **Adaptive Noise Levels**:
   - The network learns to adjust the noise levels, reducing exploration as the agent becomes more certain about the optimal policy, which can enhance convergence rates.

3. **Simplified Exploration Mechanism**:
   - Eliminates the need for manually tuning exploration parameters like ε in ε-greedy strategies, as the noise levels are learned automatically.

4. **Enhanced Performance in Complex Environments**:
   - Particularly beneficial in environments with large or continuous action spaces, where traditional exploration methods may struggle.

In [3]:
class NoisyDense(tf.keras.layers.Layer):
    """Fully connected layer with factorised Gaussian parameter noise.

    The layer computes ``activation(inputs @ w + b)`` where

        w = w_mu + w_sigma * (f(eps_in) f(eps_out))
        b = b_mu + b_sigma * f(eps_out)

    ``eps_in`` / ``eps_out`` are fresh standard-normal samples drawn every
    time the layer's ops are evaluated, and ``f(x) = sign(x) * sqrt(|x|)``.

    Args:
        units: Number of output features.
        activation: Anything accepted by ``tf.keras.activations.get``
            (``None`` means linear).
        trainable: Whether the four weight tensors are trainable.
        sigma_0: Numerator of the constant used to initialise the sigma
            weights (``sigma_0 / sqrt(input_dim)``).
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``,
            ``dtype``).
    """

    def __init__(self,
                 units,
                 activation=None,
                 trainable=True,
                 sigma_0=0.5,
                 **kwargs):
        # Let the base class own the `trainable` flag instead of assigning it
        # afterwards; extra kwargs allow naming the layer / from_config().
        super(NoisyDense, self).__init__(trainable=trainable, **kwargs)
        self.units = int(units)
        self.activation = tf.keras.activations.get(activation)
        self.sigma_0 = sigma_0

    def build(self, input_shape):
        """Create the mean and sigma weights for the kernel and the bias."""
        input_dim = input_shape[-1]
        if input_dim is None:
            raise ValueError(
                "The last dimension of the inputs to NoisyDense must be defined."
            )
        input_dim = int(input_dim)

        mu_bound = 1. / np.sqrt(input_dim)
        sigma_init = self.sigma_0 / np.sqrt(input_dim)

        self.w_mu = self.add_weight(
            name="w_mu",
            shape=(input_dim, self.units),
            initializer=tf.keras.initializers.RandomUniform(
                minval=-mu_bound, maxval=mu_bound
            ),
            trainable=self.trainable
        )

        self.w_sigma = self.add_weight(
            name="w_sigma",
            shape=(input_dim, self.units),
            initializer=tf.keras.initializers.Constant(sigma_init),
            trainable=self.trainable
        )

        self.b_mu = self.add_weight(
            name="b_mu",
            shape=(self.units,),
            initializer=tf.keras.initializers.RandomUniform(
                minval=-mu_bound, maxval=mu_bound
            ),
            trainable=self.trainable
        )

        # Previously hard-coded to trainable=True, which ignored the layer's
        # `trainable` flag for this one weight only.
        self.b_sigma = self.add_weight(
            name="b_sigma",
            shape=(self.units,),
            initializer=tf.keras.initializers.Constant(sigma_init),
            trainable=self.trainable
        )

        super(NoisyDense, self).build(input_shape)

    def call(self, inputs):
        """Apply the layer with freshly sampled factorised noise."""
        # Factorised noise: one vector per input unit, one per output unit.
        epsilon_in = self.f(tf.random.normal(shape=(self.w_mu.shape[0], 1)))
        epsilon_out = self.f(tf.random.normal(shape=(1, self.w_mu.shape[1])))

        # Outer product -> (input_dim, units) noise matrix for the kernel.
        w_epsilon = tf.matmul(epsilon_in, epsilon_out)
        # Shape (1, units); broadcasts against the (units,) bias weights.
        b_epsilon = epsilon_out

        w = self.w_mu + self.w_sigma * w_epsilon
        b = self.b_mu + self.b_sigma * b_epsilon

        output = tf.matmul(inputs, w) + b
        if self.activation is not None:
            output = self.activation(output)
        return output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(NoisyDense, self).get_config()
        config.update({
            "units": self.units,
            "activation": tf.keras.activations.serialize(self.activation),
            "sigma_0": self.sigma_0,
        })
        return config

    @staticmethod
    def f(x):
        """Noise-shaping function ``sign(x) * sqrt(|x|)``.

        The small constant inside the square root keeps the argument
        strictly positive.
        """
        return tf.sign(x) * tf.sqrt(tf.abs(x) + 1e-10)

In [None]:
class Noisy_Agent():
    """DQN agent whose Q-network is built from ``NoisyDense`` layers.

    Holds an online network, a target network and a uniform replay buffer.

    Args:
        observation_space: Shape tuple of a single observation; used as the
            network's input shape.
        action_space: Number of discrete actions (size of the network output).
        gamma: Discount factor used in the bootstrap target.
        lr: Learning rate of the Adam optimizer.
        maxlen: Maximum number of transitions kept in the replay buffer.
    """

    def __init__(
            self,
            observation_space,
            action_space,
            gamma = 0.99,
            lr = 0.001,
            maxlen = 50000,):

        self.observation_space = observation_space
        self.action_space = action_space
        self.gamma = gamma
        self.lr = lr
        self.buffer = deque(maxlen=maxlen)

        self.model = self.build_model(name = "model")
        self.target_model = self.build_model(name = "target")
        # Start with both networks holding identical weights.
        self.update_target()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        """Sample ``batch_size`` transitions uniformly without replacement.

        Returns:
            ``(S, A, R, S_, T)`` as numpy arrays (states, actions, rewards,
            next states, done flags), or ``None`` when the buffer holds fewer
            than ``batch_size`` transitions.
        """
        if len(self.buffer) < batch_size:
            return None

        batch = random.sample(self.buffer, k=batch_size)

        # Unzip the list of transitions into one array per field.
        S, A, R, S_, T = (np.array(column) for column in zip(*batch))

        return S, A, R, S_, T

    def build_model(self, name):
        """Build and compile a Q-network of four ``NoisyDense`` layers."""
        model = keras.Sequential(name=name)
        model.add(keras.layers.InputLayer(input_shape=self.observation_space))
        model.add(NoisyDense(128, activation='relu'))
        model.add(NoisyDense(128, activation='relu'))
        model.add(NoisyDense(128, activation='relu'))
        model.add(NoisyDense(self.action_space, activation=None))
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=self.lr),
            loss='mse'
        )
        return model

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def predict(self, observation, target = False):
        """Return the Q-values of a single observation.

        Args:
            observation: One observation (without batch dimension).
            target: If true, query the target network instead of the online
                network.
        """
        network = self.target_model if target else self.model
        return network.predict(np.array([observation]), verbose=False)[0]

    def predict_action(self, observation, epsilon=0.05):
        """Pick an action epsilon-greedily with respect to the online network.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the argmax of the predicted Q-values.
        """
        if np.random.random() < epsilon:
            return np.random.randint(self.action_space)
        return np.argmax(self.predict(observation))

    def train(self, batch_size):
        """Run one gradient step on a sampled minibatch.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions.
        """
        batch = self.get_batch(batch_size)
        if batch is None:
            return

        S, A, R, S_, T = batch

        Q_values = self.model.predict(S, verbose=False)
        Q_next = self.target_model.predict(S_, verbose=False)
        max_Q_next = np.max(Q_next, axis = 1)

        # Terminal transitions must not bootstrap from the next state.
        not_done = 1.0 - T.astype(np.float32)

        # Only the taken action's entry is replaced by the TD target; the
        # other entries keep the predicted values.
        # NOTE(review): this predict and the training pass below are separate
        # forward passes through NoisyDense, so they presumably use different
        # noise samples and the "unchanged" entries may still contribute a
        # non-zero error -- confirm this is acceptable.
        targets = Q_values.copy()
        targets[np.arange(batch_size), A] = R + self.gamma * max_Q_next * not_done

        # Exactly one update on the whole minibatch. `fit` without an explicit
        # batch_size would split anything larger than Keras' default of 32
        # into several gradient steps.
        self.model.train_on_batch(S, targets)


In [None]:
# --- Environment and agent -------------------------------------------------
env = gym.make("LunarLander-v2")
agent = Noisy_Agent(env.observation_space.shape, env.action_space.n)

# --- Training configuration ------------------------------------------------
episodes = 2000        # maximum number of training episodes
max_t = 500            # maximum number of environment steps per episode
e = 0.05               # epsilon passed to the epsilon-greedy action selection
frequency = 400        # environment steps between target-network updates
batch_size = 32        # minibatch size for every training step
avg_window = 50        # number of most recent episodes in the running average
solved_score = 200     # stop once the running average reaches this value

scores = []            # total reward of every finished episode
avgs = []              # running average of `scores` over `avg_window` episodes
time = 0               # global environment-step counter across all episodes

for episode in range(episodes):
    observation, _ = env.reset()

    score = 0
    for t in range(max_t):
        time += 1
        s = observation
        a = agent.predict_action(s, e)
        observation, reward, terminated, truncated, _ = env.step(a)
        score += reward
        s_ = observation
        # Only `terminated` is stored as the done flag: a time-limit
        # truncation is not a terminal state, so the target should still
        # bootstrap from the next state.
        agent.remember(s, a, reward, s_, terminated)
        agent.train(batch_size=batch_size)
        if time % frequency == 0:
            agent.update_target()
        # End the episode on truncation as well; the environment must be
        # reset before it is stepped again.
        if terminated or truncated:
            break

    scores.append(score)
    avgs.append(np.mean(scores[-avg_window:]))
    print(f"episode: {episode}, e: {e}, t: {t}, score: {score : .2f}, avg score: {avgs[-1]: .2f}")
    if avgs[-1] >= solved_score:
        break

  updates=self.state_updates,
  if not isinstance(terminated, (bool, np.bool8)):


episode: 0, e: 0.05, t: 83, score: -288.28, avg score: -288.28
episode: 1, e: 0.05, t: 55, score: -321.83, avg score: -305.05
episode: 2, e: 0.05, t: 90, score: -339.97, avg score: -316.69
episode: 3, e: 0.05, t: 75, score: -249.51, avg score: -299.90
episode: 4, e: 0.05, t: 78, score: -621.52, avg score: -364.22
episode: 5, e: 0.05, t: 50, score: -323.85, avg score: -357.49
episode: 6, e: 0.05, t: 65, score: -517.15, avg score: -380.30
episode: 7, e: 0.05, t: 86, score: -392.55, avg score: -381.83
episode: 8, e: 0.05, t: 134, score: -517.89, avg score: -396.95
episode: 9, e: 0.05, t: 97, score: -581.84, avg score: -415.44
episode: 10, e: 0.05, t: 68, score: -636.42, avg score: -435.53
episode: 11, e: 0.05, t: 55, score: -316.51, avg score: -425.61
episode: 12, e: 0.05, t: 96, score: -607.85, avg score: -439.63
episode: 13, e: 0.05, t: 92, score: -95.09, avg score: -415.02
episode: 14, e: 0.05, t: 62, score: -224.90, avg score: -402.35
episode: 15, e: 0.05, t: 131, score: -575.56, avg 