# Addition of a Neural Network


In the previous notebook, we saw that we need to keep track of the
average while weighing probabilities and expected rewards. A neural network
can therefore be a valuable asset in RL, as neural networks stand out at
characterizing a dataset and learning a model / feature set.


In [1]:
# Bandit class copied from the book "Deep Reinforcement Learning in Action"

class ContextBandit:
    """Contextual bandit environment with as many states as arms.

    Every (state, arm) pair has its own success probability, stored in
    ``bandit_matrix``. Pulling an arm yields a reward based on the current
    state and then moves the environment to a freshly drawn state.
    """

    def __init__(self, arms=10):
        # Number of arms; the same number is used for the number of states.
        self.arms = arms
        # Build the (state x arm) probability table.
        self.init_distribution(arms)
        # Draw the starting state.
        self.update_state()

    def init_distribution(self, arms):
        """Create the square probability table ``bandit_matrix``.

        The number of states equals the number of arms to keep things
        simple. Rows index states, columns index arms, and every entry is
        drawn by ``np.random.rand``.
        """
        self.bandit_matrix = np.random.rand(arms, arms)

    def reward(self, prob):
        """Return how many of ``self.arms`` random draws fall below ``prob``.

        One ``random.random()`` draw is made per arm; each draw that is
        strictly less than ``prob`` adds 1 to the returned integer.
        """
        return sum(1 for _ in range(self.arms) if random.random() < prob)

    def get_state(self):
        """Return the index of the current state."""
        return self.state

    def update_state(self):
        """Replace the current state with a random index in ``[0, arms)``."""
        self.state = np.random.randint(low=0, high=self.arms)

    def get_reward(self, arm):
        """Sample a reward for ``arm`` using the current state's probabilities."""
        state_probs = self.bandit_matrix[self.get_state()]
        return self.reward(state_probs[arm])

    def choose_arm(self, arm):
        """Pull ``arm``: sample its reward, advance the state, return the reward."""
        payout = self.get_reward(arm)
        # The state only changes after the reward for the old state is drawn.
        self.update_state()
        return payout


In [31]:
# Import libraries 

import torch 
import torch.nn as nn
from torch.nn import functional as F
from collections import OrderedDict
import numpy as np
from torch.autograd import Variable
from matplotlib import pyplot as plt
import random
%matplotlib inline

In [5]:
def softmax_fn(av, tau = 1.12):
    """Turn expected average rewards into softmax probabilities.

    Arguments:
      - av: expected averages; any array-like of numbers (list, tuple or
        ndarray).
      - tau: temperature; must be non-zero. A high value flattens the
        distribution (probabilities move towards uniform), a low positive
        value exaggerates the differences between the averages.
    Output:
      - ndarray of softmaxed values with the same shape as ``av``; the
        values sum to 1. An empty input yields an empty array.
    """
    scaled = np.asarray(av) / tau
    if scaled.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return scaled

    # Subtracting the maximum leaves the softmax unchanged mathematically but
    # keeps np.exp from overflowing to inf (which would produce nan output).
    exps = np.exp(scaled - np.max(scaled))
    return exps / np.sum(exps)

In [43]:
# Set params

# Seed every random number generator the notebook draws from, so that a
# fresh "Restart & Run All" reproduces the same environment and results.
SEED = 42
random.seed(SEED)        # drives the draws in ContextBandit.reward
np.random.seed(SEED)     # drives the bandit matrix and the state draws
torch.manual_seed(SEED)  # drives torch's random number generator

bandits = 10                   # number of arms (and of states) in the environment
env = ContextBandit(bandits)   # contextual bandit environment
n_in = bandits                 # network input size
n_hidden = 100                 # width of the hidden layer
n_out = bandits                # network output size
learning_rate = 0.1            # step size passed to the optimizer

In [40]:
# Neural network model definition
# Fully connected network: n_in -> n_hidden (ReLU) -> n_out.
# The layers are registered one by one; names and order define the
# state_dict keys ('fc1', 'ReLu1', 'fc2').
model = nn.Sequential()
model.add_module('fc1', nn.Linear(n_in, n_hidden))
model.add_module('ReLu1', nn.ReLU(inplace = True))
model.add_module('fc2', nn.Linear(n_hidden, n_out))


In [44]:
# Loss function and optimizer for training.
# reduction='sum' adds up the squared errors instead of averaging them; it
# replaces the deprecated ``size_average=False`` argument.
loss = nn.MSELoss(reduction='sum')
# NOTE: despite its name, ``criterion`` holds the Adam optimizer, not the loss.
criterion = torch.optim.Adam(model.parameters(), lr=learning_rate)