In [None]:
#https://towardsdatascience.com/reinforcement-learning-tutorial-part-3-basic-deep-q-learning-186164c3bf4

# Single experience = (old state, action, reward, new state)
#Training our model with a single experience:
1. Let the model estimate Q values of the old state
2. Let the model estimate Q values of the new state
3. Calculate the new target Q value for the action, using the known reward

4. Train the model with input = (old state), output = (target Q values)

Note: Our network doesn’t get (state, action) as input like the Q-learning function Q(s,a) does. 
      This is because we are not replicating Q-learning as a whole, just the Q-table. 
      The input is just the state and the output is Q-values for all possible actions (forward, backward) for that state.
    

In [None]:
from enums import *
import random
import tensorflow as tf
import numpy as np

class DeepGambler:
    def __init__(self, learning_rate=0.1, discount=0.95, 
                 exploration_rate=0.1, iterations=10000):
        """Build the Q-network and open a TensorFlow session for it.

        Args:
            learning_rate: Step size handed to the gradient-descent optimizer
                created in ``define_model``.
            discount: Weight applied to the best next-state Q-value when the
                training target is computed in ``train``.
            exploration_rate: Accepted for interface compatibility but not
                used; the agent always starts with an exploration rate of 1.0.
            iterations: ``exploration_delta`` is set to ``1.0 / iterations``.
        """
        self.learning_rate = learning_rate
        self.discount = discount
        # NOTE(review): the ``exploration_rate`` argument is ignored here and
        # exploration always starts at 1.0. Kept as-is so existing callers see
        # no behavior change -- confirm whether the argument should be honored.
        self.exploration_rate = 1.0
        self.exploration_delta = 1.0 / iterations

        # Five input neurons, one per game state (0-4), fed as a one-hot vector.
        self.input_count = 5
        # Two output neurons, one Q-value per action (forward and backward).
        self.output_count = 2

        # The session class is ``tf.Session`` (capital S); ``tf.session`` is
        # not a TensorFlow name and raised AttributeError.
        self.session = tf.Session()
        self.define_model()
        # Variables created by define_model() must be initialized before use.
        self.session.run(self.initializer)
        
    def define_model(self):
        """Create the Q-network graph plus its loss, train op and initializer.

        Uses the TensorFlow 1.x graph API (``tf.placeholder``, ``tf.layers``,
        ``tf.train``). Sets these attributes on ``self``:

        * ``model_input``   -- placeholder of shape ``[None, input_count]``
        * ``model_output``  -- dense output of shape ``[None, output_count]``
        * ``target_output`` -- placeholder for the target Q-values
        * ``optimizer``     -- op that runs one gradient-descent step
        * ``initializer``   -- op that initializes all graph variables
        """
        # Input is an array of 5 items (state one-hot).
        # It is 2-dimensional to allow for batched training data, although
        # this example always feeds a single row.
        self.model_input = tf.placeholder(dtype=tf.float32, shape=[None, self.input_count])

        # Two hidden layers of 16 sigmoid units whose kernels start at zero.
        # tf.zeros_initializer() adapts to each kernel's shape; the previous
        # hand-built zero arrays had a misspelled initializer name and, for
        # the second layer, a (16, output_count) array for a (16, 16) kernel.
        fc1 = tf.layers.dense(self.model_input, 16, activation=tf.sigmoid,
                              kernel_initializer=tf.zeros_initializer())
        fc2 = tf.layers.dense(fc1, 16, activation=tf.sigmoid,
                              kernel_initializer=tf.zeros_initializer())

        # Linear output layer: one Q-value per action.
        self.model_output = tf.layers.dense(fc2, self.output_count)

        # Target Q-values the output is regressed towards during training.
        self.target_output = tf.placeholder(shape=[None, self.output_count], dtype=tf.float32)

        # Was ``loss - ...`` (a subtraction on an undefined name) instead of
        # an assignment, so ``loss`` did not exist for ``minimize`` below.
        loss = tf.losses.mean_squared_error(self.target_output, self.model_output)

        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(loss)

        self.initializer = tf.global_variables_initializer()
        
    def get_Q(self, state):
        """Return the network's Q-value estimates for a single state.

        The state is one-hot encoded, fed through the model as a one-row
        batch, and the first (only) row of the result is returned.
        """
        encoded_state = self.to_one_hot(state)
        feed = {self.model_input: encoded_state}
        q_batch = self.session.run(self.model_output, feed_dict=feed)
        # Unwrap the single-row batch into a flat array of Q-values.
        return q_batch[0]
    
    # Turn state into 2d one-hot tensor
    # Example: 3 -> [[0,0,0,1,0]]    
    def to_one_hot(self, state):
        """Encode a state index as a one-row, one-hot 2-D array.

        The row width is ``self.input_count`` so the encoding always matches
        the width of the model's input placeholder.

        Args:
            state: Index of the state to mark with a 1.

        Returns:
            A float array of shape ``(1, self.input_count)`` that is all zeros
            except for a 1 at column ``state``.
        """
        one_hot = np.zeros((1, self.input_count))
        one_hot[0, state] = 1
        return one_hot
    
    def get_next_action(self, state):
        """Choose the next action with an epsilon-greedy rule.

        A uniform random draw is compared with ``self.exploration_rate``:
        when the draw is strictly greater the greedy action for ``state`` is
        returned, otherwise a random action is returned.
        """
        draw = random.random()
        exploit = draw > self.exploration_rate
        if not exploit:
            return self.random_action()
        return self.greedy_action(state)
        
    def greedy_action(self, state):
        """Return the index of the action with the highest Q-value for ``state``.

        Args:
            state: State whose Q-values are obtained through ``get_Q``.

        Returns:
            The position of the largest Q-value, as given by ``np.argmax``.
        """
        # argmax picks the higher Q-value and returns the index (FORWARD=0, BACKWARD=1)
        # The statement previously read ``retun``, which is a syntax error.
        return np.argmax(self.get_Q(state))
        

    def random_action(self):
        """Return one of the two actions, each with probability one half."""
        # NOTE(review): ``BACKWORD`` looks like a misspelling of ``BACKWARD``;
        # confirm which name the ``enums`` module actually exports.
        coin = random.random()
        if coin < 0.5:
            return FORWARD
        return BACKWORD
    
    def train(self, old_state, action, reward, new_state):
        """Run one optimizer step on a single experience.

        The target for the taken action is
        ``reward + discount * max(Q(new_state))``; the Q-values of the other
        actions keep the model's current estimates for ``old_state``.

        Args:
            old_state: State the action was taken from.
            action: Index of the action taken (indexes the Q-value array).
            reward: Reward received for the action.
            new_state: State reached after the action.
        """
        # Current estimates for both states.
        old_state_Q_values = self.get_Q(old_state)
        new_state_Q_values = self.get_Q(new_state)

        # Overwrite only the taken action's value with the new target.
        # Fixes the misspelled ``self.dicount`` and ``np.amex``.
        old_state_Q_values[action] = reward + self.discount * np.amax(new_state_Q_values)

        # Set up training data as one-row batches.
        training_input = self.to_one_hot(old_state)
        target_output = [old_state_Q_values]
        # The target placeholder previously had no value in this dict
        # literal, which is a syntax error.
        training_data = {self.model_input: training_input, self.target_output: target_output}

        # Train
        self.session.run(self.optimizer, feed_dict=training_data)
        
    def update(self, old_state, new_state, action, reward):
        #Train our model with new data
        self.train(old_state, action, reward, new_state)
        
        #Finally shift our exploration_rate toward zero(less gambling)
        if self.exploration_rate > 0:
        
        
        
        