# In [None]:
# Import the required libraries
import datetime
import numpy as np
import pandas as pd

# A QTable class is created; the hyperparameters governing it are defined and given values.
# In order:
# Learning rate
# Discount factor, which determines how much future rewards count relative to immediate ones
# Probability of selecting an action for a state (epsilon)
# Rate of reduction of epsilon for better precision in exploration
# Minimum value that the above-mentioned reduction may reach

class QTable:
    """Tabular action-value store with a Q-learning (SARSA-max) update rule.

    The table is a 2-D NumPy array of shape ``(obv_sp, action_space)``
    initialised to zeros; entry ``[s, a]`` holds the current value estimate
    for taking action ``a`` in state ``s``.

    Args:
        obv_sp: Number of discrete states (rows of the table).
        action_space: Number of discrete actions (columns of the table).
        alpha: Learning rate used to blend the old estimate with the new
            target.
        gamma: Discount factor applied to the best next-state value.
    """

    def __init__(self, obv_sp=500, action_space=6, alpha=0.5, gamma=0.9):
        self.alpha = alpha
        self.gamma = gamma
        self.obv_sp = obv_sp
        self.action_space = action_space
        # Allocate the table directly in its final 2-D shape.
        self.__q = np.zeros((self.obv_sp, self.action_space))

    def q(self, state=None, action=None):
        """Look up values in the table.

        Args:
            state: Row index. When ``None`` the whole table is returned.
            action: Column index. When ``None`` the whole row for ``state``
                is returned.

        Returns:
            The underlying table array, the row for ``state``, or the single
            entry for ``(state, action)``, depending on which arguments are
            given.
        """
        if state is None:
            return self.__q
        if action is None:
            return self.__q[state]
        return self.__q[state, action]

    def update_q(self, state, action, value):
        """Overwrite the table entry for ``(state, action)`` with ``value``."""
        self.__q[state, action] = value

    def max_q(self, state):
        """Return the largest value over all actions in ``state``."""
        return np.max(self.__q[state])

    def old_value(self, state, action):
        """Return the current estimate for ``(state, action)`` scaled by ``1 - alpha``."""
        return (1 - self.alpha) * self.q(state, action)

    def discounted_reward(self, state):
        """Return ``gamma`` times the best action value available in ``state``."""
        return self.gamma * self.max_q(state)

    def sarsa_max_update(self, s, a, r, new_s):
        """Apply one Q-learning (SARSA-max) update for a transition.

        Computes
        ``Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(new_s, a'))``
        and stores the result in the table.

        Args:
            s: State in which the action was taken.
            a: Action taken in ``s``.
            r: Reward received for the transition.
            new_s: State reached after taking ``a`` in ``s``.
        """
        # old_value() already carries the (1 - alpha) weight, so the target is
        # added as-is. Subtracting Q(s, a) again here would weight the old
        # estimate by (1 - 2 * alpha) and, at alpha = 0.5, discard it entirely.
        target = r + self.discounted_reward(new_s)
        new_value = self.old_value(s, a) + self.alpha * target
        self.update_q(s, a, new_value)

    def save(self, score):
        """Write the table to a CSV file in the current working directory.

        The file name encodes ``alpha``, ``gamma``, the given ``score`` and
        the current time in milliseconds since the epoch.

        Args:
            score: Value embedded in the file name to identify the run.
        """
        timestamp_ms = int(datetime.datetime.now().timestamp() * 1000)
        filename = (
            f"alpha_{self.alpha}_gamma_{self.gamma}"
            f"_score_{score}__{timestamp_ms}.csv"
        )
        pd.DataFrame(self.__q).to_csv(filename)