In [1]:
import numpy as np
import MDP

class RL:
    def __init__(self,mdp,sampleReward):
        '''Constructor for the RL class

        Inputs:
        mdp -- Markov decision process (T, R, discount)
        sampleReward -- Function to sample rewards (e.g., bernoulli, Gaussian).
        This function takes one argument: the mean of the distributon and 
        returns a sample from the distribution.
        '''

        self.mdp = mdp
        self.sampleReward = sampleReward

    def sampleRewardAndNextState(self,state,action):
        '''Procedure to sample a reward and the next state
        reward ~ Pr(r)
        nextState ~ Pr(s'|s,a)

        Inputs:
        state -- current state
        action -- action to be executed

        Outputs: 
        reward -- sampled reward
        nextState -- sampled next state
        '''

        reward = self.sampleReward(self.mdp.R[action,state])
        cumProb = np.cumsum(self.mdp.T[action,state,:])
        nextState = np.where(cumProb >= np.random.rand(1))[0][0]
        return [reward,nextState]

    def qLearning(self,s0,initialQ,nEpisodes,nSteps,epsilon=0,temperature=0):
        '''qLearning algorithm.  Epsilon exploration and Boltzmann exploration
        are combined in one procedure by sampling a random action with 
        probabilty epsilon and performing Boltzmann exploration otherwise.  
        When epsilon and temperature are set to 0, there is no exploration.

        Inputs:
        s0 -- initial state
        initialQ -- initial Q function (|A|x|S| array)
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random
        temperature -- parameter that regulates Boltzmann exploration

        Outputs: 
        Q -- final Q function (|A|x|S| array)
        policy -- final policy
        '''

        Q = np.zeros([self.mdp.nActions,self.mdp.nStates])
        policy = np.zeros(self.mdp.nStates,int)
        temperature = float(temperature)
        counts = np.zeros([self.mdp.nActions, self.mdp.nStates])


        for episode in range(nEpisodes):
            s = s0

            for step in range(nSteps):
                # Action selection
                if np.random.random() < epsilon:
                    a = np.random.randint(self.mdp.nActions)
                else:
                    if temperature == 0.:
                        a = np.argmax(Q[:, s])
                    else:
                        prob_a = np.exp(Q[:, s] / temperature) / np.sum(np.exp(Q[:, s] / temperature))
                        a = np.argmax(np.random.multinomial(1, prob_a))

                # Observe s' and r
                r, s_prime = self.sampleRewardAndNextState(s, a)

                # Update count
                counts[a, s] += 1.

                # Learning rate
                alpha = counts[a,s]

                # Update Q
                Q[a,s] += alpha*(r + self.mdp.discount*np.amax(Q[:,s_prime], axis=0) - Q[a,s])

                s = s_prime

                if np.linalg.norm(prev_Q - Q[a,s]) <= 1e-8:
                    break
                
        policy = np.argmax(Q, axis=0)

        return [Q,policy]    

In [2]:
''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
T = np.array([[[0.5,0.5,0,0],[0,1,0,0],[0.5,0.5,0,0],[0,1,0,0]],[[1,0,0,0],[0.5,0,0,0.5],[0.5,0,0.5,0],[0,0,0.5,0.5]]])
R = np.array([[0,0,10,10],[0,0,10,10]])
discount = 0.9        
mdp = MDP.MDP(T,R,discount)
rlProblem = RL(mdp,np.random.normal)

In [4]:
# Test Q-learning 
[Q,policy] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=1000,nSteps=100,epsilon=0.3)
print("\nQ-learning results")
print(Q)
print(policy)

  Q[a,s] += alpha*(r + self.mdp.discount*np.amax(Q[:,s_prime], axis=0) - Q[a,s])
  Q[a,s] += alpha*(r + self.mdp.discount*np.amax(Q[:,s_prime], axis=0) - Q[a,s])



Q-learning results
[[nan nan nan nan]
 [nan nan nan nan]]
[0 0 0 0]
