## Problem 5 : Q-Learning

In [39]:
# Deterministic transition table: each action maps to the pair
# (next state, reward), regardless of the state it is taken in.
environment = dict(
    a1=('s1', 1),
    a2=('s2', 2),
    a3=('s3', 3),
)

class Agent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-value estimates live in ``Q_values``, a dict keyed by
    ``(state, action)``. A pair that has never been seen is treated as 0
    and is inserted into the table the first time it is looked up.
    """

    def __init__(self, epsilon=0.3, gamma=0.5, alpha=0.7):
        """Create an agent with an empty Q-table.

        Args:
            epsilon: Exploration rate; probability that ``choose`` picks a
                uniformly random action instead of the greedy one.
            gamma: Discount factor applied to the successor state's value.
            alpha: Learning rate used to blend the old estimate with the
                newly observed target in ``observe``.
        """
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.Q_values = {}

    def choose(self, s, actions):
        """Return ``(flag, action)`` to try in state ``s``.

        ``flag`` is 0 when the action was drawn at random (exploration)
        and 1 when it came from the greedy ``policy``.
        """
        if random.random() < self.epsilon:
            return 0, random.choice(actions)
        return 1, self.policy(s, actions)

    def policy(self, s, actions):
        """Return the action with the highest Q-value in state ``s``.

        Ties go to the action that appears first in ``actions``.

        Raises:
            ValueError: If ``actions`` is empty.
        """
        # max() returns the first maximal item, which preserves the
        # "earliest action wins" tie-break.
        return max(actions, key=lambda action: self.Q(s, action))

    def Q(self, s, a):
        """Return the estimated Q-value of action ``a`` in state ``s``.

        Unseen pairs are initialised to 0 and stored in the table.
        """
        return self.Q_values.setdefault((s, a), 0)

    def observe(self, s, a, sp, r, actions):
        """Update the Q-value of ``(s, a)`` from one observed transition.

        Args:
            s: State in which the action was taken.
            a: Action that was taken.
            sp: Successor state reached.
            r: Reward received.
            actions: Actions available in ``sp``. If empty, the successor
                contributes a value of 0.
        """
        # A distinct loop name keeps the comprehension from shadowing the
        # ``a`` parameter used in the update below.
        best_next = max(
            (self.Q(sp, next_action) for next_action in actions),
            default=0,
        )
        target = r + self.gamma * best_next
        self.Q_values[(s, a)] = (
            (1 - self.alpha) * self.Q(s, a) + self.alpha * target
        )

# Shared agent plus the fixed state and action sets of the problem.
agent = Agent()
states = ['s1', 's2', 's3']
actions = ['a1', 'a2', 'a3']
# Target trajectory: state, action, reward, state, action, reward, state.
wanted = ['s1', 'a1', 1, 's1', 'a2', 2, 's2']

# Training phase: 1000 episodes of 3 steps, each from a random start state.
for epoch in range(1000):
    current = random.choice(states)
    for _step in range(3):
        chosen = agent.choose(current, actions)[1]
        following, reward = environment[chosen]
        agent.observe(current, chosen, following, reward, actions)
        current = following

# Search phase: roll out 1000 two-step trajectories from 's1' without
# learning, and print those that match the wanted sequence together with
# the flags returned by agent.choose for each of the two steps.
for epoch in range(1000):
    seq = ['s1']
    flags = []
    for _step in range(2):
        flag, chosen = agent.choose(seq[-1], actions)
        following, reward = environment[chosen]
        seq += [chosen, reward, following]
        flags.append(flag)
    if seq == wanted:
        print(seq, *flags)
    

['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0
['s1', 'a1', 1, 's1', 'a2', 2, 's2'] 0 0


Yes, we obtained the sequence, as shown above. In every matching run both flags are 0, meaning both actions were chosen at random (exploration) rather than by the greedy policy.