# Q Learning Awesome Mouse-Maze

<img src="resources/images/screen_shot.png" alt="Mouse Maze" width="300" height="300">

<p>An implementation of the Q-learning module that drives the mouse through the maze.</p>

<h3>Step 1: Create the AI Module</h3>
<ul>
<li>Create a new file, ai.py</li>
<li>Create the class AI</li>
<li>Initialize its variables (hyper-parameters, action list, and Q table)</li>
</ul>

In [2]:
class AI:
    """Q-learning agent that keeps its hyper-parameters and a Q table."""

    def __init__(self, Q, epsilon=0.15, alpha=0.05, discount=0.99):
        """Store the Q table and the learning hyper-parameters.

        Args:
            Q: The Q table. It is stored by reference, not copied.
            epsilon: Random action probability rate.
            alpha: Learning rate.
            discount: How much weight is given to future rewards.
        """
        # Our Q table
        self.Q = Q
        # Possible game actions
        self.actions = ["LEFT", "RIGHT", "UP", "DOWN"]

        # Random action probability rate; START_E keeps the initial value
        self.epsilon = self.START_E = epsilon
        # NOTE(review): presumably the lower bound of an epsilon schedule;
        # nothing in this snippet reads it - confirm against callers
        self.END_E = 0.05

        # Learning rate
        self.alpha = alpha
        # Rate which determines how much weight is given to future rewards
        self.discount = discount

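<h3>Step 2: Choose an Action</h3>
<ul>
<li>Draw a random number and compare it with epsilon; if it is smaller, explore by returning a random action</li>
<li>Otherwise exploit: return the action with the maximum Q value for the current state (ties are broken at random)</li>
</ul>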

In [3]:
def choose_action(self, state, epsilon=None):
    """Pick the next action with an epsilon-greedy policy.

    Args:
        state: Current state. It is formatted with ``%s`` to build the
            "<state>-<action>" keys used to look up the Q table.
        epsilon: Probability of returning a uniformly random action.
            Defaults to ``self.epsilon`` when omitted. Any value <= 0
            makes the choice fully greedy.

    Returns:
        One of the names in ``self.actions``.
    """
    if epsilon is None:
        epsilon = self.epsilon

    # Explore: ignore the Q table entirely.
    if random() < epsilon:
        return choice(self.actions)

    # Exploit: state/action pairs that were never visited count as 0.0.
    q_values = [self.Q.get("%s-%s" % (state, name), 0.0) for name in self.actions]
    best = max(q_values)

    # At least one entry always equals the maximum, so no fallback branch is
    # needed; ties are broken uniformly at random.
    return choice([name for name, q in zip(self.actions, q_values) if q == best])

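<h3>Step 3: Learn Function</h3>
<ul>
<li>Get the maximum Q value for the new state across all actions</li>
<li>Apply the Bellman equation to update the Q value of the previous state–action pair: Q(s, a) ← Q(s, a) + α · (r + γ · max<sub>a'</sub> Q(s', a') − Q(s, a))</li>
</ul>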

In [5]:
def learn(self, reward, state, action, prev_state):
    """Update the Q value of taking ``action`` in ``prev_state``.

    Args:
        reward: Reward received for the transition.
        state: State reached after the action.
        action: Action that was taken in ``prev_state``.
        prev_state: State the action was taken from.

    Returns:
        The new Q value, which is also written back into ``self.Q``.
    """
    # Best value available from the new state; unknown pairs count as 0.0
    best_next = max(
        self.Q.get("%s-%s" % (state, name), 0.0) for name in self.actions
    )

    key = "%s-%s" % (prev_state, action)
    current = self.Q.get(key, 0.0)

    # Move the current estimate towards the discounted target by alpha
    target = reward + (self.discount * best_next)
    updated = current + self.alpha * (target - current)

    self.Q[key] = updated
    return updated

<h3>Helper function</h3>

In [None]:
def calculate_total_rewards(self):
    """Return the sum of every value currently stored in the Q table."""
    q_values = self.Q.values()
    return sum(q_values)

<h2>Rewards (Top) &amp; Exploration vs. Exploitation (Bottom)</h2>
<img src="resources/images/data_screen_shot.png" alt="Accumulated rewards and epsilon plotted over training" width="500" height="600">

<h2>Trained game board</h2>
<img src="resources/images/train_screen_shot.png" alt="Trained game board" width="500" height="600">

<h1>Complete AI class</h1>

In [1]:
from random import  random, choice

class AI:
    """Q-learning agent backed by a dict-like Q table.

    Q values are stored under string keys of the form "<state>-<action>".
    """

    def __init__(self, Q, epsilon=0.15, alpha=0.05, discount=0.99):
        """Store the Q table and the learning hyper-parameters.

        Args:
            Q: The Q table. It is stored by reference and updated in place
                by ``learn``.
            epsilon: Default random action probability for ``choose_action``.
            alpha: Learning rate.
            discount: How much weight is given to future rewards.
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.discount = discount
        self.START_E = epsilon
        # NOTE(review): not read anywhere in this class - presumably the
        # lower bound of an epsilon schedule; confirm against callers
        self.END_E = 0.05
        self.actions = ["LEFT", "RIGHT", "UP", "DOWN"]
        self.Q = Q

    @staticmethod
    def _key(state, action):
        """Build the Q-table key for a state/action pair."""
        return "%s-%s" % (state, action)

    def learn(self, reward, state, action, prev_state):
        """Update the Q value of taking ``action`` in ``prev_state``.

        Args:
            reward: Reward received for the transition.
            state: State reached after the action.
            action: Action that was taken in ``prev_state``.
            prev_state: State the action was taken from.

        Returns:
            The new Q value, which is also written back into ``self.Q``.
        """
        # Best value available from the new state; unknown pairs count as 0.0
        _max_val = max(self.Q.get(self._key(state, i), 0.0) for i in self.actions)

        key = self._key(prev_state, action)
        _val = self.Q.get(key, 0.0)
        new_val = _val + self.alpha * (reward + (self.discount * _max_val) - _val)
        self.Q[key] = new_val

        return new_val

    def choose_action(self, state, epsilon=None):
        """Pick the next action with an epsilon-greedy policy.

        Args:
            state: Current state.
            epsilon: Probability of returning a uniformly random action.
                Defaults to ``self.epsilon`` when omitted. Any value <= 0
                makes the choice fully greedy.

        Returns:
            One of the names in ``self.actions``.
        """
        if epsilon is None:
            epsilon = self.epsilon

        # Explore: ignore the Q table entirely.
        if random() < epsilon:
            return choice(self.actions)

        # Exploit: state/action pairs that were never visited count as 0.0.
        q_values = [self.Q.get(self._key(state, i), 0.0) for i in self.actions]
        _max = max(q_values)

        # At least one entry always equals the maximum, so no fallback branch
        # is needed; ties are broken uniformly at random.
        return choice([name for name, q in zip(self.actions, q_values) if q == _max])

    def calculate_total_rewards(self):
        """Return the sum of every value currently stored in the Q table."""
        return sum(self.Q.values())

<h1>Complete code for main.py</h1>

In [None]:
from matplotlib import pyplot as plt
from env import MouseMaze
from ai import AI
import utils
import time

# Initial exploration rate used by run() when the loaded Q table is empty.
START_e = 0.99
# Used by run() in three ways: the initial exploration rate when the Q table
# already has entries, the threshold above which epsilon keeps decaying, and
# the amount subtracted from epsilon on each step.
END_e = 0.0005

def _save_progress(env, ai, rewards, epsilons, epsilon, step):
    """Persist the Q table, env values and history, then print a summary line."""
    utils.save_j(ai.Q, 'resources/q.json')
    utils.save_j(env.values, 'resources/values.json')
    # "Accumulated rewards" here is the sum of all values in the Q table.
    sum_rewards = ai.calculate_total_rewards()
    rewards.append(sum_rewards)
    epsilons.append(epsilon)
    utils.save_j({'rewards':rewards, 'epsilon':epsilons}, 'resources/data.json')
    print("Iter %d Accumulated Rewards %f with Epsilon %f" % (step, sum_rewards , epsilon))


def run(env, ai, train):
    """Drive the agent/environment loop forever.

    Each step picks an action, advances the environment, updates the Q table
    and refreshes the display. Every 50 steps the Q table, the environment
    values and the rewards/epsilon history are written under ``resources/``.

    Args:
        env: Environment exposing ``get_state``, ``get_frame_step``,
            ``update``, ``reset`` and a ``values`` attribute.
        ai: Agent exposing ``Q``, ``choose_action``, ``learn`` and
            ``calculate_total_rewards``.
        train: When falsy, actions are always greedy and the loop sleeps
            0.1 s per step. The Q table is still updated and saved.

    This function never returns; stop it by interrupting the process.
    """
    state = env.get_state()

    # Explore heavily only when nothing has been learned yet.
    epsilon = END_e if ai.Q else START_e
    if not train:
        # A negative epsilon is never greater than random(), so the agent
        # never takes a random action.
        epsilon = -1

    step = 0
    rewards = []
    epsilons = []
    while True:

        action = ai.choose_action(state, epsilon)
        new_state, reward, status = env.get_frame_step(action)

        # Linear decay, clamped so float drift can never push epsilon below
        # the END_e floor.
        if epsilon > END_e:
            epsilon = max(END_e, epsilon - END_e)

        # Calculating new q value based on new state
        q_val = ai.learn(reward, new_state, action, state)

        # Updating text to show values
        env.update(q_val, state, new_state)

        if step % 50 == 0:
            _save_progress(env, ai, rewards, epsilons, epsilon, step)

        state = new_state
        step += 1
        # NOTE(review): a truthy status presumably marks the end of an
        # episode - confirm against MouseMaze.get_frame_step
        if status:
            state = env.reset()

        if not train:
            time.sleep(0.1)


def plot_stuff():
    """Plot the logged training history from 'resources/data.json'.

    Draws the 'rewards' series in the top panel and the 'epsilon' series in
    the bottom panel, then shows the figure.
    """
    data = utils.load_j('resources/data.json')
    rewards, epsilon = data['rewards'], data['epsilon']

    # run() records one sample every 50 iterations, so each x unit is 50
    # iterations (the previous label claimed hundreds).
    x_label = 'iterations (x50)'

    panels = (('rewards', rewards), ('epsilon', epsilon))
    for position, (y_label, series) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.plot(series)
        plt.ylabel(y_label)
        plt.xlabel(x_label)

    plt.show()



if __name__ == "__main__":

    # Uncomment to plot the logged rewards/epsilon history:
    # plot_stuff()

    # Build the agent from the persisted Q table.
    q_table = utils.load_j('resources/q.json')
    ai = AI(q_table)

    # Build the maze from the persisted values, then start training.
    maze_values = utils.load_j('resources/values.json')
    game = MouseMaze(maze_values)

    run(env=game, ai=ai, train=True)