First, remember to upload the `environment` module (imported below) to the notebook runtime before running the code.

# OPTIMIZING WITH REINFORCEMENT LEARNING

Inspired by: https://github.com/llSourcell/deep_q_learning/blob/master/03_PlayingAgent.ipynb

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import environment
import random
import sys

from keras.models import Sequential
from keras.layers import Dense, Flatten
from collections import deque            # For storing moves

# --- Run sizes ---
OBSERVETIME = 100000  # number of environment steps recorded during the observing phase
MBSIZE = 10000        # number of recorded transitions sampled for the learning phase

# --- Q-learning hyperparameters ---
# NOTE(review): ALPHA is not referenced anywhere in the visible script (the
# model is compiled with the string 'adam') -- confirm whether it is meant
# to be passed to the optimizer.
ALPHA = 0.01          # learning rate
EPSILON = 1.0         # probability of taking a random (exploratory) action
GAMMA = 0.7           # discount factor applied to the estimated future reward

# Environment the agent interacts with (project-local `environment` module).
env = environment.SequenceAlignmentEnvironment()

# Create the Q-network. Input is one observation flattened to 201 values
# (the shape every observation is reshaped to in this script); output is one
# Q-value per possible action.
#
# `kernel_initializer` is the Keras 2+ name of the old Keras 1.x `init`
# argument; 'random_uniform' is the same RandomUniform initializer that the
# legacy 'uniform' alias resolved to.
model = Sequential([
    Dense(20, input_shape=(201,), kernel_initializer='random_uniform', activation='relu'),
    Dense(18, kernel_initializer='random_uniform', activation='relu'),
    Dense(10, kernel_initializer='random_uniform', activation='relu'),
    # Linear output layer: same number of outputs as possible actions.
    Dense(env.TOTALACTIONS, kernel_initializer='random_uniform', activation='linear'),
])

# Mean-squared error against the Bellman targets built in the learning step.
# NOTE(review): 'accuracy' is kept for backward compatibility, but it is a
# classification metric and carries little meaning for Q-value regression.
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# FIRST STEP: Knowing what each action does (Observing)
#
# Step the environment OBSERVETIME times and record every transition as
# (observation, action, reward, next observation, done) in D. With
# probability EPSILON the action is random; otherwise it is the action with
# the highest predicted Q-value.

D = deque()  # replay memory: one tuple per environment step

observation = env.reset().reshape((1,201))
done = False
for t in range(OBSERVETIME):
    # Progress indicator, rewritten in place on a single line.
    sys.stdout.write('\r{}%'.format(t*100/OBSERVETIME))

    explore = np.random.rand() <= EPSILON
    action = env.randomAction() if explore else np.argmax(model.predict(observation))

    next_observation, reward, done = env.step(action)
    next_observation = next_observation.reshape((1,201))

    D.append((observation, action, reward, next_observation, done))

    # Carry on from the new state, or start a fresh episode once this one ended.
    observation = env.reset().reshape((1,201)) if done else next_observation
print('\nObserving Finished')

# SECOND STEP: Learning from the observations (Experience replay)
#
# Sample MBSIZE recorded transitions, build a Bellman target for each one and
# train the network on the targets built so far.

minibatch = random.sample(D, MBSIZE)                              # Sample some moves

inputs = np.zeros((MBSIZE,201,))
targets = np.zeros((MBSIZE, env.TOTALACTIONS))

for i, (observation, action, reward, observation_new, done) in enumerate(minibatch):
    sys.stdout.write('\r' + str(i*100/MBSIZE) + '%')

    # Start from the network's current predictions so that only the entry of
    # the action actually taken contributes to the loss.
    inputs[i] = observation
    targets[i] = model.predict(observation)

    # Replace the taken action's Q-value with its Bellman target.
    # If the model predicted [0.1, 0.2, 0.1] but action 2's value for the
    # given state should be 0.3, it is changed to [0.1, 0.3, 0.1].
    if done:
        # Terminal transition: there is no future reward to add.
        targets[i][action] = reward
    else:
        # Future (discounted) reward for entering the new state.
        Q_sa = model.predict(observation_new)
        targets[i][action] = reward + (GAMMA * np.max(Q_sa))

    # Train network to output the Q function. Only rows 0..i hold real
    # transitions at this point; the remaining rows are still the all-zero
    # placeholders from np.zeros and must not be fed to the network as if
    # they were (state, target) pairs.
    model.train_on_batch(inputs[:i + 1], targets[:i + 1])
print('\nLearning Finished')

# THIRD STEP: Play!
#
# Run one greedy episode (always the action with the highest predicted
# Q-value) on a fixed pair of sequences and report the total reward.
observation = env.reset().reshape((1,201))

# Integer-encoded sequences, right-padded with zeros to 100 entries each.
padded_len = 100
seq1_values = [1,3,2,4,2,3,3,3,1,3,2,1,2,3,2,2,1,4,1,4]
seq2_values = [3,2,4,2,3,3,3,1,3,2,1,2,3,2,2,1,4,1,4]
seq1 = np.array(seq1_values + [0] * (padded_len - len(seq1_values)))
seq2 = np.array(seq2_values + [0] * (padded_len - len(seq2_values)))

# NOTE(review): `observation` above was taken from env.reset() before the
# custom sequences were installed, so the first prediction may be based on a
# different state -- confirm against env.customInput's behaviour.
env.customInput(seq1, seq2)

tot_reward = 0.0
done = False
while not done:
    action = np.argmax(model.predict(observation))
    stepped_observation, reward, done = env.step(action)
    observation = stepped_observation.reshape((1,201))
    tot_reward += reward
sys.stdout.write(env.display()[1])
print('Game ended! Total reward: {}'.format(tot_reward))