In [1]:
#import logging
import random

import numpy as np

import gym
from gym import wrappers

#logging.basicConfig(level=logging.INFO)

# ACS2 in *FrozenLake*

> The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

In [2]:
# Create the FrozenLake environment through the Gym registry
env = gym.make('FrozenLake-v0')

# Reset the environment and keep the initial state it returns
state = env.reset()

# Render the environment (prints the grid to the cell output)
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


Each state can take one of the following values: `{S, F, H, G}`, which refer to:
```
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)
```

When interacting with the environment, the agent can perform 4 actions, which map as follows:
- 0 - left
- 1 - down
- 2 - right
- 3 - up

> FrozenLake-v0 defines "solving" as getting average reward of 0.78 over 100 consecutive trials.

## Setting up the Agent
Unfortunately, the only information returned by the environment is the current agent position. Therefore, our agent's task will be to predict where it will land after executing each action.

To do so we will represent state as a one-hot encoded vector.

In [3]:
# Import PyALCS code from local path
import sys
sys.path.append('/Users/khozzy/Projects/pyalcs')

from alcs import ACS2, ACS2Configuration

# Enable automatic module reload
%load_ext autoreload
%autoreload 2

# Set some constants.
# Both values are read from the environment so they stay in sync with it:
# one classifier attribute per discrete state (16 for FrozenLake-v0) and
# one action per entry of the discrete action space.
CLASSIFIER_LENGTH = env.observation_space.n
POSSIBLE_ACTIONS = env.action_space.n

In [4]:
def one_hot_encode(state, length=None):
    """Encode a discrete state index as a one-hot string.

    The result consists of ``length`` characters, all ``'0'`` except for a
    single ``'X'`` placed at index ``state``.

    Args:
        state: zero-based index of the position to mark.
        length: total length of the encoded string. When omitted, the
            notebook-level ``CLASSIFIER_LENGTH`` constant is used.

    Returns:
        str: the one-hot encoded perception, e.g. ``'0000X00000000000'``
        for ``state=4`` and ``length=16``.

    Raises:
        IndexError: if ``state`` is not in ``[0, length)``. Negative values
            are rejected explicitly because list indexing would otherwise
            wrap around and silently mark the wrong position.
    """
    if length is None:
        # Resolved at call time so the constant may be (re)defined later
        length = CLASSIFIER_LENGTH

    if not 0 <= state < length:
        raise IndexError(
            "state {} is outside the valid range [0, {})".format(state, length))

    vec = ['0'] * length
    vec[state] = 'X'
    return ''.join(vec)

`X` corresponds to current agent position. State 4 is encoded as follows:

In [5]:
# Sanity check: the 'X' marker is expected at index 4 of the encoded string
one_hot_encode(4)

'0000X00000000000'

We will also need a function for evaluating whether the agent successfully finished a trial.

In [6]:
# We assume that the agent found the reward if the state it currently
# occupies is the goal state (number 15 by default). Otherwise it did not.
def collect_env_metrics(env, goal_state=15):
    """Report whether the agent currently occupies the goal state.

    Args:
        env: a Gym environment, either bare or wrapped in any number of
            wrappers. Its base environment must expose the current state
            index as the attribute ``s``.
        goal_state: index of the state that counts as "reward found".
            Defaults to 15, the value previously hard-coded here.

    Returns:
        dict: ``{'found_reward': <truthy if current state == goal_state>}``.
    """
    # `unwrapped` returns the base environment regardless of how many
    # wrappers surround it, so no wrapper type needs to be special-cased
    # and no fixed `.env.env` depth is assumed.
    state = env.unwrapped.s

    return {'found_reward': state == goal_state}

Right now we are ready to configure the ACS2 agent

In [7]:
# Assemble the ACS2 settings for the one-hot encoded FrozenLake task
cfg = ACS2Configuration(
    # problem dimensions
    number_of_possible_actions=POSSIBLE_ACTIONS,
    classifier_length=CLASSIFIER_LENGTH,
    # callbacks defined earlier in this notebook
    environment_metrics_fcn=collect_env_metrics,
    perception_mapper_fcn=one_hot_encode,
    # explicitly chosen parameters; everything else keeps the library default
    epsilon=0.7,
    theta_i=0.3,
)

# Show the full configuration, including values not set explicitly above
print(cfg)

ACS2Configuration:
	- Classifier length: [16]
	- Number of possible actions: [4]
	- Classifier wildcard: [#]
	- Perception mapper function: [<function one_hot_encode at 0x10ed558c8>]
	- Action mapping dict: [None]
	- Environment metrics function: [<function collect_env_metrics at 0x10ed55840>]
	- Performance calculation function: [None] 
	- Do GA: [False]
	- Do subsumption: [True]
	- Beta: [0.05]
	- ...
	- Epsilon: [0.7]
	- U_max: [100000]


Build agent using defined configuration

In [8]:
# Instantiate the agent from the configuration; no initial population is passed
agent = ACS2(cfg)

Learn some behaviour during exploration phase

In [9]:
# Number of trials to run in the *exploration* phase
EXPLORE_TRIALS = 1000

# Backward-compatible alias: the constant used to be called EXPLOIT_TRIALS
# even though it controls exploration, and other cells still use that name.
EXPLOIT_TRIALS = EXPLORE_TRIALS

# Run exploration; returns the learned classifier population and trial metrics
population, metrics = agent.explore(env, EXPLORE_TRIALS)

In [10]:
# Rank the classifiers from the fittest down (sorts the population in place)
population.sort(key=lambda cl: cl.fitness, reverse=True)

# Summary statistics of the exploration phase
population_count = len(population)
reliable_count = sum(1 for cl in population if cl.is_reliable())
successful_trials = sum(m['environment']['found_reward'] for m in metrics)

print("Number of classifiers: {}".format(population_count))
print("Number of reliable classifiers: {}".format(reliable_count))
print("Percentage of successul trials: {:.2f}%".format(
    successful_trials / EXPLOIT_TRIALS * 100))

# List the ten best-ranked classifiers together with their key parameters
print("\nTop 10 classifiers:")
for cl in population[:10]:
    print("{!r} \tq: {:.2f} \tr: {:.2f} \tir: {:.2f} \texp: {}".format(
        cl, cl.q, cl.r, cl.ir, cl.exp))

Number of classifiers: 383
Number of reliable classifiers: 0
Percentage of successul trials: 1.90%

Top 10 classifiers:
##############X0-1-##############0X @ 0x10ee32160 	q: 0.54 	r: 0.51 	ir: 0.23 	exp: 17
##############X0-3-##############0X @ 0x10edcfb70 	q: 0.51 	r: 0.52 	ir: 0.16 	exp: 8
##############X0-2-##############0X @ 0x10ee9a9b0 	q: 0.46 	r: 0.52 	ir: 0.25 	exp: 20
#0X#############-1-#X0############# @ 0x10ef2b208 	q: 0.53 	r: 0.45 	ir: 0.00 	exp: 3
X0####0#########-3-0X############## @ 0x10ee9aac8 	q: 0.66 	r: 0.34 	ir: 0.00 	exp: 3
0X##############-1-X0############## @ 0x10ee32b00 	q: 0.47 	r: 0.46 	ir: 0.00 	exp: 2
##########0###X#-2-##########X###0# @ 0x10ee321d0 	q: 0.43 	r: 0.50 	ir: 0.21 	exp: 17
#############0X#-1-#############X0# @ 0x10ed57f98 	q: 0.44 	r: 0.50 	ir: 0.23 	exp: 18
X#####0#########-3-################ @ 0x10ef2bd30 	q: 0.63 	r: 0.34 	ir: 0.00 	exp: 9
##X0##0#########-3-##0X############ @ 0x10ef2b8d0 	q: 0.53 	r: 0.40 	ir: 0.00 	exp: 3


Now let's try to reuse this knowledge - exploitation phase

In [11]:
# Number of trials to run in the exploitation phase
TEST_TRIALS = 100

# Create new agent with initial classifiers population
exploiter = ACS2(cfg, population=population)

# Record performance, force erase previous run.
# NOTE: this rebinds `env` to the Monitor wrapper around the previous `env`.
env = wrappers.Monitor(env, "/tmp/gym-results", force=True)

try:
    _, metrics = exploiter.exploit(env, TEST_TRIALS)
finally:
    # Close the monitor even when exploitation raises, so the wrapper is
    # never left open after a failed run.
    env.close()

In [12]:
# Count the exploitation trials whose metrics report that the reward was found
successful_trials = sum(m['environment']['found_reward'] for m in metrics)

# Express that count as a percentage of all test trials
print("Percentage of successul trials: {:.2f}%".format(successful_trials / TEST_TRIALS * 100))

Percentage of successul trials: 4.00%
