# ACS in Mountain Car environment

In [1]:
# Logger
import logging
logging.basicConfig(level=logging.INFO)

import random
import pickle

import pandas as pd
import numpy as np

# Import local paths
import sys, os
sys.path.append(os.path.abspath('../'))
sys.path.insert(0, os.path.abspath('../../openai-envs'))

from lcs import Perception
from lcs.agents import EnvironmentAdapter
from lcs.agents.acs2 import ACS2, Configuration, ClassifiersList
from lcs.metrics import population_metrics

import matplotlib.pyplot as plt
import matplotlib.cm as cm

%matplotlib inline

# Load gyms
import gym
import gym_mountain_car

## TODO
- ✅ https://www.youtube.com/watch?v=rBzOyjywtPw
- ✅ https://repl.it/@MichaelMegliola/MountainCarQ-argmax-only
- https://en.wikipedia.org/wiki/Learned_helplessness
- maybe some projection of the input space for simpler problem
- ✅explore / exploit decay
- car final position metric
- ✅policy visualization (2d plot)?
- ✅extend mountain car to return energy as reward
- ✅classifiers plot
- profile the code to find what slows it down the most
- tile coding

In [2]:
# Experiment parameters shared by the cells below.
trials = 15_000  # number of trials passed to agent.explore
decay = True     # forwarded to agent.explore as `decay=`
bins = 12        # discretization resolution used by MountainCarAdapter.BINS

## Environment
Description - https://github.com/openai/gym/wiki/MountainCar-v0

![MountainCar-v0 environment](http://gym.openai.com/v2018-02-21/videos/MountainCar-v0-270f34b9-f23e-4d95-a933-4c902b4f4435/poster.jpg)

In [3]:
# NOTE(review): 'EnergyMountainCar-v0' is presumably registered by the
# `gym_mountain_car` import at the top of the notebook - confirm.
env = gym.make('EnergyMountainCar-v0')
# env._max_episode_steps = 1000

# Lower bound and span of every observation dimension; both are used to
# normalize raw observations before discretization.
_low = env.observation_space.low
_range = env.observation_space.high - _low

## Agent configuration

### Discretization of continuous input
Values for bins are taken from https://gist.github.com/vblank182/83e29f16755320f82936d211761bfeea

In [4]:
class MountainCarAdapter(EnvironmentAdapter):
    """Adapter turning continuous observations into discrete string symbols."""

    # Scaling factor applied to the normalized observation before rounding.
    BINS = bins

    @classmethod
    def to_genotype(cls, obs):
        """Discretize an observation into a list of integer strings.

        Each component is normalized with the module-level ``_low`` and
        ``_range``, scaled by ``BINS`` and rounded to the nearest integer.
        For an observation inside the space bounds the resulting integers
        therefore lie in ``0..BINS`` inclusive.

        :param obs: raw observation (assumed to be a numpy array shaped like
            ``env.observation_space.low`` - TODO confirm)
        :return: list of strings, one per observation component
        """
        normalized = (obs - _low) / _range
        bin_indices = np.round(normalized * cls.BINS).astype(int)
        return bin_indices.astype(str).tolist()

### Evaluation metric

In [5]:
def avg_fitness(pop):
    """Return the mean fitness of the reliable classifiers in ``pop``.

    :param pop: iterable of classifiers exposing ``fitness`` and
        ``is_reliable()``
    :return: mean fitness of the classifiers for which ``is_reliable()`` is
        true, or NaN when there are none
    """
    reliable_fitness = [cl.fitness for cl in pop if cl.is_reliable()]

    # np.mean([]) also yields NaN but emits a RuntimeWarning on every call;
    # early in training there may be no reliable classifier at all, so handle
    # the empty case explicitly and keep the logs clean.
    if not reliable_fitness:
        return np.nan

    return np.mean(reliable_fitness)

# collect more metrics
def mc_metrics(pop, env):
    """Collect per-trial metrics for the Mountain Car experiment.

    The result starts with ``avg_fitness`` and is then extended with whatever
    ``population_metrics(pop, env)`` returns.

    :param pop: classifier population
    :param env: environment, forwarded to ``population_metrics``
    :return: dictionary of metric name -> value
    """
    collected = dict(avg_fitness=avg_fitness(pop))
    collected.update(population_metrics(pop, env))
    return collected

### Building final configuration object

In [6]:
cfg = Configuration(
    # Problem shape: two discretized inputs (see MountainCarAdapter) and
    # three actions (0, 1, 2).
    classifier_length=2,
    number_of_possible_actions=3,
    environment_adapter=MountainCarAdapter,
    # Learning parameters - exact semantics are defined by the lcs ACS2
    # Configuration class (confirm there before tuning).
    epsilon=0.9,
    beta=0.1,
    gamma=0.95,
    theta_exp=100,
    # Genetic-algorithm related settings (mu is presumably the mutation
    # rate - TODO confirm).
    do_ga=True,
    theta_ga=50,
    mu=0.03,
    # Metrics collection: mc_metrics is the user-supplied collector;
    # presumably it is invoked every 5th trial - TODO confirm.
    metrics_trial_frequency=5,
    user_metrics_collector_fcn=mc_metrics,
)

## Experiments

In [7]:
# Echo the experiment parameters so the cell output records this run's settings.
print(f'Running {trials} trials, decay={decay}, bins={bins}')

Running 15000 trials, decay=True, bins=12


In [None]:
%%time

# Build a fresh ACS2 agent from `cfg` and run the explore phase on `env`.
# explore() returns the resulting classifier population together with the
# metrics collected during the run; `decay` is forwarded unchanged.
agent = ACS2(cfg)
population, metrics = agent.explore(env, trials, decay=decay)

INFO:lcs.agents.Agent:{'trial': 0, 'steps_in_trial': 200, 'reward': 0.46631024260800513, 'avg_fitness': 0.1950212669473081, 'population': 55, 'numerosity': 64, 'reliable': 4}


In [None]:
# helper function for printing classifier details
def print_cl(cl):
    """Return a one-line, human-readable summary of a classifier.

    The action is shown as a symbol: 0 -> 'L', 1 -> '-', 2 -> 'R'
    (presumably push left / no push / push right - confirm against the
    environment); any other action is rendered as ``None``. Marked
    classifiers get a trailing ``(*)``.

    :param cl: classifier exposing ``condition``, ``action``, ``effect``,
        ``fitness``, ``r``, ``q``, ``exp``, ``num`` and ``is_marked()``
    :return: formatted description string (nothing is printed here)
    """
    action_symbols = {0: 'L', 1: '-', 2: 'R'}
    action = action_symbols.get(cl.action)
    marked = '(*)' if cl.is_marked() else ''

    return (f"{cl.condition} - {action} - {cl.effect} [fit: {cl.fitness:.3f}, r: {cl.r:.2f}, q: {cl.q:.2f}, exp: {cl.exp}, num: {cl.num} {marked}]")

In [None]:
# Subset of the population that the classifiers themselves report as reliable.
reliable = [classifier for classifier in population if classifier.is_reliable()]

print(f"Explore population size: {len(population)}")
print(f"Reliable classifiers: {len(reliable)}\n")

# List at most 50 reliable classifiers, highest fitness first.
fittest_first = sorted(reliable, key=lambda classifier: -classifier.fitness)
for classifier in fittest_first[:50]:
    print(print_cl(classifier))

In [None]:
# Tabulate the collected metrics, one row per recorded trial, indexed by trial number.
metrics_df = pd.DataFrame(metrics).set_index('trial')

metrics_df.tail()

### Steps in trial

In [None]:
# Raw (unsmoothed) number of steps taken in every recorded trial.
fig, ax = plt.subplots(figsize=(14, 6))
metrics_df['steps_in_trial'].plot(ax=ax, title='Steps in each trial');

### Average fitness

In [None]:
# Rolling mean over `window` recorded rows smooths the noisy per-trial values.
window = 100
fitness_smoothed = metrics_df['avg_fitness'].rolling(window=window).mean()

fig, ax = plt.subplots(figsize=(14, 6))
fitness_smoothed.plot(ax=ax)
ax.set(title='Fitness', xlabel='Trial', ylabel='Avg population fitness')

plt.show()

## Reward

In [None]:
# Rolling mean over `window` recorded rows smooths the noisy per-trial rewards.
window = 50
reward_smoothed = metrics_df['reward'].rolling(window=window).mean()

fig, ax = plt.subplots(figsize=(14, 6))
reward_smoothed.plot(ax=ax)
ax.set(title='Reward', xlabel='Trial', ylabel='Reward (energy)')

plt.show()

## Classifiers

In [None]:
# Smoothed size of the whole population versus its reliable subset.
window = 100

fig, ax = plt.subplots(figsize=(14, 8))

for column in ('population', 'reliable'):
    metrics_df[column].rolling(window=window).mean().plot(label=column, ax=ax)

# Title and axis labels so the figure can be read on its own, matching the
# other plot cells in this notebook.
ax.set_title('Classifiers')
ax.set_xlabel('Trial')
ax.set_ylabel(f'Number of classifiers (rolling mean, window={window})')
ax.legend()

plt.show()

## Policy plot

In [None]:
# Sample discretized observation (position bin, velocity bin) as strings,
# used below to try out best_action.
obs = ("1","1")

def best_action(obs, population):
    """Pick the action advocated by the strongest matching classifier.

    Only classifiers from the match set that anticipate a change are
    considered; among them the one with the highest ``fitness * num`` wins.
    The candidates are shuffled first so that ties are broken randomly.

    :param obs: discretized observation, wrapped into a ``Perception``
    :param population: classifier list providing ``form_match_set``
    :return: action of the winning classifier, or ``None`` when no matching
        classifier anticipates a change
    """
    matchset = population.form_match_set(Perception(obs))
    candidates = [cl for cl in matchset if cl.does_anticipate_change()]

    if not candidates:
        return None

    random.shuffle(candidates)
    winner = max(candidates, key=lambda cl: cl.fitness * cl.num)
    return winner.action
    
# Smoke test: query the learned population for the sample observation and
# show the chosen action (None means no matching classifier anticipates a change).
a = best_action(obs, population)
print(a)

In [None]:
# Four discrete colours: one for "no action" (EMPTY) plus one per action 0, 1, 2.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
cmap = plt.get_cmap('Set3', 4)

# MountainCarAdapter.to_genotype rounds to integers 0..BINS inclusive, so every
# observation dimension can take BINS + 1 distinct values. Deriving the grid
# size from the adapter keeps the plot in sync with the `bins` setting.
BINS = MountainCarAdapter.BINS + 1
EMPTY = -1

# imshow draws the first array axis along y and the second along x, so the
# grid is indexed [velocity, position] to match the axis labels below.
policy = np.full((BINS, BINS), EMPTY)

for pos_bin in range(BINS):
    for vel_bin in range(BINS):
        obs = (str(pos_bin), str(vel_bin))
        action = best_action(obs, population)
        policy[vel_bin, pos_bin] = action if action is not None else EMPTY

fig, ax = plt.subplots(figsize=(8, 8))
# origin='lower' puts bin 0 at the bottom of the y-axis. Fixed colour limits
# centre each of the four values in its own colour band, so the colour of an
# action does not depend on which actions happen to occur in the policy.
im = ax.imshow(policy, interpolation='none', cmap=cmap, origin='lower',
               vmin=EMPTY - 0.5, vmax=2.5)

fig.colorbar(im, ticks=[-1, 0, 1, 2])
ax.set_xlabel('Position')
ax.set_ylabel('Velocity')

plt.show()

## Save objects for reproduction

In [None]:
# Persist the final population together with the metrics table; the file name
# encodes the experiment parameters so different runs do not overwrite each other.
filename = f'energy_mountain_car_{trials}_trials_decay_{decay}_bins_{bins}.pickle'
payload = (population, metrics_df)
with open(filename, 'wb') as output_file:
    pickle.dump(payload, output_file)