From 629bb65b840110e92007e121e25d1ebe3e2b11ec Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 26 Oct 2017 10:57:40 -0400 Subject: [PATCH 01/96] minor fix docs --- examples/MG_two_storages/run_MG_two_storages.py | 2 +- examples/gym/run_mountain_car.py | 2 +- examples/gym/run_pendulum.py | 2 +- examples/pendulum/run_pendulum.py | 2 +- examples/toy_env/run_toy_env.py | 2 +- examples/toy_env/run_toy_env_simple.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/MG_two_storages/run_MG_two_storages.py b/examples/MG_two_storages/run_MG_two_storages.py index 62519372..2ebdc55b 100644 --- a/examples/MG_two_storages/run_MG_two_storages.py +++ b/examples/MG_two_storages/run_MG_two_storages.py @@ -1,4 +1,4 @@ -"""2-Storage Microgrid launcher. See Wiki for more details about this experiment. +"""2-Storage Microgrid launcher. See the docs for more details about this experiment. Authors: Vincent Francois-Lavet, David Taralla """ diff --git a/examples/gym/run_mountain_car.py b/examples/gym/run_mountain_car.py index c2cfc350..d49958c9 100644 --- a/examples/gym/run_mountain_car.py +++ b/examples/gym/run_mountain_car.py @@ -1,5 +1,5 @@ """ Mountain car environment launcher. -Same principles as run_toy_env. See the wiki for more details. +Same principles as run_toy_env. See the docs for more details. Authors: Vincent Francois-Lavet, David Taralla """ diff --git a/examples/gym/run_pendulum.py b/examples/gym/run_pendulum.py index a59d953c..7fd1617a 100644 --- a/examples/gym/run_pendulum.py +++ b/examples/gym/run_pendulum.py @@ -1,5 +1,5 @@ """ Pendulum environment launcher. -Same principles as run_toy_env. See the wiki for more details. +Same principles as run_toy_env. See the docs for more details. Authors: Vincent Francois-Lavet, David Taralla """ diff --git a/examples/pendulum/run_pendulum.py b/examples/pendulum/run_pendulum.py index 52d647f6..dd4ceaa6 100644 --- a/examples/pendulum/run_pendulum.py +++ b/examples/pendulum/run_pendulum.py @@ -1,5 +1,5 @@ """ Pendulum environment launcher. -Same principles as run_toy_env. See the wiki for more details. +Same principles as run_toy_env. See the docs for more details. Authors: Vincent Francois-Lavet, David Taralla """ diff --git a/examples/toy_env/run_toy_env.py b/examples/toy_env/run_toy_env.py index 26546c54..4545766d 100644 --- a/examples/toy_env/run_toy_env.py +++ b/examples/toy_env/run_toy_env.py @@ -1,4 +1,4 @@ -"""Toy environment launcher. See the wiki for more details about this environment. +"""Toy environment launcher. See the docs for more details about this environment. Authors: Vincent Francois-Lavet, David Taralla """ diff --git a/examples/toy_env/run_toy_env_simple.py b/examples/toy_env/run_toy_env_simple.py index 78087bf0..fc1dcf0c 100644 --- a/examples/toy_env/run_toy_env_simple.py +++ b/examples/toy_env/run_toy_env_simple.py @@ -1,4 +1,4 @@ -"""Toy environment launcher. See the wiki for more details about this environment. +"""Toy environment launcher. See the docs for more details about this environment. 
Authors: Vincent Francois-Lavet, David Taralla """ From 5a6db0e56962551395ce74d3d17c4e97619ceaec Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 26 Oct 2017 11:30:36 -0400 Subject: [PATCH 02/96] adding a few comments --- examples/toy_env/Toy_env.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/examples/toy_env/Toy_env.py b/examples/toy_env/Toy_env.py index 5618235a..7ee220d7 100644 --- a/examples/toy_env/Toy_env.py +++ b/examples/toy_env/Toy_env.py @@ -27,7 +27,7 @@ def __init__(self, rng): Parameters ----------- - rng : the numpy random number generator + rng : the numpy random number generator """ # Defining the type of environment self._last_ponctual_observation = [0, 0] # At each time step, the observation is made up of two elements, each scalar @@ -47,6 +47,19 @@ def __init__(self, rng): self._counter = 1 def reset(self, mode): + """ Resets the environment for a new episode. + + Parameters + ----------- + mode : int + -1 is for the training phase, others are for validation/test. + + Returns + ------- + list + Initialization of the sequence of observations used for the pseudo-state; dimension must match self.inputDimensions(). + If only the current observation is used as a (pseudo-)state, then this list is equal to self._last_ponctual_observation. + """ if mode == -1: self.prices = self._price_signal_train else: @@ -59,6 +72,17 @@ def reset(self, mode): return [6*[0], 0] def act(self, action): + """ Performs one time-step within the environment and updates the current observation self._last_ponctual_observation + + Parameters + ----------- + action : int + Integer in [0, ..., N_A] where N_A is the number of actions given by self.nActions() + + Returns + ------- + reward: float + """ reward = 0 if (action == 0 and self._last_ponctual_observation[1] == 1): From 3541406fea896b3b2cd412f5c41e3be8aaafc872 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Sun, 19 Nov 2017 14:15:48 -0500 Subject: [PATCH 03/96] first commit integr_learn_and_plan --- deer/q_networks/NN_keras_lp.py | 257 ++++++++++++++++++++++++++++++ deer/q_networks/q_net_keras_lp.py | 238 +++++++++++++++++++++++++++ examples/PLE/PLE_env.py | 2 + examples/PLE/run_PLE.py | 2 +- 4 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 deer/q_networks/NN_keras_lp.py create mode 100644 deer/q_networks/q_net_keras_lp.py diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py new file mode 100644 index 00000000..2914d997 --- /dev/null +++ b/deer/q_networks/NN_keras_lp.py @@ -0,0 +1,257 @@ +""" +Neural network using Keras (called by q_net_keras) +.. 
Author: Vincent Francois-Lavet +""" + +import numpy as np +from keras.models import Model +from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute + +np.random.seed(102912) + +class NN(): + """ + Deep Q-learning network using Keras + + Parameters + ----------- + batch_size : int + Number of tuples taken into account for each iteration of gradient descent + input_dimensions : + n_actions : + random_state : numpy random number generator + action_as_input : Boolean + Whether the action is given as input or as output + """ + def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False): + self._input_dimensions=input_dimensions + self._batch_size=batch_size + self._random_state=random_state + self._n_actions=n_actions + self._action_as_input=action_as_input + self._internal_dim=3 # size random vector + self._rand_vect_size=5 # size output distribution + + def encoder_model(self): + """ + + Parameters + ----------- + s + + Returns + ------- + model with output x (= encoding of s) + + """ + inputs = [ Input( shape=(4,48,48,) ) ] + # input_distr, conditional info + + x = inputs[0] + x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) + x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) + + x = Flatten()(x) + + x = Dense(20, activation='relu')(x) + + x = Dense(self._internal_dim, activation='relu')(x) + + model = Model(input=inputs, output=x) + + return model + + def generator_transition_model(self,encoder_model): + """ + + Parameters + ----------- + s + a + random z + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(4,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ) ] #s,a,z + + x = encoder_model(inputs[0]) #s,a,z --> x,a,z + + x = merge([x]+inputs[1:],mode='concat',concat_axis=-1) + x = Dense(20, activation='relu')(x) + x = Dense(self._internal_dim, activation='relu')(x) + + model = Model(input=inputs, output=x) + + return model + + def discriminator_model(self): + """ + + Parameters + ----------- + Tx or x' + conditional info a + + Returns + ------- + model with output D + + """ + inputs = [ Input( shape=(self._internal_dim,) ), Input( shape=(self._n_actions,) ) ] + # distr Tx/x', conditional info a + + x=merge(inputs,mode='concat') + x = Dense(20, activation='relu')(x) + true_or_model=Dense(1, activation='sigmoid')(x) + model = Model(input=inputs, output=true_or_model) + return model + + def full_model_trans(self,generator_transition_model, encoder_model, discriminator): + """ + + Parameters + ----------- + s + a + random z + x' + + Returns + ------- + model with output D + + """ + inputs = [ Input( shape=(4,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ), Input( shape=(self._internal_dim,) ) ] + # input_distr, conditional info + T = generator_transition_model(inputs[0:3]) + + discriminator.trainable = False + gan_V = discriminator([T, inputs[1]]) + model = Model(input=inputs, output=gan_V) + return model + + def full_model_enc(self,generator_transition_model, encoder_model, discriminator): + """ + + Parameters + ----------- + s' + a + Tx + + Returns + ------- + model with output D + + """ + inputs = [ Input( shape=(4,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._internal_dim,) ) ] #s,a,Tx + # 
input_distr, conditional info + T = generator_transition_model(inputs[0:2]) + + discriminator.trainable = False + gan_V = discriminator([T, inputs[1]]) + model = Model(input=inputs, output=gan_V) + return model + + + def _buildDQN(self): + """ + Build a network consistent with each type of inputs + """ + layers=[] + outs_conv=[] + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + # - observation[i] is a FRAME + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' + x = Conv2D(8, (4, 4), activation='relu', padding='valid')(reshaped) #Conv on the frames + x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) + x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames + + out = Flatten()(x) + + # - observation[i] is a VECTOR + elif len(dim) == 2: + if dim[0] > 3: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) + x = Conv2D(16, (2, 1), activation='relu', padding='valid')(reshaped)#Conv on the history + x = Conv2D(16, (2, 2), activation='relu', padding='valid')(x) #Conv on the history & features + + out = Flatten()(x) + else: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + out = Flatten()(input) + + # - observation[i] is a SCALAR - + else: + if dim[0] > 3: + # this returns a tensor + input = Input(shape=(dim[0],)) + inputs.append(input) + reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) + x = Conv2D(8, (1,2), activation='relu', padding='valid')(reshaped) #Conv on the history + x = Conv2D(8, (1,2), activation='relu', padding='valid')(x) #Conv on the history + + out = Flatten()(x) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + out=input + + outs_conv.append(out) + + if (self._action_as_input==True): + if ( isinstance(self._n_actions,int)): + print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") + else: + input = Input(shape=(len(self._n_actions),)) + inputs.append(input) + outs_conv.append(input) + + if len(outs_conv)>1: + x = merge(outs_conv, mode='concat') + else: + x= outs_conv [0] + + # we stack a deep fully-connected network on top + x = Dense(50, activation='relu')(x) + x = Dense(20, activation='relu')(x) + + if (self._action_as_input==False): + if ( isinstance(self._n_actions,int)): + out = Dense(self._n_actions)(x) + else: + out = Dense(len(self._n_actions))(x) + else: + out = Dense(1)(x) + + model = Model(input=inputs, output=out) + layers=model.layers + + # Grab all the parameters together. + params = [ param + for layer in layers + for param in layer.trainable_weights ] + + if (self._action_as_input==True): + return model, params, inputs + else: + return model, params + +if __name__ == '__main__': + pass + \ No newline at end of file diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py new file mode 100644 index 00000000..e8a269bd --- /dev/null +++ b/deer/q_networks/q_net_keras_lp.py @@ -0,0 +1,238 @@ +""" +Code for general deep Q-learning using Keras that can take as inputs scalars, vectors and matrices + +.. 
Author: Vincent Francois-Lavet +""" + +import numpy as np +from keras.optimizers import SGD,RMSprop +from keras import backend as K +from ..base_classes import QNetwork +from .NN_keras import NN # Default Neural network used + +class MyQNetwork(QNetwork): + """ + Deep Q-learning network using Keras (with any backend) + + Parameters + ----------- + environment : object from class Environment + rho : float + Parameter for rmsprop. Default : 0.9 + rms_epsilon : float + Parameter for rmsprop. Default : 0.0001 + momentum : float + Default : 0 + clip_delta : float + Not implemented. + freeze_interval : int + Period during which the target network is freezed and after which the target network is updated. Default : 1000 + batch_size : int + Number of tuples taken into account for each iteration of gradient descent. Default : 32 + update_rule: str + {sgd,rmsprop}. Default : rmsprop + random_state : numpy random number generator + double_Q : bool, optional + Activate or not the double_Q learning. + More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. + neural_network : object, optional + default is deer.qnetworks.NN_keras + """ + + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): + """ Initialize environment + + """ + QNetwork.__init__(self,environment, batch_size) + + + self._rho = rho + self._rms_epsilon = rms_epsilon + self._momentum = momentum + self._update_rule = update_rule + #self.clip_delta = clip_delta + self._freeze_interval = freeze_interval + self._double_Q = double_Q + self._random_state = random_state + self.update_counter = 0 + + Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) + self.q_vals, self.params = Q_net._buildDQN() + + + optimizer=RMSprop(lr=0.00005, rho=0.9, epsilon=1e-06) + optimizer2=RMSprop(lr=0.0001, rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) + + self.encoder = Q_net.encoder_model() + self.generator_transition = Q_net.generator_transition_model(self.encoder) + self.discriminator = Q_net.discriminator_model() + self.full_trans = Q_net.full_model_trans(self.generator_transition, self.encoder, self.discriminator) + + self.discriminator.trainable = True + self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer2) + self.generator_transition.compile(optimizer=optimizer, + loss='mae', + metrics=['accuracy']) + self.encoder.compile(optimizer=optimizer, + loss='mae', + metrics=['accuracy']) + self.full_trans.compile(loss='binary_crossentropy', optimizer=optimizer) + + self._compile() + + self.next_q_vals, self.next_params = Q_net._buildDQN() + self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals + + self._resetQHat() + + def getAllParams(self): + params_value=[] + for i,p in enumerate(self.params): + params_value.append(K.get_value(p)) + return params_value + + def setAllParams(self, list_of_values): + for i,p in enumerate(self.params): + K.set_value(p,list_of_values[i]) + + def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): + """ + Train one batch. + + 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared + 2. 
perform batch training + + Parameters + ----------- + states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) + actions_val : b x 1 numpy array of integers + rewards_val : b x 1 numpy array + next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) + terminals_val : b x 1 numpy boolean array + + Returns + ------- + Average loss of the batch training (RMSE) + Individual (square) losses for each tuple + """ + + print "self.discriminator.get_weights()" + print self.discriminator.get_weights() + + noise = np.random.uniform(-1,1,size=(self._batch_size,5)) #self._rand_vect_size=5 + #print "[states_val[0],actions_val,noise]" + #print [states_val[0],actions_val,noise] + #print "states_val.tolist()" + #print states_val.tolist() + onehot_actions = np.zeros((self._batch_size, self._n_actions)) + onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 + print onehot_actions + print "[states_val[0],onehot_actions,noise]" + print [states_val[0],onehot_actions,noise] + ETs=self.generator_transition.predict([states_val[0],onehot_actions,noise]) + Es_=self.encoder.predict([next_states_val[0]]) + + + X = np.concatenate((ETs, Es_)) + print "X" + print X + + y = [1] * self._batch_size + [0] * self._batch_size # first batch size is ETs and second is Es' + + d_loss=0 + for i in range(5): + #loss=discriminator.train_on_batch([X, np.tile(in_distrib,(2,1,1,1)), np.tile(cond_distrib,(2,1,1,1))], y) + loss = self.discriminator.train_on_batch([X, np.tile(onehot_actions,(2,1))], y) + d_loss += loss + if loss < 0.01: + break + d_loss=d_loss/(i+1) + + # Training generator ETs + self.discriminator.trainable = False # required? + g_loss = self.full_trans.train_on_batch([states_val[0], onehot_actions, noise, Es_], [0] * self._batch_size) # ETs should look like ES_ + + # Training generator Es' + # TO DO + + + if self.update_counter % self._freeze_interval == 0: + self._resetQHat() + + next_q_vals = self.next_q_vals.predict(next_states_val.tolist()) + + if(self._double_Q==True): + next_q_vals_current_qnet=self.q_vals.predict(next_states_val.tolist()) + argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) + max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) + else: + max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) + + not_terminals=np.ones_like(terminals_val) - terminals_val + + target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) + + q_vals=self.q_vals.predict(states_val.tolist()) + + # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff + q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) + diff = - q_val + target + loss_ind=pow(diff,2) + + q_vals[ np.arange(self._batch_size), actions_val.reshape((-1,)) ] = target + + # Is it possible to use something more flexible than this? + # Only some elements of next_q_vals are actual value that I target. + # My loss should only take these into account. 
+ # Workaround here is that many values are already "exact" in this update + loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) + + self.update_counter += 1 + + # loss*self._n_actions = np.average(loss_ind) + return np.sqrt(loss),loss_ind + + + def qValues(self, state_val): + """ Get the q values for one belief state + + Arguments + --------- + state_val : one belief state + + Returns + ------- + The q values for the provided belief state + """ + return self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val])[0] + + def chooseBestAction(self, state): + """ Get the best action for a belief state + + Arguments + --------- + state : one belief state + + Returns + ------- + The best action : int + """ + q_vals = self.qValues(state) + + return np.argmax(q_vals),np.max(q_vals) + + def _compile(self): + """ compile self.q_vals + """ + if (self._update_rule=="sgd"): + optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False) + elif (self._update_rule=="rmsprop"): + optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon) + else: + raise Exception('The update_rule '+self._update_rule+' is not implemented.') + + self.q_vals.compile(optimizer=optimizer, loss='mse') + + def _resetQHat(self): + for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): + K.set_value(next_param,K.get_value(param)) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 4df160aa..88848d62 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -9,6 +9,8 @@ from deer.base_classes import Environment +import matplotlib +matplotlib.use('qt5agg') from mpl_toolkits.axes_grid1 import host_subplot import mpl_toolkits.axisartist as AA import matplotlib.pyplot as plt diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 310a90b0..9b670e4a 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -11,7 +11,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras import MyQNetwork +from deer.q_networks.q_net_keras_lp import MyQNetwork from PLE_env import MyEnv as PLE_env import deer.experiment.base_controllers as bc From 053615f7228fa2d124699161b1e6dd944dd4b940 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 21 Nov 2017 11:28:20 -0500 Subject: [PATCH 04/96] 1st version integr learn and plan --- deer/base_classes/QNetwork.py | 1 + deer/q_networks/NN_keras_lp.py | 100 ++++++++++-------------------- deer/q_networks/q_net_keras_lp.py | 85 +++++++++++++++++-------- examples/PLE/PLE_env.py | 8 +-- examples/PLE/run_PLE.py | 2 +- 5 files changed, 99 insertions(+), 97 deletions(-) diff --git a/deer/base_classes/QNetwork.py b/deer/base_classes/QNetwork.py index 648ba789..c416eac6 100644 --- a/deer/base_classes/QNetwork.py +++ b/deer/base_classes/QNetwork.py @@ -47,6 +47,7 @@ def setLearningRate(self, lr): The learning rate that has to bet set """ self._lr = lr + self._compile() def setDiscountFactor(self, df): """ Setting the discount factor diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 2914d997..bc4eb135 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -5,7 +5,8 @@ import numpy as np from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute +from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add +from keras import 
regularizers np.random.seed(102912) @@ -44,20 +45,22 @@ def encoder_model(self): model with output x (= encoding of s) """ - inputs = [ Input( shape=(4,48,48,) ) ] - # input_distr, conditional info + inputs = [ Input( shape=(2,48,48,) ) ] + # input_distr x = inputs[0] x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(8, (4, 4), padding='same', activation='relu')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Flatten()(x) x = Dense(20, activation='relu')(x) - x = Dense(self._internal_dim, activation='relu')(x) + x = Dense(self._internal_dim, activity_regularizer=regularizers.l2(0.0001))(x) #, activation='relu' model = Model(input=inputs, output=x) @@ -77,13 +80,15 @@ def generator_transition_model(self,encoder_model): model with output Tx (= model estimate of x') """ - inputs = [ Input( shape=(4,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ) ] #s,a,z + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ) ] #s,a,z - x = encoder_model(inputs[0]) #s,a,z --> x,a,z + enc_x = encoder_model(inputs[0]) #s --> x - x = merge([x]+inputs[1:],mode='concat',concat_axis=-1) + x = merge([enc_x]+inputs[1:],mode='concat',concat_axis=-1) + x = Dense(20, activation='relu')(x) x = Dense(20, activation='relu')(x) - x = Dense(self._internal_dim, activation='relu')(x) + x = Dense(self._internal_dim)(x) #, activation='relu' + #x = Add()([enc_x,x]) model = Model(input=inputs, output=x) @@ -102,11 +107,12 @@ def discriminator_model(self): model with output D """ - inputs = [ Input( shape=(self._internal_dim,) ), Input( shape=(self._n_actions,) ) ] - # distr Tx/x', conditional info a + inputs = [ Input( shape=(self._internal_dim,) ), Input( shape=(self._internal_dim,) ), Input( shape=(self._n_actions,) ) ] + # distr Tx/x', conditional info x, a x=merge(inputs,mode='concat') x = Dense(20, activation='relu')(x) + x = Dense(20, activation='relu')(x) true_or_model=Dense(1, activation='sigmoid')(x) model = Model(input=inputs, output=true_or_model) return model @@ -119,100 +125,62 @@ def full_model_trans(self,generator_transition_model, encoder_model, discriminat s a random z - x' + x Returns ------- model with output D """ - inputs = [ Input( shape=(4,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ), Input( shape=(self._internal_dim,) ) ] + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ), Input( shape=(self._internal_dim,) )]#, Input( shape=(self._internal_dim,) ) ] # input_distr, conditional info T = generator_transition_model(inputs[0:3]) discriminator.trainable = False - gan_V = discriminator([T, inputs[1]]) + gan_V = discriminator([T, inputs[3], inputs[1]]) model = Model(input=inputs, output=gan_V) return model - def full_model_enc(self,generator_transition_model, encoder_model, discriminator): + def full_model_enc(self,encoder_model, discriminator): """ Parameters ----------- s' a - Tx + x Returns ------- model with output D """ - inputs = [ Input( shape=(4,48,48,) ), Input( shape=(self._n_actions,) ), Input( 
shape=(self._internal_dim,) ) ] #s,a,Tx + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._internal_dim,) ) ] #s,a,Tx # input_distr, conditional info - T = generator_transition_model(inputs[0:2]) + x = encoder_model(inputs[0]) discriminator.trainable = False - gan_V = discriminator([T, inputs[1]]) + gan_V = discriminator([x, inputs[2], inputs[1]]) model = Model(input=inputs, output=gan_V) return model - def _buildDQN(self): + def _buildDQN(self,encoder_model): """ Build a network consistent with each type of inputs """ layers=[] outs_conv=[] inputs=[] - + + #if len(dim) == 3: for i, dim in enumerate(self._input_dimensions): - # - observation[i] is a FRAME - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' - x = Conv2D(8, (4, 4), activation='relu', padding='valid')(reshaped) #Conv on the frames - x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) - x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames - - out = Flatten()(x) - - # - observation[i] is a VECTOR - elif len(dim) == 2: - if dim[0] > 3: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) - x = Conv2D(16, (2, 1), activation='relu', padding='valid')(reshaped)#Conv on the history - x = Conv2D(16, (2, 2), activation='relu', padding='valid')(x) #Conv on the history & features - - out = Flatten()(x) - else: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - out = Flatten()(input) + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) - # - observation[i] is a SCALAR - - else: - if dim[0] > 3: - # this returns a tensor - input = Input(shape=(dim[0],)) - inputs.append(input) - reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) - x = Conv2D(8, (1,2), activation='relu', padding='valid')(reshaped) #Conv on the history - x = Conv2D(8, (1,2), activation='relu', padding='valid')(x) #Conv on the history - - out = Flatten()(x) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - out=input - - outs_conv.append(out) + out = encoder_model(inputs) + + outs_conv.append(out) if (self._action_as_input==True): if ( isinstance(self._n_actions,int)): diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index e8a269bd..23bf900f 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -5,10 +5,11 @@ """ import numpy as np +np.set_printoptions(threshold=np.nan) from keras.optimizers import SGD,RMSprop from keras import backend as K from ..base_classes import QNetwork -from .NN_keras import NN # Default Neural network used +from .NN_keras_lp import NN # Default Neural network used class MyQNetwork(QNetwork): """ @@ -54,19 +55,23 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._freeze_interval = freeze_interval self._double_Q = double_Q self._random_state = random_state - self.update_counter = 0 + self.update_counter = 0 + self.d_loss=1. 
+ Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) - self.q_vals, self.params = Q_net._buildDQN() - optimizer=RMSprop(lr=0.00005, rho=0.9, epsilon=1e-06) optimizer2=RMSprop(lr=0.0001, rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) self.encoder = Q_net.encoder_model() + + self.q_vals, self.params = Q_net._buildDQN(self.encoder) + self.generator_transition = Q_net.generator_transition_model(self.encoder) self.discriminator = Q_net.discriminator_model() self.full_trans = Q_net.full_model_trans(self.generator_transition, self.encoder, self.discriminator) + self.full_enc = Q_net.full_model_enc(self.encoder, self.discriminator) self.discriminator.trainable = True self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer2) @@ -77,10 +82,11 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de loss='mae', metrics=['accuracy']) self.full_trans.compile(loss='binary_crossentropy', optimizer=optimizer) + self.full_enc.compile(loss='binary_crossentropy', optimizer=optimizer) self._compile() - self.next_q_vals, self.next_params = Q_net._buildDQN() + self.next_q_vals, self.next_params = Q_net._buildDQN(self.encoder) self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals self._resetQHat() @@ -116,45 +122,68 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Individual (square) losses for each tuple """ - print "self.discriminator.get_weights()" - print self.discriminator.get_weights() + #print "self.discriminator.get_weights()" + #print self.discriminator.get_weights() noise = np.random.uniform(-1,1,size=(self._batch_size,5)) #self._rand_vect_size=5 - #print "[states_val[0],actions_val,noise]" - #print [states_val[0],actions_val,noise] - #print "states_val.tolist()" - #print states_val.tolist() + ##print "[states_val[0],actions_val,noise]" + ##print [states_val[0],actions_val,noise] + ##print "states_val.tolist()" + ##print states_val.tolist() onehot_actions = np.zeros((self._batch_size, self._n_actions)) onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 - print onehot_actions - print "[states_val[0],onehot_actions,noise]" - print [states_val[0],onehot_actions,noise] + #print onehot_actions + #print "[states_val[0],onehot_actions,noise]" + #print [states_val[0],onehot_actions,noise] ETs=self.generator_transition.predict([states_val[0],onehot_actions,noise]) Es_=self.encoder.predict([next_states_val[0]]) + Es=self.encoder.predict([states_val[0]]) X = np.concatenate((ETs, Es_)) - print "X" - print X + if(self.update_counter%100==0): + print states_val[0][0] + print next_states_val[0][0] + print actions_val, rewards_val, terminals_val + print "Es" + print Es + print "X" + print ETs,Es_ + #print "disc" + #print self.discriminator.predict([X, np.tile(onehot_actions,(2,1))]) + print "full trans" + print self.full_trans.predict([states_val[0], onehot_actions, noise, Es]) + print "full enc" + print self.full_enc.predict([next_states_val[0], onehot_actions, Es]) y = [1] * self._batch_size + [0] * self._batch_size # first batch size is ETs and second is Es' - d_loss=0 + noise_to_avoid_too_easy_disc=np.random.normal(size=X.shape)*(0.7-min(self.d_loss,0.7))*1#*100/max(100,epoch) + self.d_loss=0 for i in range(5): - #loss=discriminator.train_on_batch([X, np.tile(in_distrib,(2,1,1,1)), np.tile(cond_distrib,(2,1,1,1))], y) - loss = self.discriminator.train_on_batch([X, 
np.tile(onehot_actions,(2,1))], y) - d_loss += loss + self.discriminator.trainable = True + loss = self.discriminator.train_on_batch([X+noise_to_avoid_too_easy_disc, np.tile(Es,(2,1)), np.tile(onehot_actions,(2,1))], y) + self.d_loss += loss if loss < 0.01: break - d_loss=d_loss/(i+1) - + self.d_loss=self.d_loss/(i+1) + + if(self.update_counter%100==0): + print "d_loss" + print self.d_loss + # Training generator ETs self.discriminator.trainable = False # required? - g_loss = self.full_trans.train_on_batch([states_val[0], onehot_actions, noise, Es_], [0] * self._batch_size) # ETs should look like ES_ + g_loss1 = self.full_trans.train_on_batch([states_val[0], onehot_actions, noise, Es], [0] * self._batch_size) # ETs should look like Es_ # Training generator Es' - # TO DO - + g_loss2 = self.full_enc.train_on_batch([next_states_val[0], onehot_actions, Es], [1] * self._batch_size) # Es_ should look like ETs + + if(self.update_counter%100==0): + print "g_losses" + print g_loss1 + print g_loss2 + if self.update_counter % self._freeze_interval == 0: self._resetQHat() @@ -185,8 +214,12 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Only some elements of next_q_vals are actual value that I target. # My loss should only take these into account. # Workaround here is that many values are already "exact" in this update - loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) + if (self.update_counter<10000): + loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) + if(self.update_counter%100==0): + print self.update_counter + self.update_counter += 1 # loss*self._n_actions = np.average(loss_ind) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 88848d62..ae60587d 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -18,8 +18,8 @@ class MyEnv(Environment): VALIDATION_MODE = 0 - def __init__(self, rng, game=None, frame_skip=4, - ple_options={"display_screen": True, "force_fps":True, "fps":30}): + def __init__(self, rng, game=None, frame_skip=2, + ple_options={"display_screen": True, "force_fps":True, "fps":15}): self._mode = -1 self._mode_score = 0.0 @@ -57,7 +57,7 @@ def reset(self, mode): self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) - return [4 * [48 * [48 * [0]]]] + return [2 * [48 * [48 * [0]]]] def act(self, action): @@ -82,7 +82,7 @@ def summarizePerformance(self, test_data_set): def inputDimensions(self): - return [(4, 48, 48)] + return [(2, 48, 48)] def observationType(self, subject): return np.float32 diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 9b670e4a..93580a02 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -38,7 +38,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0005 + LEARNING_RATE = 0.0001 LEARNING_RATE_DECAY = 0.99 DISCOUNT = 0.9 DISCOUNT_INC = 1 From 0c979fb2671e3907df3f45dc4b8cb49407ef7dfc Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 21 Nov 2017 13:54:17 -0500 Subject: [PATCH 05/96] fixing non trainable disc in full models --- deer/base_classes/QNetwork.py | 1 - deer/q_networks/NN_keras_lp.py | 10 +++++++--- deer/q_networks/q_net_keras_lp.py | 20 +++++++++++++------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/deer/base_classes/QNetwork.py b/deer/base_classes/QNetwork.py index c416eac6..648ba789 100644 --- a/deer/base_classes/QNetwork.py +++ 
b/deer/base_classes/QNetwork.py @@ -47,7 +47,6 @@ def setLearningRate(self, lr): The learning rate that has to bet set """ self._lr = lr - self._compile() def setDiscountFactor(self, df): """ Setting the discount factor diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index bc4eb135..9bfb6061 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -60,7 +60,7 @@ def encoder_model(self): x = Dense(20, activation='relu')(x) - x = Dense(self._internal_dim, activity_regularizer=regularizers.l2(0.0001))(x) #, activation='relu' + x = Dense(self._internal_dim, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' model = Model(input=inputs, output=x) @@ -134,9 +134,12 @@ def full_model_trans(self,generator_transition_model, encoder_model, discriminat """ inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ), Input( shape=(self._internal_dim,) )]#, Input( shape=(self._internal_dim,) ) ] # input_distr, conditional info + + for layer in discriminator.layers: + layer.trainable = False + T = generator_transition_model(inputs[0:3]) - discriminator.trainable = False gan_V = discriminator([T, inputs[3], inputs[1]]) model = Model(input=inputs, output=gan_V) return model @@ -159,7 +162,8 @@ def full_model_enc(self,encoder_model, discriminator): # input_distr, conditional info x = encoder_model(inputs[0]) - discriminator.trainable = False + for layer in discriminator.layers: + layer.trainable = False gan_V = discriminator([x, inputs[2], inputs[1]]) model = Model(input=inputs, output=gan_V) return model diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 23bf900f..0f4469e4 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -70,10 +70,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.generator_transition = Q_net.generator_transition_model(self.encoder) self.discriminator = Q_net.discriminator_model() - self.full_trans = Q_net.full_model_trans(self.generator_transition, self.encoder, self.discriminator) - self.full_enc = Q_net.full_model_enc(self.encoder, self.discriminator) - self.discriminator.trainable = True self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer2) self.generator_transition.compile(optimizer=optimizer, loss='mae', @@ -81,6 +78,10 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.encoder.compile(optimizer=optimizer, loss='mae', metrics=['accuracy']) + + # Full models have to be instatiated and compiled afterwards (we put the discriminator weights to non-trainable) + self.full_trans = Q_net.full_model_trans(self.generator_transition, self.encoder, self.discriminator) + self.full_enc = Q_net.full_model_enc(self.encoder, self.discriminator) self.full_trans.compile(loss='binary_crossentropy', optimizer=optimizer) self.full_enc.compile(loss='binary_crossentropy', optimizer=optimizer) @@ -161,7 +162,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals noise_to_avoid_too_easy_disc=np.random.normal(size=X.shape)*(0.7-min(self.d_loss,0.7))*1#*100/max(100,epoch) self.d_loss=0 for i in range(5): - self.discriminator.trainable = True + loss = self.discriminator.train_on_batch([X+noise_to_avoid_too_easy_disc, np.tile(Es,(2,1)), np.tile(onehot_actions,(2,1))], y) self.d_loss += loss if loss < 0.01: @@ -173,12 +174,17 @@ def train(self, states_val, actions_val, 
rewards_val, next_states_val, terminals print self.d_loss # Training generator ETs - self.discriminator.trainable = False # required? + #print "self.discriminator.get_weights()1" + #print self.discriminator.get_weights() + g_loss1 = self.full_trans.train_on_batch([states_val[0], onehot_actions, noise, Es], [0] * self._batch_size) # ETs should look like Es_ # Training generator Es' g_loss2 = self.full_enc.train_on_batch([next_states_val[0], onehot_actions, Es], [1] * self._batch_size) # Es_ should look like ETs + #print "self.discriminator.get_weights()2" + #print self.discriminator.get_weights() + if(self.update_counter%100==0): print "g_losses" print g_loss1 @@ -214,8 +220,8 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Only some elements of next_q_vals are actual value that I target. # My loss should only take these into account. # Workaround here is that many values are already "exact" in this update - if (self.update_counter<10000): - loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) + #if (self.update_counter<10000): + loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) if(self.update_counter%100==0): print self.update_counter From 484752538ddba55708350eadab39967c49c10fde Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 21 Nov 2017 15:21:22 -0500 Subject: [PATCH 06/96] fix lr and minor modif --- deer/base_classes/QNetwork.py | 1 + deer/q_networks/NN_keras_lp.py | 8 ++++---- deer/q_networks/q_net_keras_lp.py | 6 ++++-- examples/PLE/run_PLE.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/deer/base_classes/QNetwork.py b/deer/base_classes/QNetwork.py index 648ba789..c416eac6 100644 --- a/deer/base_classes/QNetwork.py +++ b/deer/base_classes/QNetwork.py @@ -47,6 +47,7 @@ def setLearningRate(self, lr): The learning rate that has to bet set """ self._lr = lr + self._compile() def setDiscountFactor(self, df): """ Setting the discount factor diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 9bfb6061..07ab75de 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -86,9 +86,9 @@ def generator_transition_model(self,encoder_model): x = merge([enc_x]+inputs[1:],mode='concat',concat_axis=-1) x = Dense(20, activation='relu')(x) - x = Dense(20, activation='relu')(x) - x = Dense(self._internal_dim)(x) #, activation='relu' - #x = Add()([enc_x,x]) + #x = Dense(20, activation='relu')(x) + x = Dense(self._internal_dim, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Add()([enc_x,x]) model = Model(input=inputs, output=x) @@ -112,7 +112,7 @@ def discriminator_model(self): x=merge(inputs,mode='concat') x = Dense(20, activation='relu')(x) - x = Dense(20, activation='relu')(x) + #x = Dense(20, activation='relu')(x) true_or_model=Dense(1, activation='sigmoid')(x) model = Model(input=inputs, output=true_or_model) return model diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 0f4469e4..605b32a1 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -159,7 +159,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals y = [1] * self._batch_size + [0] * self._batch_size # first batch size is ETs and second is Es' - noise_to_avoid_too_easy_disc=np.random.normal(size=X.shape)*(0.7-min(self.d_loss,0.7))*1#*100/max(100,epoch) + 
noise_to_avoid_too_easy_disc=np.random.normal(size=X.shape)*(0.7-min(self.d_loss,0.7))*0.5#*100/max(100,epoch) self.d_loss=0 for i in range(5): @@ -222,7 +222,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Workaround here is that many values are already "exact" in this update #if (self.update_counter<10000): loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) - + #print "self.q_vals.optimizer.lr" + #print K.eval(self.q_vals.optimizer.lr) + if(self.update_counter%100==0): print self.update_counter diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 93580a02..f9e3bb7a 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -38,7 +38,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0001 + LEARNING_RATE = 0.002 LEARNING_RATE_DECAY = 0.99 DISCOUNT = 0.9 DISCOUNT_INC = 1 From ef0717f56363bc85e19938b3a708b0f122192eb0 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Sun, 26 Nov 2017 10:31:14 -0500 Subject: [PATCH 07/96] first prototype --- deer/agent.py | 2 +- deer/q_networks/NN_keras_lp.py | 89 ++++++----------------- deer/q_networks/q_net_keras_lp.py | 114 +++++++++++++----------------- examples/PLE/PLE_env.py | 58 +++++++++++++-- examples/PLE/run_PLE.py | 2 +- 5 files changed, 127 insertions(+), 138 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index beab5c01..292846fe 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -179,7 +179,7 @@ def summarizeTestPerformance(self): if self._mode == -1: raise AgentError("Cannot summarize test performance outside test environment.") - self._environment.summarizePerformance(self._tmp_dataset) + self._environment.summarizePerformance(self._tmp_dataset, self._network) def train(self): """ diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 07ab75de..70822a63 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -4,8 +4,9 @@ """ import numpy as np +from keras import backend as K from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add +from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, add, Dot, Multiply, Average, Lambda from keras import regularizers np.random.seed(102912) @@ -30,7 +31,7 @@ def __init__(self, batch_size, input_dimensions, n_actions, random_state, action self._random_state=random_state self._n_actions=n_actions self._action_as_input=action_as_input - self._internal_dim=3 # size random vector + self.internal_dim=5 # size random vector self._rand_vect_size=5 # size output distribution def encoder_model(self): @@ -60,7 +61,7 @@ def encoder_model(self): x = Dense(20, activation='relu')(x) - x = Dense(self._internal_dim, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Dense(self.internal_dim, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' model = Model(input=inputs, output=x) @@ -80,44 +81,21 @@ def generator_transition_model(self,encoder_model): model with output Tx (= model estimate of x') """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ) ] #s,a,z + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ) ] #s,a enc_x = encoder_model(inputs[0]) #s --> x x = merge([enc_x]+inputs[1:],mode='concat',concat_axis=-1) x = Dense(20, 
activation='relu')(x) #x = Dense(20, activation='relu')(x) - x = Dense(self._internal_dim, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([enc_x,x]) model = Model(input=inputs, output=x) return model - - def discriminator_model(self): - """ - - Parameters - ----------- - Tx or x' - conditional info a - - Returns - ------- - model with output D - - """ - inputs = [ Input( shape=(self._internal_dim,) ), Input( shape=(self._internal_dim,) ), Input( shape=(self._n_actions,) ) ] - # distr Tx/x', conditional info x, a - - x=merge(inputs,mode='concat') - x = Dense(20, activation='relu')(x) - #x = Dense(20, activation='relu')(x) - true_or_model=Dense(1, activation='sigmoid')(x) - model = Model(input=inputs, output=true_or_model) - return model - - def full_model_trans(self,generator_transition_model, encoder_model, discriminator): + + def generator_diff_s_s_(self,encoder_model): """ Parameters @@ -125,50 +103,24 @@ def full_model_trans(self,generator_transition_model, encoder_model, discriminat s a random z - x Returns ------- - model with output D + model with output Tx (= model estimate of x') """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._rand_vect_size,) ), Input( shape=(self._internal_dim,) )]#, Input( shape=(self._internal_dim,) ) ] - # input_distr, conditional info + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(2,48,48,) ) ] #s,s' - for layer in discriminator.layers: - layer.trainable = False - - T = generator_transition_model(inputs[0:3]) + enc_x = encoder_model(inputs[0]) #s --> x + enc_x_ = encoder_model(inputs[1]) #s --> x + + x = Subtract()([enc_x,enc_x_]) + x = Dot(axes=-1, normalize=False)([x,x]) + + model = Model(input=inputs, output=x ) - gan_V = discriminator([T, inputs[3], inputs[1]]) - model = Model(input=inputs, output=gan_V) - return model - - def full_model_enc(self,encoder_model, discriminator): - """ - - Parameters - ----------- - s' - a - x - - Returns - ------- - model with output D - - """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ), Input( shape=(self._internal_dim,) ) ] #s,a,Tx - # input_distr, conditional info - x = encoder_model(inputs[0]) - - for layer in discriminator.layers: - layer.trainable = False - gan_V = discriminator([x, inputs[2], inputs[1]]) - model = Model(input=inputs, output=gan_V) return model - def _buildDQN(self,encoder_model): """ Build a network consistent with each type of inputs @@ -181,8 +133,11 @@ def _buildDQN(self,encoder_model): for i, dim in enumerate(self._input_dimensions): input = Input(shape=(dim[0],dim[1],dim[2])) inputs.append(input) + + input = Input(shape=(self.internal_dim,)) + inputs.append(input) - out = encoder_model(inputs) + out = encoder_model(inputs[:-1]) outs_conv.append(out) @@ -199,6 +154,8 @@ def _buildDQN(self,encoder_model): else: x= outs_conv [0] + x = Add()([x,inputs[-1]]) + # we stack a deep fully-connected network on top x = Dense(50, activation='relu')(x) x = Dense(20, activation='relu')(x) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 605b32a1..aff09a53 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -11,6 +11,9 @@ from ..base_classes import QNetwork from .NN_keras_lp import NN # Default Neural network used +def mean_squared_error_1(y_true, y_pred): + return K.abs(y_pred - y_true) + class 
MyQNetwork(QNetwork): """ Deep Q-learning network using Keras (with any backend) @@ -57,37 +60,23 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._random_state = random_state self.update_counter = 0 self.d_loss=1. + self.loss1=0 + self.loss2=0 + self.loss3=0 + + self.Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) - - Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) - - optimizer=RMSprop(lr=0.00005, rho=0.9, epsilon=1e-06) - optimizer2=RMSprop(lr=0.0001, rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) - - self.encoder = Q_net.encoder_model() - self.q_vals, self.params = Q_net._buildDQN(self.encoder) + self.encoder = self.Q_net.encoder_model() - self.generator_transition = Q_net.generator_transition_model(self.encoder) - self.discriminator = Q_net.discriminator_model() + self.q_vals, self.params = self.Q_net._buildDQN(self.encoder) - self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer2) - self.generator_transition.compile(optimizer=optimizer, - loss='mae', - metrics=['accuracy']) - self.encoder.compile(optimizer=optimizer, - loss='mae', - metrics=['accuracy']) - - # Full models have to be instatiated and compiled afterwards (we put the discriminator weights to non-trainable) - self.full_trans = Q_net.full_model_trans(self.generator_transition, self.encoder, self.discriminator) - self.full_enc = Q_net.full_model_enc(self.encoder, self.discriminator) - self.full_trans.compile(loss='binary_crossentropy', optimizer=optimizer) - self.full_enc.compile(loss='binary_crossentropy', optimizer=optimizer) - + self.generator_transition = self.Q_net.generator_transition_model(self.encoder) + self.generator_diff_s_s_ = self.Q_net.generator_diff_s_s_(self.encoder) + self._compile() - self.next_q_vals, self.next_params = Q_net._buildDQN(self.encoder) + self.next_q_vals, self.next_params = self.Q_net._buildDQN(self.encoder) self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals self._resetQHat() @@ -126,7 +115,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals #print "self.discriminator.get_weights()" #print self.discriminator.get_weights() - noise = np.random.uniform(-1,1,size=(self._batch_size,5)) #self._rand_vect_size=5 ##print "[states_val[0],actions_val,noise]" ##print [states_val[0],actions_val,noise] ##print "states_val.tolist()" @@ -136,7 +124,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals #print onehot_actions #print "[states_val[0],onehot_actions,noise]" #print [states_val[0],onehot_actions,noise] - ETs=self.generator_transition.predict([states_val[0],onehot_actions,noise]) + ETs=self.generator_transition.predict([states_val[0],onehot_actions]) Es_=self.encoder.predict([next_states_val[0]]) Es=self.encoder.predict([states_val[0]]) @@ -148,53 +136,27 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print actions_val, rewards_val, terminals_val print "Es" print Es - print "X" + print "ETs,Es_" print ETs,Es_ #print "disc" #print self.discriminator.predict([X, np.tile(onehot_actions,(2,1))]) - print "full trans" - print self.full_trans.predict([states_val[0], onehot_actions, noise, Es]) - print "full enc" - print self.full_enc.predict([next_states_val[0], onehot_actions, Es]) - y = [1] * self._batch_size + [0] * 
self._batch_size # first batch size is ETs and second is Es' - - noise_to_avoid_too_easy_disc=np.random.normal(size=X.shape)*(0.7-min(self.d_loss,0.7))*0.5#*100/max(100,epoch) - self.d_loss=0 - for i in range(5): + self.loss1+=self.generator_transition.train_on_batch([states_val[0],onehot_actions] , Es_ ) + self.loss2+=self.encoder.train_on_batch(next_states_val[0], ETs ) - loss = self.discriminator.train_on_batch([X+noise_to_avoid_too_easy_disc, np.tile(Es,(2,1)), np.tile(onehot_actions,(2,1))], y) - self.d_loss += loss - if loss < 0.01: - break - self.d_loss=self.d_loss/(i+1) - - if(self.update_counter%100==0): - print "d_loss" - print self.d_loss - - # Training generator ETs - #print "self.discriminator.get_weights()1" - #print self.discriminator.get_weights() + self.loss3+=self.generator_diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*1) - g_loss1 = self.full_trans.train_on_batch([states_val[0], onehot_actions, noise, Es], [0] * self._batch_size) # ETs should look like Es_ - - # Training generator Es' - g_loss2 = self.full_enc.train_on_batch([next_states_val[0], onehot_actions, Es], [1] * self._batch_size) # Es_ should look like ETs - - #print "self.discriminator.get_weights()2" - #print self.discriminator.get_weights() - if(self.update_counter%100==0): - print "g_losses" - print g_loss1 - print g_loss2 - + print "losses" + print self.loss1/100.,self.loss2/100.,self.loss3/100. + self.loss1=0 + self.loss2=0 + self.loss3=0 if self.update_counter % self._freeze_interval == 0: self._resetQHat() - next_q_vals = self.next_q_vals.predict(next_states_val.tolist()) + next_q_vals = self.next_q_vals.predict([next_states_val[0],np.zeros((32,self.Q_net.internal_dim))]) if(self._double_Q==True): next_q_vals_current_qnet=self.q_vals.predict(next_states_val.tolist()) @@ -207,7 +169,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) - q_vals=self.q_vals.predict(states_val.tolist()) + q_vals=self.q_vals.predict([states_val[0],np.zeros((32,self.Q_net.internal_dim))]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) @@ -221,7 +183,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # My loss should only take these into account. 
# Workaround here is that many values are already "exact" in this update #if (self.update_counter<10000): - loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) + noise_to_be_robust=np.random.normal(size=(32,self.Q_net.internal_dim))*0.25 + + loss=self.q_vals.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) #print "self.q_vals.optimizer.lr" #print K.eval(self.q_vals.optimizer.lr) @@ -245,7 +209,7 @@ def qValues(self, state_val): ------- The q values for the provided belief state """ - return self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val])[0] + return self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((32,self.Q_net.internal_dim))])[0] def chooseBestAction(self, state): """ Get the best action for a belief state @@ -273,7 +237,25 @@ def _compile(self): raise Exception('The update_rule '+self._update_rule+' is not implemented.') self.q_vals.compile(optimizer=optimizer, loss='mse') + + + optimizer=RMSprop(lr=self._lr/20., rho=0.9, epsilon=1e-06) + optimizer2=RMSprop(lr=self._lr/10., rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) + + self.generator_transition.compile(optimizer=optimizer, + loss='mae') + #metrics=['accuracy']) + self.encoder.compile(optimizer=optimizer, + loss='mae') + #metrics=['accuracy']) + self.generator_diff_s_s_.compile(optimizer=optimizer2, + loss=mean_squared_error_1) + #metrics=['accuracy']) def _resetQHat(self): for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): K.set_value(next_param,K.get_value(param)) + + self._compile() # recompile to take into account new optimizer parameters that may have changed since + # self._compile() was called in __init__. FIXME: this call should ideally be done elsewhere + \ No newline at end of file diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index ae60587d..328bf2f5 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -32,6 +32,7 @@ def __init__(self, rng, game=None, frame_skip=2, raise ValueError("Game must be provided") self._ple = PLE(game, **ple_options) + self._ple.game.rng = rng self._ple.init() w, h = self._ple.getScreenDims() @@ -46,14 +47,18 @@ def reset(self, mode): self._mode = MyEnv.VALIDATION_MODE self._mode_score = 0.0 self._mode_episode_count = 0 + # fix the seed for every new validation. It potentially removes one source of variance and + # it allows to show some illustration of the learning for the same setting in validation + self._ple.game.rng = np.random.RandomState(23) # 23:left, center, right, ... 
else: self._mode_episode_count += 1 elif self._mode != -1: # and thus mode == -1 self._mode = -1 - + + self._ple.reset_game() - for _ in range(self._random_state.randint(15)): - self._ple.act(self._ple.NOOP) + #for _ in range(self._ple.rng.randint(15)): + # self._ple.act(self._ple.NOOP) self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) @@ -63,6 +68,9 @@ def reset(self, mode): def act(self, action): action = self._actions[action] + #if self._mode == MyEnv.VALIDATION_MODE: + # action=0 + reward = 0 for _ in range(self._frame_skip): reward += self._ple.act(action) @@ -75,10 +83,52 @@ def act(self, action): self._mode_score += reward return np.sign(reward) - def summarizePerformance(self, test_data_set): + def summarizePerformance(self, test_data_set, learning_algo): + #print "test_data_set.observations.shape" + #print test_data_set.observations()[0][0:1] + n=20 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + if(i0): + historics[i-1]=np.concatenate([historics[i-1],np.expand_dims(observ,axis=0)], axis=0) + historics=np.array(historics) + #print historics + abs_states=learning_algo.encoder.predict(historics) + print abs_states + actions=test_data_set.actions()[0:n] + print actions + print test_data_set.rewards()[0:n] if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) + + + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.cm as cm + m = cm.ScalarMappable(cmap=cm.jet) + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + z = np.array(abs_states)[:,2] + + #Colors + #onehot_actions = np.zeros((n, 4)) + #onehot_actions[np.arange(n), actions] = 1 + + fig = plt.figure() + ax = fig.add_subplot(111,projection='3d') + for i in xrange(n-1): + ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.jet(255*i/n)) + #line = ax.contour(x, y ,z, cmap=cm.coolwarm) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(actions/2.,axis=1),(1,3)), s=50, marker='o', edgecolors='none', depthshade=False) + #m.set_array(actions/2.) 
+ #plt.colorbar(m) + + #plt.show() + plt.savefig('fig'+str(learning_algo.update_counter)+'.pdf') def inputDimensions(self): diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index f9e3bb7a..a3d2aba0 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -39,7 +39,7 @@ class Defaults: # ---------------------- UPDATE_RULE = 'rmsprop' LEARNING_RATE = 0.002 - LEARNING_RATE_DECAY = 0.99 + LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 DISCOUNT_MAX = 0.99 From 35a1489ac067365f53500205aa5e80657c6a5359 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 5 Dec 2017 10:35:14 -0500 Subject: [PATCH 08/96] adding plot + disentgling t and a + pred R + first draft planning best action --- deer/q_networks/NN_keras_lp.py | 216 ++++++++++++++++++++++-------- deer/q_networks/q_net_keras_lp.py | 128 ++++++++++++------ examples/PLE/PLE_env.py | 136 ++++++++++++++++--- 3 files changed, 371 insertions(+), 109 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 70822a63..5cf7eb42 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -6,7 +6,7 @@ import numpy as np from keras import backend as K from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, add, Dot, Multiply, Average, Lambda +from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate from keras import regularizers np.random.seed(102912) @@ -31,7 +31,7 @@ def __init__(self, batch_size, input_dimensions, n_actions, random_state, action self._random_state=random_state self._n_actions=n_actions self._action_as_input=action_as_input - self.internal_dim=5 # size random vector + self.internal_dim=3 # size random vector self._rand_vect_size=5 # size output distribution def encoder_model(self): @@ -60,21 +60,46 @@ def encoder_model(self): x = Flatten()(x) x = Dense(20, activation='relu')(x) + x = Dense(10, activation='relu')(x) - x = Dense(self.internal_dim, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - model = Model(input=inputs, output=x) + model = Model(inputs=inputs, outputs=x) return model - def generator_transition_model(self,encoder_model): + def transition_model(self): + """ + + Parameters + ----------- + x + a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x + + x = Concatenate()(inputs)#,axis=-1) + x = Dense(20, activation='relu')(x) + x = Dense(20, activation='relu')(x) + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Add()([inputs[0],x]) + + model = Model(inputs=inputs, outputs=x) + + return model + + def full_transition_model(self,encoder_model,transition_model): """ Parameters ----------- s a - random z Returns ------- @@ -85,17 +110,13 @@ def generator_transition_model(self,encoder_model): enc_x = encoder_model(inputs[0]) #s --> x - x = merge([enc_x]+inputs[1:],mode='concat',concat_axis=-1) - x = Dense(20, activation='relu')(x) - #x = Dense(20, activation='relu')(x) - x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - x = Add()([enc_x,x]) + x = 
transition_model([enc_x]+inputs[1:]) - model = Model(input=inputs, output=x) + model = Model(inputs=inputs, outputs=x) return model - def generator_diff_s_s_(self,encoder_model): + def diff_s_s_(self,encoder_model): """ Parameters @@ -117,13 +138,134 @@ def generator_diff_s_s_(self,encoder_model): x = Subtract()([enc_x,enc_x_]) x = Dot(axes=-1, normalize=False)([x,x]) - model = Model(input=inputs, output=x ) + model = Model(inputs=inputs, outputs=x ) + + return model + + def diff_Tx(self,transition_model): + """ + + Parameters + ----------- + x + a + x + a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ), Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) )] #x,a,x,a + + #identity_mat=inputs[2]#K.constant(np.diag(np.ones(self._n_actions)), name="identity_mat") + Tx = transition_model(inputs[:2]) + Tx2 = transition_model(inputs[2:]) + + #tile_x=K.tile(inputs[0],(self._n_actions,1)) + #Tx_ = transition_model([tile_x]+[identity_mat]) + + x = Subtract()([Tx,Tx2]) + x = Dot(axes=-1, normalize=False)([x,x]) + + model = Model(inputs=inputs, outputs=x ) return model - def _buildDQN(self,encoder_model): + def R_model(self): """ Build a network consistent with each type of inputs + + Parameters + ----------- + x + a + + Returns + ------- + r + """ + + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x + + x = Concatenate()(inputs[:1]+inputs[1:])#,axis=-1) + x = Dense(20, activation='relu')(inputs[0]) + #x = Dense(10, activation='relu')(inputs[0]) + + out = Dense(1)(x) + + model = Model(inputs=inputs, outputs=out) + + return model + + def full_R_model(self,encoder_model,R_model): + """ + Maps internal state to immediate rewards + + Parameters + ----------- + s + a + (noise in abstract state space) : FIXME + + Returns + ------- + r + """ + + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ) ] #s,a + + enc_x = encoder_model(inputs[0]) #s --> x + + out = R_model([enc_x]+inputs[1:]) + + model = Model(inputs=inputs, outputs=out) + + return model + + def Q_model(self): + + inputs = [ Input( shape=(self.internal_dim,) ) ] #x + + #if (self._action_as_input==True): + # if ( isinstance(self._n_actions,int)): + # print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") + # else: + # input = Input(shape=(len(self._n_actions),)) + # inputs.append(input) + + #x = Add()([x,inputs[-1]]) #???? 
+ + # we stack a deep fully-connected network on top + x = Dense(50, activation='relu')(inputs[0]) + x = Dense(20, activation='relu')(x) + + #if (self._action_as_input==False): + # if ( isinstance(self._n_actions,int)): + out = Dense(self._n_actions)(x) + # else: + # out = Dense(len(self._n_actions))(x) + #else: + # out = Dense(1)(x) + + model = Model(inputs=inputs, outputs=out) + + return model + + + def full_Q_model(self, encoder_model, Q_model): + """ + Build a network consistent with each type of inputs + + Parameters + ----------- + s + noise in abstract state space + + Returns + ------- + model with output Tx (= model estimate of x') """ layers=[] outs_conv=[] @@ -138,48 +280,14 @@ def _buildDQN(self,encoder_model): inputs.append(input) out = encoder_model(inputs[:-1]) + + x=Add()([out,inputs[-1]]) # adding noise in the abstract state space - outs_conv.append(out) + out = Q_model(out) - if (self._action_as_input==True): - if ( isinstance(self._n_actions,int)): - print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") - else: - input = Input(shape=(len(self._n_actions),)) - inputs.append(input) - outs_conv.append(input) - - if len(outs_conv)>1: - x = merge(outs_conv, mode='concat') - else: - x= outs_conv [0] + model = Model(inputs=inputs, outputs=out) - x = Add()([x,inputs[-1]]) - - # we stack a deep fully-connected network on top - x = Dense(50, activation='relu')(x) - x = Dense(20, activation='relu')(x) - - if (self._action_as_input==False): - if ( isinstance(self._n_actions,int)): - out = Dense(self._n_actions)(x) - else: - out = Dense(len(self._n_actions))(x) - else: - out = Dense(1)(x) - - model = Model(input=inputs, output=out) - layers=model.layers - - # Grab all the parameters together. - params = [ param - for layer in layers - for param in layer.trainable_weights ] - - if (self._action_as_input==True): - return model, params, inputs - else: - return model, params + return model if __name__ == '__main__': pass diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index aff09a53..6fb4cbbb 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -54,7 +54,6 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._rms_epsilon = rms_epsilon self._momentum = momentum self._update_rule = update_rule - #self.clip_delta = clip_delta self._freeze_interval = freeze_interval self._double_Q = double_Q self._random_state = random_state @@ -62,22 +61,41 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.d_loss=1. 
self.loss1=0 self.loss2=0 - self.loss3=0 + self.loss_disentangle_t=0 + self.loss_disentangle_a=0 + self.lossR=0 - self.Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) + self.encoder = self.learn_and_plan.encoder_model() + self.Q = self.learn_and_plan.Q_model() + self.R = self.learn_and_plan.R_model() + self.transition = self.learn_and_plan.transition_model() - self.encoder = self.Q_net.encoder_model() + self.full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) + self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) - self.q_vals, self.params = self.Q_net._buildDQN(self.encoder) - - self.generator_transition = self.Q_net.generator_transition_model(self.encoder) - self.generator_diff_s_s_ = self.Q_net.generator_diff_s_s_(self.encoder) + self.full_transition = self.learn_and_plan.full_transition_model(self.encoder,self.transition) + self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) + self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) + + layers=self.full_Q.layers + # Grab all the parameters together. + self.params = [ param + for layer in layers + for param in layer.trainable_weights ] + self._compile() - self.next_q_vals, self.next_params = self.Q_net._buildDQN(self.encoder) - self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals + self.next_full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) + self.next_full_Q.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q + + layers=self.next_full_Q.layers + # Grab all the parameters together. 
+ self.next_params = [ param + for layer in layers + for param in layer.trainable_weights ] self._resetQHat() @@ -112,19 +130,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Individual (square) losses for each tuple """ - #print "self.discriminator.get_weights()" - #print self.discriminator.get_weights() - - ##print "[states_val[0],actions_val,noise]" - ##print [states_val[0],actions_val,noise] - ##print "states_val.tolist()" - ##print states_val.tolist() onehot_actions = np.zeros((self._batch_size, self._n_actions)) onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 - #print onehot_actions - #print "[states_val[0],onehot_actions,noise]" - #print [states_val[0],onehot_actions,noise] - ETs=self.generator_transition.predict([states_val[0],onehot_actions]) + ETs=self.full_transition.predict([states_val[0],onehot_actions]) Es_=self.encoder.predict([next_states_val[0]]) Es=self.encoder.predict([states_val[0]]) @@ -132,34 +140,44 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals X = np.concatenate((ETs, Es_)) if(self.update_counter%100==0): print states_val[0][0] + print "len(states_val)" + print len(states_val) print next_states_val[0][0] print actions_val, rewards_val, terminals_val print "Es" print Es print "ETs,Es_" print ETs,Es_ - #print "disc" - #print self.discriminator.predict([X, np.tile(onehot_actions,(2,1))]) - self.loss1+=self.generator_transition.train_on_batch([states_val[0],onehot_actions] , Es_ ) + self.loss1+=self.full_transition.train_on_batch([states_val[0],onehot_actions] , Es_ ) self.loss2+=self.encoder.train_on_batch(next_states_val[0], ETs ) - self.loss3+=self.generator_diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*1) + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*2) + + # Loss to have all s' following s,a with a to a distance 1 of s,a) + tiled_x=np.tile(Es,(self._n_actions,1)) + tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) + tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) + self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(32*self._n_actions)) + + self.lossR+=self.full_R.train_on_batch([states_val[0],onehot_actions], rewards_val) if(self.update_counter%100==0): print "losses" - print self.loss1/100.,self.loss2/100.,self.loss3/100. + print self.loss1/100.,self.loss2/100.,self.loss_disentangle_t/100.,self.lossR/100.,self.loss_disentangle_a/100. 
self.loss1=0 self.loss2=0 - self.loss3=0 + self.loss_disentangle_t=0 + self.loss_disentangle_a=0 + self.lossR=0 if self.update_counter % self._freeze_interval == 0: self._resetQHat() - next_q_vals = self.next_q_vals.predict([next_states_val[0],np.zeros((32,self.Q_net.internal_dim))]) + next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros((32,self.learn_and_plan.internal_dim))]) if(self._double_Q==True): - next_q_vals_current_qnet=self.q_vals.predict(next_states_val.tolist()) + next_q_vals_current_qnet=self.full_Q.predict(next_states_val.tolist()) argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) else: @@ -169,7 +187,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) - q_vals=self.q_vals.predict([states_val[0],np.zeros((32,self.Q_net.internal_dim))]) + q_vals=self.full_Q.predict([states_val[0],np.zeros((32,self.learn_and_plan.internal_dim))]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) @@ -183,9 +201,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # My loss should only take these into account. # Workaround here is that many values are already "exact" in this update #if (self.update_counter<10000): - noise_to_be_robust=np.random.normal(size=(32,self.Q_net.internal_dim))*0.25 + noise_to_be_robust=np.random.normal(size=(32,self.learn_and_plan.internal_dim))*0.#25 - loss=self.q_vals.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) + loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) #print "self.q_vals.optimizer.lr" #print K.eval(self.q_vals.optimizer.lr) @@ -199,7 +217,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals def qValues(self, state_val): - """ Get the q values for one belief state + """ Get the q values for one belief state (without planning) Arguments --------- @@ -209,7 +227,34 @@ def qValues(self, state_val): ------- The q values for the provided belief state """ - return self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((32,self.Q_net.internal_dim))])[0] + return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((32,self.learn_and_plan.internal_dim))])[0] + + def qValues_planning(self, state_val, d=2.): + """ Get the q values for one belief state with a planning depth d + + Arguments + --------- + state_val : one belief state + d : planning depth + + Returns + ------- + The q values with planning depth d for the provided belief state + """ + identity_matrix = np.diag(np.ones(self._n_actions)) + + encoded_x = self.encoder.predict([np.expand_dims(state,axis=0) for state in state_val]) + + q_vals_d0=self.Q.predict([encoded_x])[0] + #print "q_vals_d0" + #print q_vals_d0 + + next_x_predicted=self.full_transition.predict([np.array([state for state in state_val for i in range(self._n_actions)])]+[identity_matrix]) + q_vals_d1=self.Q.predict([next_x_predicted]) + #print q_vals_d1 + #print (1-1/d)+(1-1/d)**2 + #print ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*np.array([np.max(vals) for vals in q_vals_d1]) + return ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*np.array([np.max(vals) for vals in q_vals_d1]) def 
chooseBestAction(self, state): """ Get the best action for a belief state @@ -222,8 +267,8 @@ def chooseBestAction(self, state): ------- The best action : int """ - q_vals = self.qValues(state) - + q_vals = self.qValues_planning(state)#self.qValues(state)# + return np.argmax(q_vals),np.max(q_vals) def _compile(self): @@ -236,19 +281,22 @@ def _compile(self): else: raise Exception('The update_rule '+self._update_rule+' is not implemented.') - self.q_vals.compile(optimizer=optimizer, loss='mse') - + self.full_Q.compile(optimizer=optimizer, loss='mse') + self.full_R.compile(optimizer=optimizer, loss='mse') optimizer=RMSprop(lr=self._lr/20., rho=0.9, epsilon=1e-06) optimizer2=RMSprop(lr=self._lr/10., rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) - self.generator_transition.compile(optimizer=optimizer, + self.full_transition.compile(optimizer=optimizer, loss='mae') #metrics=['accuracy']) self.encoder.compile(optimizer=optimizer, loss='mae') #metrics=['accuracy']) - self.generator_diff_s_s_.compile(optimizer=optimizer2, + self.diff_s_s_.compile(optimizer=optimizer2, + loss=mean_squared_error_1) + #metrics=['accuracy']) + self.diff_Tx.compile(optimizer=optimizer, loss=mean_squared_error_1) #metrics=['accuracy']) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 328bf2f5..a8addc70 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -66,30 +66,32 @@ def reset(self, mode): def act(self, action): - action = self._actions[action] - + #print action + #print self._actions #if self._mode == MyEnv.VALIDATION_MODE: # action=0 + action = self._actions[action] + - reward = 0 + self.reward = 0 for _ in range(self._frame_skip): - reward += self._ple.act(action) + self.reward += self._ple.act(action) if self.inTerminalState(): break self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) - self._mode_score += reward - return np.sign(reward) + self._mode_score += self.reward + return np.sign(self.reward) def summarizePerformance(self, test_data_set, learning_algo): #print "test_data_set.observations.shape" #print test_data_set.observations()[0][0:1] n=20 historics=[] - for i,observ in enumerate(test_data_set.observations()[0][0:n]): - if(i0): historics[i-1]=np.concatenate([historics[i-1],np.expand_dims(observ,axis=0)], axis=0) @@ -97,7 +99,7 @@ def summarizePerformance(self, test_data_set, learning_algo): #print historics abs_states=learning_algo.encoder.predict(historics) print abs_states - actions=test_data_set.actions()[0:n] + actions=test_data_set.actions()[1:n+1] #instead of 0:n because history of 2 time steps considered print actions print test_data_set.rewards()[0:n] if self.inTerminalState() == False: @@ -118,17 +120,116 @@ def summarizePerformance(self, test_data_set, learning_algo): #onehot_actions = np.zeros((n, 4)) #onehot_actions[np.arange(n), actions] = 1 + # Plot the trajectory fig = plt.figure() ax = fig.add_subplot(111,projection='3d') for i in xrange(n-1): - ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.jet(255*i/n)) - #line = ax.contour(x, y ,z, cmap=cm.coolwarm) - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(actions/2.,axis=1),(1,3)), s=50, marker='o', edgecolors='none', depthshade=False) - #m.set_array(actions/2.) 
+ ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.cool(255*i/n), alpha=0.5) + + # Plot the colorbar for the trajectory + fig.subplots_adjust(right=0.7) + ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) + # Set the colormap and norm to correspond to the data for which the colorbar will be used. + cmap = matplotlib.cm.cool + norm = matplotlib.colors.Normalize(vmin=0, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has + # everything needed for a standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, + norm=norm, + orientation='vertical') + cb1.set_label('Beginning to end of trajectory') + + + # Plot the dots at each time step depending on the action taken + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(actions/2.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + zrange=axes_lims[2][1]-axes_lims[2][0] + + # Plot the legend for the dots + from matplotlib.patches import Circle + from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker + box1 = TextArea(" Actions (right, left and none) : ", textprops=dict(color="k")) + + box2 = DrawingArea(60, 20, 0, 0) + el1 = Circle((10, 10), 5, fc="k", edgecolor="k") + el2 = Circle((30, 10), 5, fc="grey", edgecolor="k") + el3 = Circle((50, 10), 5, fc="w", edgecolor="k") + box2.add_artist(el1) + box2.add_artist(el2) + box2.add_artist(el3) + + box = HPacker(children=[box1, box2], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=box, pad=0., + frameon=True, + bbox_to_anchor=(0., 1.02), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + + + # Plot the Q_vals + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + print "actions,C" + print actions + print c + #c=np.max(c,axis=1) + m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + m3=ax.scatter(x, y, z+zrange/10, c=c[:,2], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + + #plt.colorbar(m3) + ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.RdYlGn + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. 
+ cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') + + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + print x + print np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + c=np.max(c,axis=1) + print c + + m=ax.scatter(x, y, z, c=c, cmap=plt.hot()) #plt.colorbar(m) - + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + #plt.show() - plt.savefig('fig'+str(learning_algo.update_counter)+'.pdf') + plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') def inputDimensions(self): @@ -144,6 +245,11 @@ def observe(self): return [np.array(self._reduced_screen)/256.] def inTerminalState(self): + #if (self.reward!=0): + # # If a reward has been observed, end the episode + # print "end!!" 
+ # return True + #else: return self._ple.game_over() From 81f495838cc047b08696848448389585116e9267 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 5 Dec 2017 12:46:46 -0500 Subject: [PATCH 09/96] add R visualisation and fix qValues_planning --- deer/q_networks/q_net_keras_lp.py | 11 ++++++-- examples/PLE/PLE_env.py | 44 ++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 6fb4cbbb..691eca6d 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -248,13 +248,20 @@ def qValues_planning(self, state_val, d=2.): q_vals_d0=self.Q.predict([encoded_x])[0] #print "q_vals_d0" #print q_vals_d0 + #tile3_encoded_x=np.array([enc for enc in encoded_x for i in range(self._n_actions)]) + tile3_encoded_x=np.tile(encoded_x,(3,1)) + print tile3_encoded_x + r_vals_d0=self.R.predict([tile3_encoded_x,identity_matrix]) - next_x_predicted=self.full_transition.predict([np.array([state for state in state_val for i in range(self._n_actions)])]+[identity_matrix]) + #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) + tile3_state_val=np.tile(state_val,(3,1,1,1)) + + next_x_predicted=self.full_transition.predict([tile3_state_val,identity_matrix]) q_vals_d1=self.Q.predict([next_x_predicted]) #print q_vals_d1 #print (1-1/d)+(1-1/d)**2 #print ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*np.array([np.max(vals) for vals in q_vals_d1]) - return ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*np.array([np.max(vals) for vals in q_vals_d1]) + return ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*(r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1])) def chooseBestAction(self, state): """ Get the best action for a belief state diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index a8addc70..933a23f2 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -201,6 +201,8 @@ def summarizePerformance(self, test_data_set, learning_algo): plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') + + # fig_visuV fig = plt.figure() ax = fig.add_subplot(111, projection='3d') @@ -211,9 +213,10 @@ def summarizePerformance(self, test_data_set, learning_algo): print np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) c=np.max(c,axis=1) + print "c" print c - m=ax.scatter(x, y, z, c=c, cmap=plt.hot()) + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) #plt.colorbar(m) fig.subplots_adjust(right=0.8) ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) @@ -232,6 +235,45 @@ def summarizePerformance(self, test_data_set, learning_algo): plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') + # fig_visuR + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + print x + coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) + repeat3_coord=np.repeat(coords,3,axis=0) + 
identity_matrix = np.diag(np.ones(self.nActions())) + tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) + print tile_identity_matrix + c = learning_algo.R.predict([repeat3_coord,tile_identity_matrix]) + c=np.max(np.reshape(c,(125,3)),axis=1) + print "c" + print c + mini=np.min(c) + maxi=np.max(c) + + m=ax.scatter(x, y, z, c=c, cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + + def inputDimensions(self): return [(2, 48, 48)] From 942d9421e8210212387758daeb4a60776f056e24 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 5 Dec 2017 13:46:39 -0500 Subject: [PATCH 10/96] add visualisation of planning --- examples/PLE/PLE_env.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 933a23f2..2ce41373 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -105,8 +105,8 @@ def summarizePerformance(self, test_data_set, learning_algo): if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) + - import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D import matplotlib.cm as cm @@ -126,6 +126,30 @@ def summarizePerformance(self, test_data_set, learning_algo): for i in xrange(n-1): ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.cool(255*i/n), alpha=0.5) + # Plot the fitted one-step trajectory from time t=10 + one_hot_a=np.zeros((1,3)) + one_hot_a[0,actions[10:11]]=1 + print "learning_algo.transition" + print [abs_states[10:11],one_hot_a] + predicted=learning_algo.transition.predict([abs_states[10:11],one_hot_a]) + print predicted + predicted1=learning_algo.transition.predict([abs_states[10:11],np.array([[1,0,0]])]) + predicted2=learning_algo.transition.predict([abs_states[10:11],np.array([[0,1,0]])]) + predicted3=learning_algo.transition.predict([abs_states[10:11],np.array([[0,0,1]])]) + print "predicted1,predicted2,predicted3" + print predicted1,predicted2,predicted3 + i=10 + print x[i:i+1] + print predicted[0,:1] + print np.concatenate([x[i:i+1],predicted[0,:1]]) + print predicted[0,1:2] + print predicted[0,2:] + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:]]), color="1") + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:]]), color="0.5") + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:]]), color="0") + #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") + + # Plot the colorbar for the trajectory 
fig.subplots_adjust(right=0.7) ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) From 3f0190850635575fba67c555efa40d41a46e6a5c Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 5 Dec 2017 19:26:48 -0500 Subject: [PATCH 11/96] clean --- examples/PLE/PLE_env.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 2ce41373..175945d7 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -129,21 +129,11 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the fitted one-step trajectory from time t=10 one_hot_a=np.zeros((1,3)) one_hot_a[0,actions[10:11]]=1 - print "learning_algo.transition" - print [abs_states[10:11],one_hot_a] predicted=learning_algo.transition.predict([abs_states[10:11],one_hot_a]) - print predicted predicted1=learning_algo.transition.predict([abs_states[10:11],np.array([[1,0,0]])]) predicted2=learning_algo.transition.predict([abs_states[10:11],np.array([[0,1,0]])]) predicted3=learning_algo.transition.predict([abs_states[10:11],np.array([[0,0,1]])]) - print "predicted1,predicted2,predicted3" - print predicted1,predicted2,predicted3 i=10 - print x[i:i+1] - print predicted[0,:1] - print np.concatenate([x[i:i+1],predicted[0,:1]]) - print predicted[0,1:2] - print predicted[0,2:] ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:]]), color="1") ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:]]), color="0.5") ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:]]), color="0") @@ -202,9 +192,9 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the Q_vals c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - print "actions,C" - print actions - print c + #print "actions,C" + #print actions + #print c #c=np.max(c,axis=1) m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) @@ -233,12 +223,11 @@ def summarizePerformance(self, test_data_set, learning_algo): x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - print x - print np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) c=np.max(c,axis=1) - print "c" - print c + #print "c" + #print c m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) #plt.colorbar(m) @@ -266,16 +255,16 @@ def summarizePerformance(self, test_data_set, learning_algo): x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] z = np.array([k for i in range(5) for j in range(5) for k in 
range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - print x + coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) repeat3_coord=np.repeat(coords,3,axis=0) identity_matrix = np.diag(np.ones(self.nActions())) tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) - print tile_identity_matrix + c = learning_algo.R.predict([repeat3_coord,tile_identity_matrix]) c=np.max(np.reshape(c,(125,3)),axis=1) - print "c" - print c + #print "c" + #print c mini=np.min(c) maxi=np.max(c) From 8f1d08d312fa4f73612e26f4ee5009aa4d790e67 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 6 Dec 2017 14:58:13 -0500 Subject: [PATCH 12/96] avoiding memory leaks --- deer/base_classes/QNetwork.py | 3 ++- deer/q_networks/q_net_keras_lp.py | 18 +++++++++++++++--- examples/PLE/PLE_env.py | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/deer/base_classes/QNetwork.py b/deer/base_classes/QNetwork.py index c416eac6..19a147c6 100644 --- a/deer/base_classes/QNetwork.py +++ b/deer/base_classes/QNetwork.py @@ -40,6 +40,8 @@ def qValues(self, state): def setLearningRate(self, lr): """ Setting the learning rate + NB: The learning rate has usually to be set in the optimizer, hence this function should + be overridden. Otherwise, the learning rate change is likely not to be taken into account Parameters ----------- @@ -47,7 +49,6 @@ def setLearningRate(self, lr): The learning rate that has to bet set """ self._lr = lr - self._compile() def setDiscountFactor(self, df): """ Setting the discount factor diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 691eca6d..d7a8e93f 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -311,6 +311,18 @@ def _resetQHat(self): for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): K.set_value(next_param,K.get_value(param)) - self._compile() # recompile to take into account new optimizer parameters that may have changed since - # self._compile() was called in __init__. FIXME: this call should ideally be done elsewhere - \ No newline at end of file + def setLearningRate(self, lr): + """ Setting the learning rate + + Parameters + ----------- + lr : float + The learning rate that has to bet set + """ + self._lr = lr + # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) + K.set_value(self.full_transition.optimizer.lr, self._lr/20.) + K.set_value(self.encoder.optimizer.lr, self._lr/20.) + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) + K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
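The memory-leak fix in this commit relies on the fact that a compiled Keras optimizer keeps its learning rate as a backend variable, so it can be updated in place with K.set_value instead of recompiling every model (which is what the removed self._compile() call used to do, and what the base-class docstring now warns about). Below is a minimal, standalone sketch of that idiom; the model shape and the rate values are placeholders, not the ones used in this series.

    # Minimal sketch (not part of the patch): change the learning rate of an
    # already-compiled Keras model in place, without recompiling it.
    import numpy as np
    from keras import backend as K
    from keras.models import Model
    from keras.layers import Input, Dense
    from keras.optimizers import RMSprop

    inp = Input(shape=(3,))
    model = Model(inputs=inp, outputs=Dense(1)(inp))
    model.compile(optimizer=RMSprop(lr=0.002), loss='mse')

    print(K.get_value(model.optimizer.lr))      # 0.002
    K.set_value(model.optimizer.lr, 0.0005)     # no recompile, optimizer state preserved
    print(K.get_value(model.optimizer.lr))      # 0.0005
    model.train_on_batch(np.random.rand(8, 3), np.random.rand(8, 1))  # uses the new rate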
+ diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 175945d7..4779fd9e 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -286,6 +286,7 @@ def summarizePerformance(self, test_data_set, learning_algo): #plt.show() plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + matplotlib.pyplot.close("all") # avoids memory leaks def inputDimensions(self): return [(2, 48, 48)] From 1d6d576614bbdf41753d640f9164288825350737 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 7 Dec 2017 14:35:53 -0500 Subject: [PATCH 13/96] minor modif plots --- examples/PLE/PLE_env.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 4779fd9e..fefba09d 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -127,13 +127,13 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.cool(255*i/n), alpha=0.5) # Plot the fitted one-step trajectory from time t=10 + i=18 one_hot_a=np.zeros((1,3)) - one_hot_a[0,actions[10:11]]=1 - predicted=learning_algo.transition.predict([abs_states[10:11],one_hot_a]) - predicted1=learning_algo.transition.predict([abs_states[10:11],np.array([[1,0,0]])]) - predicted2=learning_algo.transition.predict([abs_states[10:11],np.array([[0,1,0]])]) - predicted3=learning_algo.transition.predict([abs_states[10:11],np.array([[0,0,1]])]) - i=10 + one_hot_a[0,actions[i:i+1]]=1 + predicted=learning_algo.transition.predict([abs_states[i:i+1],one_hot_a]) + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) + predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:]]), color="1") ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:]]), color="0.5") ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:]]), color="0") @@ -265,15 +265,15 @@ def summarizePerformance(self, test_data_set, learning_algo): c=np.max(np.reshape(c,(125,3)),axis=1) #print "c" #print c - mini=np.min(c) - maxi=np.max(c) + #mini=np.min(c) + #maxi=np.max(c) - m=ax.scatter(x, y, z, c=c, cmap=plt.hot()) + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) #plt.colorbar(m) fig.subplots_adjust(right=0.8) ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi) + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) # ColorbarBase derives from ScalarMappable and puts a colorbar # in a specified axes, so it has everything needed for a From 783346e041ef6b69dbacde27dcace595f56fdef1 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 7 Dec 2017 17:06:40 -0500 Subject: [PATCH 14/96] new way of training the transition model --- deer/q_networks/NN_keras_lp.py | 28 ++++++++++++++++++++++++++++ deer/q_networks/q_net_keras_lp.py | 18 ++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 5cf7eb42..4bbbb202 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -93,6 +93,34 
@@ def transition_model(self): return model + def diff_Tx_x_(self,encoder_model,transition_model): + """ + + Parameters + ----------- + s + a + s' + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ) , Input( shape=(2,48,48,) ) ] #s,s' + + enc_x = encoder_model(inputs[0]) #s --> x + enc_x_ = encoder_model(inputs[2]) #s --> x + + Tx= transition_model([enc_x,inputs[1]]) + + x = Subtract()([Tx,enc_x_]) + x = Dot(axes=-1, normalize=False)([x,x]) + + model = Model(inputs=inputs, outputs=x ) + + return model + def full_transition_model(self,encoder_model,transition_model): """ diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index d7a8e93f..9ed1ccd1 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -59,6 +59,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._random_state = random_state self.update_counter = 0 self.d_loss=1. + self.loss_T=0 self.loss1=0 self.loss2=0 self.loss_disentangle_t=0 @@ -75,9 +76,11 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) + self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition)#full_transition_model(self.encoder,self.transition) self.full_transition = self.learn_and_plan.full_transition_model(self.encoder,self.transition) self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) + layers=self.full_Q.layers @@ -149,8 +152,11 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print "ETs,Es_" print ETs,Es_ - self.loss1+=self.full_transition.train_on_batch([states_val[0],onehot_actions] , Es_ ) - self.loss2+=self.encoder.train_on_batch(next_states_val[0], ETs ) + # Fit transition + self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros(32)) + +# self.loss1+=self.full_transition.train_on_batch([states_val[0],onehot_actions] , Es_ ) +# self.loss2+=self.encoder.train_on_batch(next_states_val[0], ETs ) self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*2) @@ -164,7 +170,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%100==0): print "losses" - print self.loss1/100.,self.loss2/100.,self.loss_disentangle_t/100.,self.lossR/100.,self.loss_disentangle_a/100. + print self.loss_T/100.,self.loss_disentangle_t/100.,self.lossR/100.,self.loss_disentangle_a/100. 
self.loss1=0 self.loss2=0 self.loss_disentangle_t=0 @@ -250,7 +256,7 @@ def qValues_planning(self, state_val, d=2.): #print q_vals_d0 #tile3_encoded_x=np.array([enc for enc in encoded_x for i in range(self._n_actions)]) tile3_encoded_x=np.tile(encoded_x,(3,1)) - print tile3_encoded_x + #print tile3_encoded_x r_vals_d0=self.R.predict([tile3_encoded_x,identity_matrix]) #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) @@ -294,6 +300,9 @@ def _compile(self): optimizer=RMSprop(lr=self._lr/20., rho=0.9, epsilon=1e-06) optimizer2=RMSprop(lr=self._lr/10., rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) + self.diff_Tx_x_.compile(optimizer=optimizer, + loss='mae') + #metrics=['accuracy']) self.full_transition.compile(optimizer=optimizer, loss='mae') #metrics=['accuracy']) @@ -321,6 +330,7 @@ def setLearningRate(self, lr): """ self._lr = lr # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) + K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) K.set_value(self.full_transition.optimizer.lr, self._lr/20.) K.set_value(self.encoder.optimizer.lr, self._lr/20.) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) From ee6db575bfff9b93a40e6bece9a4ba5a7feece1f Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 7 Dec 2017 17:07:10 -0500 Subject: [PATCH 15/96] minor fixes --- deer/q_networks/q_net_keras_lp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 9ed1ccd1..d1bdbca2 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -171,6 +171,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%100==0): print "losses" print self.loss_T/100.,self.loss_disentangle_t/100.,self.lossR/100.,self.loss_disentangle_a/100. 
+ self.loss_T=0 self.loss1=0 self.loss2=0 self.loss_disentangle_t=0 @@ -262,7 +263,8 @@ def qValues_planning(self, state_val, d=2.): #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) tile3_state_val=np.tile(state_val,(3,1,1,1)) - next_x_predicted=self.full_transition.predict([tile3_state_val,identity_matrix]) + #next_x_predicted=self.full_transition.predict([tile3_state_val,identity_matrix]) + next_x_predicted=self.transition.predict([tile3_encoded_x,identity_matrix]) q_vals_d1=self.Q.predict([next_x_predicted]) #print q_vals_d1 #print (1-1/d)+(1-1/d)**2 From 3d2a5108448ff95ebfacf8087c51865c4bc37d5f Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 7 Dec 2017 17:12:27 -0500 Subject: [PATCH 16/96] clean --- deer/q_networks/NN_keras_lp.py | 23 ----------------------- deer/q_networks/q_net_keras_lp.py | 14 +------------- 2 files changed, 1 insertion(+), 36 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 4bbbb202..f1032cfc 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -121,29 +121,6 @@ def diff_Tx_x_(self,encoder_model,transition_model): return model - def full_transition_model(self,encoder_model,transition_model): - """ - - Parameters - ----------- - s - a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ) ] #s,a - - enc_x = encoder_model(inputs[0]) #s --> x - - x = transition_model([enc_x]+inputs[1:]) - - model = Model(inputs=inputs, outputs=x) - - return model - def diff_s_s_(self,encoder_model): """ diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index d1bdbca2..fe45b030 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -58,10 +58,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._double_Q = double_Q self._random_state = random_state self.update_counter = 0 - self.d_loss=1. 
self.loss_T=0 - self.loss1=0 - self.loss2=0 self.loss_disentangle_t=0 self.loss_disentangle_a=0 self.lossR=0 @@ -77,7 +74,6 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition)#full_transition_model(self.encoder,self.transition) - self.full_transition = self.learn_and_plan.full_transition_model(self.encoder,self.transition) self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) @@ -135,9 +131,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions = np.zeros((self._batch_size, self._n_actions)) onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 - ETs=self.full_transition.predict([states_val[0],onehot_actions]) Es_=self.encoder.predict([next_states_val[0]]) Es=self.encoder.predict([states_val[0]]) + ETs=self.transition.predict([Es,onehot_actions]) X = np.concatenate((ETs, Es_)) @@ -155,9 +151,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Fit transition self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros(32)) -# self.loss1+=self.full_transition.train_on_batch([states_val[0],onehot_actions] , Es_ ) -# self.loss2+=self.encoder.train_on_batch(next_states_val[0], ETs ) - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*2) # Loss to have all s' following s,a with a to a distance 1 of s,a) @@ -263,7 +256,6 @@ def qValues_planning(self, state_val, d=2.): #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) tile3_state_val=np.tile(state_val,(3,1,1,1)) - #next_x_predicted=self.full_transition.predict([tile3_state_val,identity_matrix]) next_x_predicted=self.transition.predict([tile3_encoded_x,identity_matrix]) q_vals_d1=self.Q.predict([next_x_predicted]) #print q_vals_d1 @@ -305,9 +297,6 @@ def _compile(self): self.diff_Tx_x_.compile(optimizer=optimizer, loss='mae') #metrics=['accuracy']) - self.full_transition.compile(optimizer=optimizer, - loss='mae') - #metrics=['accuracy']) self.encoder.compile(optimizer=optimizer, loss='mae') #metrics=['accuracy']) @@ -333,7 +322,6 @@ def setLearningRate(self, lr): self._lr = lr # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) - K.set_value(self.full_transition.optimizer.lr, self._lr/20.) K.set_value(self.encoder.optimizer.lr, self._lr/20.) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
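The three commits above ("new way of training the transition model", "minor fixes", "clean") replace the earlier scheme, where full_transition was fitted toward Es_ and the encoder toward ETs in two separate steps, with a single composite model diff_Tx_x_ that outputs the mismatch between the predicted next abstract state T(e(s), a) and the encoded next state e(s'), and is regressed toward zero so the encoder and the transition are trained jointly on observed (s, a, s') tuples. The sketch below illustrates that construction with placeholder sizes (layer widths, dimensions, batch size and learning rate are illustrative only); it already uses the raw difference vector as output, the form adopted in the next commit, "fix for stability".

    # Minimal sketch (placeholder sizes, not the ones used in this series): a composite
    # "difference" model that outputs T(e(s), a) - e(s') and is regressed toward zero.
    import numpy as np
    from keras.models import Model
    from keras.layers import Input, Dense, Concatenate, Add, Subtract
    from keras.optimizers import RMSprop

    obs_dim, n_actions, abstract_dim, batch = 8, 3, 2, 32

    # encoder e: observation -> abstract state
    s = Input(shape=(obs_dim,))
    h = Dense(16, activation='relu')(s)
    encoder = Model(inputs=s, outputs=Dense(abstract_dim)(h))

    # transition T: (abstract state, one-hot action) -> next abstract state (residual form)
    x, a = Input(shape=(abstract_dim,)), Input(shape=(n_actions,))
    dx = Dense(abstract_dim)(Dense(16, activation='relu')(Concatenate()([x, a])))
    transition = Model(inputs=[x, a], outputs=Add()([x, dx]))

    # diff model: T(e(s), a) - e(s'), trained toward a zero target of shape (batch, abstract_dim)
    s_t, a_t, s_tp1 = Input(shape=(obs_dim,)), Input(shape=(n_actions,)), Input(shape=(obs_dim,))
    diff = Subtract()([transition([encoder(s_t), a_t]), encoder(s_tp1)])
    diff_Tx_x_ = Model(inputs=[s_t, a_t, s_tp1], outputs=diff)
    diff_Tx_x_.compile(optimizer=RMSprop(lr=1e-4), loss='mse')

    one_hot = np.eye(n_actions)[np.random.randint(n_actions, size=batch)]
    loss = diff_Tx_x_.train_on_batch(
        [np.random.rand(batch, obs_dim), one_hot, np.random.rand(batch, obs_dim)],
        np.zeros((batch, abstract_dim)))

Fitting the difference toward zero means gradients flow into both the encoder and the transition network at once, which is what lets the abstract state space and its dynamics be learned together rather than in alternation.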
From d2cad8e7293854bd91445eb9158bae3818bed105 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 7 Dec 2017 19:38:06 -0500 Subject: [PATCH 17/96] fix for stability --- deer/q_networks/NN_keras_lp.py | 2 +- deer/q_networks/q_net_keras_lp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index f1032cfc..dd247172 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -115,7 +115,7 @@ def diff_Tx_x_(self,encoder_model,transition_model): Tx= transition_model([enc_x,inputs[1]]) x = Subtract()([Tx,enc_x_]) - x = Dot(axes=-1, normalize=False)([x,x]) +# x = Dot(axes=-1, normalize=False)([x,x]) model = Model(inputs=inputs, outputs=x ) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index fe45b030..d3b69d30 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -149,7 +149,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print ETs,Es_ # Fit transition - self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros(32)) + self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros((32,3))) self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*2) From ac84a3b67d9ac641526e3c50b6d77eed7f9243b9 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 8 Dec 2017 20:13:25 -0500 Subject: [PATCH 18/96] working version --- deer/q_networks/NN_keras_lp.py | 23 +++++++ deer/q_networks/q_net_keras_lp.py | 106 ++++++++++++++++++++---------- examples/PLE/PLE_env.py | 2 +- examples/PLE/run_PLE.py | 2 +- 4 files changed, 96 insertions(+), 37 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index dd247172..7c798c9a 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -68,6 +68,29 @@ def encoder_model(self): return model + def encoder_diff_model(self,encoder_model): + """ + + Parameters + ----------- + s + + Returns + ------- + model with output x (= encoding of s) + + """ + inputs = [ Input( shape=(2,48,48,) ), Input( shape=(2,48,48,) ) ] + # input_distr + + x1 = encoder_model(inputs[0]) + x2 = encoder_model(inputs[1]) + + x = Subtract()([x1,x2]) + model = Model(inputs=inputs, outputs=x) + + return model + def transition_model(self): """ diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index d3b69d30..5d3ea643 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -11,8 +11,15 @@ from ..base_classes import QNetwork from .NN_keras_lp import NN # Default Neural network used -def mean_squared_error_1(y_true, y_pred): - return K.abs(y_pred - y_true) +def mean_squared_error(y_true, y_pred): + return K.mean(K.square(y_pred - y_true), axis=-1) # tend to reduce the square of the diff between y_pred and y_true + +def mean_squared_error_div10(y_true, y_pred): + return K.mean(K.square(y_pred - y_true), axis=-1) # tend to reduce the square of the diff between y_pred and y_true + +def exp_dec_error(y_true, y_pred): + #return - K.sum( K.sqrt( K.clip(y_pred,-1,1) +0.0001) , axis=-1, keepdims=True ) # tend to increase y_pred + return K.exp( - K.sqrt( K.sum(K.square(y_pred), axis=-1, keepdims=True) + 0.0001 ) ) # tend to increase y_pred class MyQNetwork(QNetwork): """ @@ -60,25 +67,34 @@ def __init__(self, environment, rho=0.9, 
rms_epsilon=0.0001, momentum=0, clip_de self.update_counter = 0 self.loss_T=0 self.loss_disentangle_t=0 - self.loss_disentangle_a=0 + #self.loss_disentangle_a=0 self.lossR=0 + self.loss_disambiguate1=0 + self.loss_disambiguate2=0 + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) self.encoder = self.learn_and_plan.encoder_model() + self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) + self.Q = self.learn_and_plan.Q_model() self.R = self.learn_and_plan.R_model() self.transition = self.learn_and_plan.transition_model() self.full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) + + # used to fit rewards self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) - + + # used to fit transitions self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition)#full_transition_model(self.encoder,self.transition) + + # constraint on consecutive t self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) - self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) - - +# self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) + layers=self.full_Q.layers # Grab all the parameters together. self.params = [ param @@ -134,8 +150,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Es_=self.encoder.predict([next_states_val[0]]) Es=self.encoder.predict([states_val[0]]) ETs=self.transition.predict([Es,onehot_actions]) - - + X = np.concatenate((ETs, Es_)) if(self.update_counter%100==0): print states_val[0][0] @@ -151,26 +166,40 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Fit transition self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros((32,3))) + # Fit rewards + self.lossR+=self.full_R.train_on_batch([states_val[0],onehot_actions], rewards_val) + +# # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 +# # reduce the squared value of the abstract features +# #self.loss_disambiguate1+=self.encoder.train_on_batch([states_val[0]],np.zeros((32,3))) +# # increase the squared difference of the abstract features of two states +# self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,3))) + + #print self.loss_disambiguate1 self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*2) +# +# # Loss to have all s' following s,a with a to a distance 1 of s,a) +# tiled_x=np.tile(Es,(self._n_actions,1)) +# tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) +# tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) +# #self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(32*self._n_actions)) - # Loss to have all s' following s,a with a to a distance 1 of s,a) - tiled_x=np.tile(Es,(self._n_actions,1)) - tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) - tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) - self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(32*self._n_actions)) - self.lossR+=self.full_R.train_on_batch([states_val[0],onehot_actions], rewards_val) if(self.update_counter%100==0): print "losses" - print 
self.loss_T/100.,self.loss_disentangle_t/100.,self.lossR/100.,self.loss_disentangle_a/100. + print "self.loss_T/100.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100." + print self.loss_T/100.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100. self.loss_T=0 - self.loss1=0 - self.loss2=0 - self.loss_disentangle_t=0 - self.loss_disentangle_a=0 self.lossR=0 + self.loss_disentangle_t=0 + #self.loss_disentangle_a=0 + + self.loss_disambiguate1=0 + self.loss_disambiguate2=0 + + if self.update_counter % self._freeze_interval == 0: self._resetQHat() @@ -289,24 +318,26 @@ def _compile(self): raise Exception('The update_rule '+self._update_rule+' is not implemented.') self.full_Q.compile(optimizer=optimizer, loss='mse') - self.full_R.compile(optimizer=optimizer, loss='mse') - optimizer=RMSprop(lr=self._lr/20., rho=0.9, epsilon=1e-06) - optimizer2=RMSprop(lr=self._lr/10., rho=0.9, epsilon=1e-06)#.Adam(lr=0.0002, beta_1=0.5, beta_2=0.999, epsilon=1e-08) + optimizer=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + + self.diff_Tx_x_.compile(optimizer=optimizer, loss='mse') # Fit transitions + self.full_R.compile(optimizer=optimizer, loss='mse') # Fit rewards - self.diff_Tx_x_.compile(optimizer=optimizer, - loss='mae') - #metrics=['accuracy']) self.encoder.compile(optimizer=optimizer, - loss='mae') + loss=mean_squared_error_div10) + self.encoder_diff.compile(optimizer=optimizer, + loss=exp_dec_error) #metrics=['accuracy']) - self.diff_s_s_.compile(optimizer=optimizer2, - loss=mean_squared_error_1) - #metrics=['accuracy']) - self.diff_Tx.compile(optimizer=optimizer, - loss=mean_squared_error_1) + + self.diff_s_s_.compile(optimizer=optimizer, + loss='mse') #metrics=['accuracy']) +# self.diff_Tx.compile(optimizer=optimizer, +# loss=mean_squared_error) +# #metrics=['accuracy']) + def _resetQHat(self): for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): K.set_value(next_param,K.get_value(param)) @@ -321,8 +352,13 @@ def setLearningRate(self, lr): """ self._lr = lr # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) + K.set_value(self.full_Q.optimizer.lr, self._lr) + + K.set_value(self.full_R.optimizer.lr, self._lr/10.) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) - K.set_value(self.encoder.optimizer.lr, self._lr/20.) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) - K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) + K.set_value(self.encoder.optimizer.lr, self._lr/100.) + K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.) + + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) +# K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
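The custom losses introduced in this patch can be sanity-checked numerically through the Keras backend. The sketch below copies the exp_dec_error expression as defined above (later patches tweak it) and shows that it ignores y_true and shrinks as the norm of y_pred grows, which is why minimising it on encoder_diff spreads the encodings of different states apart; the input values are illustrative only.

import numpy as np
from keras import backend as K

def exp_dec_error(y_true, y_pred):
    # same expression as in the patch: decreases as ||y_pred|| increases
    return K.exp(-K.sqrt(K.sum(K.square(y_pred), axis=-1, keepdims=True) + 0.0001))

y_pred = K.constant(np.array([[0.0, 0.0, 0.0],    # two encodings that coincide
                              [1.0, 1.0, 1.0]]))  # two encodings far apart
y_true = K.zeros_like(y_pred)                     # unused by this loss
print(K.eval(exp_dec_error(y_true, y_pred)))      # roughly [[0.99], [0.18]]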
diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index fefba09d..f992eafa 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -127,7 +127,7 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.cool(255*i/n), alpha=0.5) # Plot the fitted one-step trajectory from time t=10 - i=18 + i=16 one_hot_a=np.zeros((1,3)) one_hot_a[0,actions[i:i+1]]=1 predicted=learning_algo.transition.predict([abs_states[i:i+1],one_hot_a]) diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index a3d2aba0..3c773ce0 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -38,7 +38,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.002 + LEARNING_RATE = 0.005 LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 From 0f310afb4ea5726fd20ed40f2658d24c0fee3987 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 13 Dec 2017 10:49:35 -0500 Subject: [PATCH 19/96] some modifs --- deer/q_networks/NN_keras_lp.py | 26 ++++++++++++++++++++++++++ deer/q_networks/q_net_keras_lp.py | 29 +++++++++++++++++++++-------- examples/PLE/run_PLE.py | 2 +- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 7c798c9a..ab84b557 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -116,6 +116,32 @@ def transition_model(self): return model + def transition_model2(self): + """ + + Parameters + ----------- + x + a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x + + x = Concatenate()(inputs)#,axis=-1) + x = Dense(20, activation='relu')(x) + x = Dense(50, activation='relu')(x) + x = Dense(20, activation='relu')(x) + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Add()([inputs[0],x]) + + model = Model(inputs=inputs, outputs=x) + + return model + def diff_Tx_x_(self,encoder_model,transition_model): """ diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 5d3ea643..76842f88 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -18,9 +18,15 @@ def mean_squared_error_div10(y_true, y_pred): return K.mean(K.square(y_pred - y_true), axis=-1) # tend to reduce the square of the diff between y_pred and y_true def exp_dec_error(y_true, y_pred): - #return - K.sum( K.sqrt( K.clip(y_pred,-1,1) +0.0001) , axis=-1, keepdims=True ) # tend to increase y_pred + #return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred return K.exp( - K.sqrt( K.sum(K.square(y_pred), axis=-1, keepdims=True) + 0.0001 ) ) # tend to increase y_pred +def rms_from_squared_components(y_true, y_pred): + return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 + +def squared_error_from_squared_components(y_true, y_pred): + return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 + class MyQNetwork(QNetwork): """ Deep Q-learning network using Keras (with any backend) @@ -66,6 +72,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._random_state = random_state self.update_counter = 0 self.loss_T=0 + self.loss_T2=0 self.loss_disentangle_t=0 
#self.loss_disentangle_a=0 self.lossR=0 @@ -81,6 +88,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.Q = self.learn_and_plan.Q_model() self.R = self.learn_and_plan.R_model() self.transition = self.learn_and_plan.transition_model() + self.transition2 = self.learn_and_plan.transition_model2() self.full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) @@ -164,6 +172,8 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print ETs,Es_ # Fit transition + for i in range(10): + self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros((32,3))) # Fit rewards @@ -176,7 +186,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,3))) #print self.loss_disambiguate1 - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)*2) + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)) #np.ones((32,3))*2) # # # Loss to have all s' following s,a with a to a distance 1 of s,a) # tiled_x=np.tile(Es,(self._n_actions,1)) @@ -188,9 +198,10 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%100==0): print "losses" - print "self.loss_T/100.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100." - print self.loss_T/100.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100. + print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100." + print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100. self.loss_T=0 + self.loss_T2=0 self.lossR=0 self.loss_disentangle_t=0 @@ -285,7 +296,7 @@ def qValues_planning(self, state_val, d=2.): #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) tile3_state_val=np.tile(state_val,(3,1,1,1)) - next_x_predicted=self.transition.predict([tile3_encoded_x,identity_matrix]) + next_x_predicted=self.transition2.predict([tile3_encoded_x,identity_matrix]) q_vals_d1=self.Q.predict([next_x_predicted]) #print q_vals_d1 #print (1-1/d)+(1-1/d)**2 @@ -322,6 +333,7 @@ def _compile(self): optimizer=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) self.diff_Tx_x_.compile(optimizer=optimizer, loss='mse') # Fit transitions + self.transition2.compile(optimizer=optimizer, loss='mse') # Fit accurate transitions without encoders self.full_R.compile(optimizer=optimizer, loss='mse') # Fit rewards self.encoder.compile(optimizer=optimizer, @@ -331,7 +343,7 @@ def _compile(self): #metrics=['accuracy']) self.diff_s_s_.compile(optimizer=optimizer, - loss='mse') + loss=squared_error_from_squared_components)#exp_dec_error)#'mse') #metrics=['accuracy']) # self.diff_Tx.compile(optimizer=optimizer, @@ -356,8 +368,9 @@ def setLearningRate(self, lr): K.set_value(self.full_R.optimizer.lr, self._lr/10.) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) - - K.set_value(self.encoder.optimizer.lr, self._lr/100.) + K.set_value(self.transition2.optimizer.lr, self._lr/10.) + + #K.set_value(self.encoder.optimizer.lr, self._lr/100.) K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) 
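As a reference for the qValues_planning() changes above, the blend of the direct Q estimate with the one-step imagined rollout can be reproduced in plain NumPy. The arrays below are stand-ins for the outputs of the Q, R and transition networks with made-up numbers; only the weighting follows the expression used in the code.

import numpy as np

d, discount, n_actions = 2., 0.9, 3               # planning depth d and discount self._df
q_vals_d0 = np.array([0.1, 0.4, 0.2])             # Q(E(s), .) for each action
r_vals_d0 = np.array([0.0, 1.0, 0.0])             # R(E(s), .) for each action
q_vals_d1 = np.random.rand(n_actions, n_actions)  # Q(T(E(s), a), .) after each imagined action a

w = 1. - 1. / d
q_plan = (w + w**2) * q_vals_d0 + w**2 * (r_vals_d0 + discount * q_vals_d1.max(axis=1))
best_action = int(np.argmax(q_plan))              # chooseBestAction() then picks this index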
diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 3c773ce0..a3d2aba0 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -38,7 +38,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.005 + LEARNING_RATE = 0.002 LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 From b9a54b1461d1f3e9191af99203b33196512f2701 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 22 Dec 2017 15:58:45 -0500 Subject: [PATCH 20/96] working --- deer/agent.py | 9 +++ deer/q_networks/NN_keras.py | 2 +- deer/q_networks/NN_keras_lp.py | 17 +++--- deer/q_networks/q_net_keras_lp.py | 94 +++++++++++++++++++++++++------ examples/PLE/PLE_env.py | 29 ++++++---- examples/PLE/run_PLE.py | 2 +- 6 files changed, 116 insertions(+), 37 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index 292846fe..d551a849 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -270,6 +270,15 @@ def run(self, n_epochs, epoch_length): self._training_loss_averages = [] if self._mode != -1: + #loss=0 + #for ii in range(10000): + # states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(self._batch_size, self._exp_priority) + # loss+=self._network.train_model(states, actions, rewards, next_states, terminals) + # if(ii%100==99): + # print "loss T before valid or test" + # print loss/100. + # loss=0 + self._totalModeNbrEpisode=0 while self._mode_epochs_length > 0: self._totalModeNbrEpisode += 1 diff --git a/deer/q_networks/NN_keras.py b/deer/q_networks/NN_keras.py index 5a55573e..e12f1dc4 100644 --- a/deer/q_networks/NN_keras.py +++ b/deer/q_networks/NN_keras.py @@ -107,7 +107,7 @@ def _buildDQN(self): out = Dense(len(self._n_actions))(x) else: out = Dense(1)(x) - + model = Model(input=inputs, output=out) layers=model.layers diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index ab84b557..495f516f 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -6,9 +6,8 @@ import numpy as np from keras import backend as K from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate +from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization from keras import regularizers - np.random.seed(102912) class NN(): @@ -107,8 +106,9 @@ def transition_model(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) - x = Dense(20, activation='relu')(x) - x = Dense(20, activation='relu')(x) + x = Dense(15, activation='tanh')(x) + x = Dense(30, activation='tanh')(x) + x = Dense(15, activation='tanh')(x) x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([inputs[0],x]) @@ -132,9 +132,12 @@ def transition_model2(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) - x = Dense(20, activation='relu')(x) - x = Dense(50, activation='relu')(x) - x = Dense(20, activation='relu')(x) + x = Dense(20, activation='tanh')(x) + x = BatchNormalization()(x) + x = Dense(50, activation='tanh')(x) + x = BatchNormalization()(x) + x = Dense(20, activation='tanh')(x) + x = BatchNormalization()(x) x = 
Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([inputs[0],x]) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 76842f88..c22d5ccd 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -27,6 +27,9 @@ def rms_from_squared_components(y_true, y_pred): def squared_error_from_squared_components(y_true, y_pred): return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 +def loss_diff_s_s_(y_true, y_pred): + return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. ) ) ) # tend to increase y_pred --> loss -1 + class MyQNetwork(QNetwork): """ Deep Q-learning network using Keras (with any backend) @@ -159,7 +162,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Es=self.encoder.predict([states_val[0]]) ETs=self.transition.predict([Es,onehot_actions]) - X = np.concatenate((ETs, Es_)) if(self.update_counter%100==0): print states_val[0][0] print "len(states_val)" @@ -172,8 +174,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print ETs,Es_ # Fit transition - for i in range(10): - self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros((32,3))) # Fit rewards @@ -197,9 +197,14 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%100==0): - print "losses" - print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100." - print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100. +# print "self.transition" +# print [ K.get_value(param) +# for layer in self.transition.layers +# for param in layer.trainable_weights ] +# print self.transition.layers +# print "losses" +# print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100." +# print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100. self.loss_T=0 self.loss_T2=0 self.lossR=0 @@ -256,6 +261,52 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals return np.sqrt(loss),loss_ind +# def train_model(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): +# """ +# Train the model based part +# +# 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared +# 2. 
perform batch training +# +# Parameters +# ----------- +# states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) +# actions_val : b x 1 numpy array of integers +# rewards_val : b x 1 numpy array +# next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) +# terminals_val : b x 1 numpy boolean array +# +# Returns +# ------- +# Average loss of the batch training (RMSE) +# Individual (square) losses for each tuple +# """ +# +# onehot_actions = np.zeros((self._batch_size, self._n_actions)) +# onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 +# Es_=self.encoder.predict([next_states_val[0]]) +# Es=self.encoder.predict([states_val[0]]) +# ETs=self.transition.predict([Es,onehot_actions]) +# +## if(self.update_counter>3000): +# self.loss_T2=self.transition2.train_on_batch([Es,onehot_actions], Es_) +## if(self.update_counter%100==0): +## loss=0. +## for i in range (100): +## loss+=self.transition2.train_on_batch([Es,onehot_actions], Es_) +## if(i%10==0): +## print "loss/(i+1)" +## print loss/(i+1) +## print "loss/100." +## print loss/100. +# #print K.get_value(self.transition2.optimizer.lr) +# #print [ K.get_value(param) +# # for layer in self.encoder.layers +# # for param in layer.trainable_weights ][0][0] +# return self.loss_T2 + + + def qValues(self, state_val): """ Get the q values for one belief state (without planning) @@ -291,7 +342,7 @@ def qValues_planning(self, state_val, d=2.): #tile3_encoded_x=np.array([enc for enc in encoded_x for i in range(self._n_actions)]) tile3_encoded_x=np.tile(encoded_x,(3,1)) #print tile3_encoded_x - r_vals_d0=self.R.predict([tile3_encoded_x,identity_matrix]) + r_vals_d0=np.array(self.R.predict([tile3_encoded_x,identity_matrix])).reshape((self._n_actions)) #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) tile3_state_val=np.tile(state_val,(3,1,1,1)) @@ -299,7 +350,11 @@ def qValues_planning(self, state_val, d=2.): next_x_predicted=self.transition2.predict([tile3_encoded_x,identity_matrix]) q_vals_d1=self.Q.predict([next_x_predicted]) #print q_vals_d1 + #print [np.max(vals) for vals in q_vals_d1] + #print r_vals_d0 #print (1-1/d)+(1-1/d)**2 + #print "r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1])" + #print r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1]) #print ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*np.array([np.max(vals) for vals in q_vals_d1]) return ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*(r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1])) @@ -315,7 +370,6 @@ def chooseBestAction(self, state): The best action : int """ q_vals = self.qValues_planning(state)#self.qValues(state)# - return np.argmax(q_vals),np.max(q_vals) def _compile(self): @@ -330,20 +384,25 @@ def _compile(self): self.full_Q.compile(optimizer=optimizer, loss='mse') - optimizer=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer1=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # Different optimizers for each network; otherwise not possible to modify each + optimizer2=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately + optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - self.diff_Tx_x_.compile(optimizer=optimizer, loss='mse') # Fit transitions - 
self.transition2.compile(optimizer=optimizer, loss='mse') # Fit accurate transitions without encoders - self.full_R.compile(optimizer=optimizer, loss='mse') # Fit rewards + self.diff_Tx_x_.compile(optimizer=optimizer1, loss='mse') # Fit transitions + self.transition2.compile(optimizer=optimizer2, loss='mse') # Fit accurate transitions without encoders + self.full_R.compile(optimizer=optimizer3, loss='mse') # Fit rewards - self.encoder.compile(optimizer=optimizer, + self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_div10) - self.encoder_diff.compile(optimizer=optimizer, + self.encoder_diff.compile(optimizer=optimizer5, loss=exp_dec_error) #metrics=['accuracy']) - self.diff_s_s_.compile(optimizer=optimizer, - loss=squared_error_from_squared_components)#exp_dec_error)#'mse') + self.diff_s_s_.compile(optimizer=optimizer6, + loss=loss_diff_s_s_)#exp_dec_error)#'mse') #metrics=['accuracy']) # self.diff_Tx.compile(optimizer=optimizer, @@ -363,13 +422,14 @@ def setLearningRate(self, lr): The learning rate that has to bet set """ self._lr = lr + print "modif lr" # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) K.set_value(self.full_Q.optimizer.lr, self._lr) K.set_value(self.full_R.optimizer.lr, self._lr/10.) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) - K.set_value(self.transition2.optimizer.lr, self._lr/10.) + K.set_value(self.transition2.optimizer.lr, self._lr/10.) #K.set_value(self.encoder.optimizer.lr, self._lr/100.) K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index f992eafa..93a8bbc4 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -127,17 +127,24 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.cool(255*i/n), alpha=0.5) # Plot the fitted one-step trajectory from time t=10 - i=16 - one_hot_a=np.zeros((1,3)) - one_hot_a[0,actions[i:i+1]]=1 - predicted=learning_algo.transition.predict([abs_states[i:i+1],one_hot_a]) - predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) - predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) - predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:]]), color="1") - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:]]), color="0.5") - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:]]), color="0") - #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") + for i in range(19): + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) + predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:]]), color="1", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:]]), 
color="0.5", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:]]), color="0", alpha=0.5) + +# for xx in [-2,-1.,0, 1., 2.]: +# for yy in [-2,-1.,0, 1., 2.]: +# for zz in [-2,-1.,0, 1., 2.]: +# predicted1=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[1,0,0]])]) +# predicted2=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,1,0]])]) +# predicted3=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,0,1]])]) +# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted3[0,:1]]), np.concatenate([np.array([yy]),predicted3[0,1:2]]), np.concatenate([np.array([zz]),predicted3[0,2:]]), color="0", alpha=0.5) + #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") # Plot the colorbar for the trajectory diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index a3d2aba0..7c4fee9e 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -25,7 +25,7 @@ class Defaults: # Experiment Parameters # ---------------------- STEPS_PER_EPOCH = 1000 - EPOCHS = 100 + EPOCHS = 500 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 From 0773ce46f2feecaf287425854565fc3c30459483 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 12 Jan 2018 13:02:43 -0500 Subject: [PATCH 21/96] fixing NN for all types of inputs and switching to an entropy + L2 reg for abstract state space --- deer/q_networks/NN_keras_lp.py | 235 +++++++++++++++++++++++++----- deer/q_networks/q_net_keras_lp.py | 57 +++++--- examples/PLE/run_PLE.py | 2 +- 3 files changed, 237 insertions(+), 57 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 495f516f..1e1c071e 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -45,22 +45,75 @@ def encoder_model(self): model with output x (= encoding of s) """ - inputs = [ Input( shape=(2,48,48,) ) ] - # input_distr - - x = inputs[0] - x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (4, 4), padding='same', activation='relu')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - - x = Flatten()(x) + layers=[] + outs_conv=[] + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + # - observation[i] is a FRAME + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + x = Conv2D(16, (4, 4), padding='same', activation='relu')(input) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(8, (4, 4), padding='same', activation='relu')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + + out = Flatten()(x) + + # - observation[i] is 
a VECTOR + elif len(dim) == 2: + if dim[0] > 3: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) + x = Conv2D(16, (2, 1), activation='relu', border_mode='valid')(reshaped)#Conv on the history + x = Conv2D(16, (2, 2), activation='relu', border_mode='valid')(x) #Conv on the history & features + + out = Flatten()(x) + else: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + out = Flatten()(input) + + # - observation[i] is a SCALAR - + else: + if dim[0] > 3: + # this returns a tensor + input = Input(shape=(dim[0],)) + inputs.append(input) + reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) + x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(reshaped) #Conv on the history + x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(x) #Conv on the history + + out = Flatten()(x) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + out=input + + outs_conv.append(out) + + if (self._action_as_input==True): + if ( isinstance(self._n_actions,int)): + print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") + else: + input = Input(shape=(len(self._n_actions),)) + inputs.append(input) + outs_conv.append(input) + + if len(outs_conv)>1: + x = merge(outs_conv, mode='concat') + else: + x= outs_conv [0] + # we stack a deep fully-connected network on top x = Dense(20, activation='relu')(x) x = Dense(10, activation='relu')(x) - + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' model = Model(inputs=inputs, outputs=x) @@ -79,11 +132,25 @@ def encoder_diff_model(self,encoder_model): model with output x (= encoding of s) """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(2,48,48,) ) ] - # input_distr + inputs=[] - x1 = encoder_model(inputs[0]) - x2 = encoder_model(inputs[1]) + for j in range(2): + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + half = len(inputs)/2 + x1 = encoder_model(inputs[:half]) + x2 = encoder_model(inputs[half:]) x = Subtract()([x1,x2]) model = Model(inputs=inputs, outputs=x) @@ -159,12 +226,29 @@ def diff_Tx_x_(self,encoder_model,transition_model): model with output Tx (= model estimate of x') """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ) , Input( shape=(2,48,48,) ) ] #s,s' - - enc_x = encoder_model(inputs[0]) #s --> x - enc_x_ = encoder_model(inputs[2]) #s --> x - - Tx= transition_model([enc_x,inputs[1]]) + inputs=[] + for j in range(2): + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + half = len(inputs)/2 + enc_x = encoder_model(inputs[:half]) #s --> x + enc_x_ = encoder_model(inputs[half:]) #s --> x + + input = Input(shape=(self._n_actions,)) + inputs.append(input) + + Tx= transition_model([enc_x,inputs[-1]]) x = Subtract()([Tx,enc_x_]) # x = Dot(axes=-1, normalize=False)([x,x]) @@ -187,10 +271,25 @@ def diff_s_s_(self,encoder_model): model with output Tx (= model estimate of x') """ - inputs = [ Input( shape=(2,48,48,) ), Input( 
shape=(2,48,48,) ) ] #s,s' + inputs=[] - enc_x = encoder_model(inputs[0]) #s --> x - enc_x_ = encoder_model(inputs[1]) #s --> x + for j in range(2): + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + half = len(inputs)/2 + enc_x = encoder_model(inputs[:half]) #s --> x #FIXME + enc_x_ = encoder_model(inputs[half:]) #s --> x x = Subtract()([enc_x,enc_x_]) x = Dot(axes=-1, normalize=False)([x,x]) @@ -199,6 +298,51 @@ def diff_s_s_(self,encoder_model): return model +# def diff_sa_sa(self,encoder_model,transition_model): +# """ +# +# Parameters +# ----------- +# s +# a +# rand_a +# +# Returns +# ------- +# model with output Tx (= model estimate of x') +# +# """ +# inputs=[] +# +# for i, dim in enumerate(self._input_dimensions): +# if len(dim) == 3: +# input = Input(shape=(dim[0],dim[1],dim[2])) +# inputs.append(input) +# +# elif len(dim) == 2: +# input = Input(shape=(dim[0],dim[1])) +# inputs.append(input) +# +# else: +# input = Input(shape=(dim[0],)) +# inputs.append(input) +# +# input = Input(shape=(self._n_actions,)) +# inputs.append(input) +# input = Input(shape=(self._n_actions,)) +# inputs.append(input) +# +# enc_x = encoder_model(inputs[:-2]) #s --> x +# Tx= transition_model([enc_x,inputs[-2]]) +# rand_Tx= transition_model([enc_x,inputs[-1]]) +# +# x = Subtract()([Tx,rand_Tx]) +# x = Dot(axes=-1, normalize=False)([x,x]) +# +# model = Model(inputs=inputs, outputs=x ) +# +# return model + def diff_Tx(self,transition_model): """ @@ -271,11 +415,27 @@ def full_R_model(self,encoder_model,R_model): r """ - inputs = [ Input( shape=(2,48,48,) ), Input( shape=(self._n_actions,) ) ] #s,a + inputs=[] - enc_x = encoder_model(inputs[0]) #s --> x + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + input = Input(shape=(self._n_actions,)) + inputs.append(input) + + enc_x = encoder_model(inputs[:-1]) #s --> x - out = R_model([enc_x]+inputs[1:]) + out = R_model([enc_x]+inputs[-1:]) model = Model(inputs=inputs, outputs=out) @@ -324,14 +484,21 @@ def full_Q_model(self, encoder_model, Q_model): ------- model with output Tx (= model estimate of x') """ - layers=[] - outs_conv=[] inputs=[] - #if len(dim) == 3: for i, dim in enumerate(self._input_dimensions): - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + input = Input(shape=(self.internal_dim,)) inputs.append(input) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index c22d5ccd..fb88fb93 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -14,12 +14,9 @@ def mean_squared_error(y_true, y_pred): return K.mean(K.square(y_pred - y_true), axis=-1) # tend to reduce the square of the diff between y_pred and y_true -def mean_squared_error_div10(y_true, y_pred): - return K.mean(K.square(y_pred - y_true), axis=-1) # tend to reduce the square of the diff 
between y_pred and y_true - def exp_dec_error(y_true, y_pred): #return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred - return K.exp( - K.sqrt( K.sum(K.square(y_pred), axis=-1, keepdims=True) + 0.0001 ) ) # tend to increase y_pred + return K.exp( - 2*K.sqrt( K.sum(K.square(y_pred), axis=-1, keepdims=True) + 0.0001 ) ) # tend to increase y_pred def rms_from_squared_components(y_true, y_pred): return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 @@ -77,7 +74,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.loss_T=0 self.loss_T2=0 self.loss_disentangle_t=0 - #self.loss_disentangle_a=0 + self.loss_disentangle_a=0 self.lossR=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 @@ -100,11 +97,14 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de # used to fit transitions self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition)#full_transition_model(self.encoder,self.transition) - + + # constraint on consecutive t self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) # self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) - + + # used to disentangle actions + self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) layers=self.full_Q.layers # Grab all the parameters together. @@ -158,6 +158,8 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions = np.zeros((self._batch_size, self._n_actions)) onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 + #onehot_actions_rand = np.zeros((self._batch_size, self._n_actions)) + #onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 Es_=self.encoder.predict([next_states_val[0]]) Es=self.encoder.predict([states_val[0]]) ETs=self.transition.predict([Es,onehot_actions]) @@ -174,19 +176,23 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print ETs,Es_ # Fit transition - self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],onehot_actions,next_states_val[0]], np.zeros((32,3))) + self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],next_states_val[0],onehot_actions], np.zeros((32,3))) # Fit rewards self.lossR+=self.full_R.train_on_batch([states_val[0],onehot_actions], rewards_val) -# # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 -# # reduce the squared value of the abstract features -# #self.loss_disambiguate1+=self.encoder.train_on_batch([states_val[0]],np.zeros((32,3))) -# # increase the squared difference of the abstract features of two states -# self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,3))) + # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 + # reduce the squared value of the abstract features + self.loss_disambiguate1+=self.encoder.train_on_batch([states_val[0]],np.zeros((32,3))) +# # Increase the entropy in the abstract features of two states + self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,3))) #print self.loss_disambiguate1 - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)) #np.ones((32,3))*2) + #self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)) 
#np.ones((32,3))*2) + + # Disentangle actions + #self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch([states_val[0],onehot_actions,onehot_actions_rand], np.ones(32)) + # # # Loss to have all s' following s,a with a to a distance 1 of s,a) # tiled_x=np.tile(Es,(self._n_actions,1)) @@ -203,19 +209,20 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # for param in layer.trainable_weights ] # print self.transition.layers # print "losses" -# print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100." -# print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate2/100. + print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." + print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. self.loss_T=0 self.loss_T2=0 self.lossR=0 self.loss_disentangle_t=0 - #self.loss_disentangle_a=0 + self.loss_disentangle_a=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 + if self.update_counter % self._freeze_interval == 0: self._resetQHat() @@ -340,12 +347,12 @@ def qValues_planning(self, state_val, d=2.): #print "q_vals_d0" #print q_vals_d0 #tile3_encoded_x=np.array([enc for enc in encoded_x for i in range(self._n_actions)]) - tile3_encoded_x=np.tile(encoded_x,(3,1)) + tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) #print tile3_encoded_x r_vals_d0=np.array(self.R.predict([tile3_encoded_x,identity_matrix])).reshape((self._n_actions)) #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) - tile3_state_val=np.tile(state_val,(3,1,1,1)) + #tile3_state_val=np.tile(state_val,(3,1,1,1)) next_x_predicted=self.transition2.predict([tile3_encoded_x,identity_matrix]) q_vals_d1=self.Q.predict([next_x_predicted]) @@ -385,18 +392,19 @@ def _compile(self): self.full_Q.compile(optimizer=optimizer, loss='mse') optimizer1=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # Different optimizers for each network; otherwise not possible to modify each - optimizer2=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately + optimizer2=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately (e.g. lr) optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer7=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) self.diff_Tx_x_.compile(optimizer=optimizer1, loss='mse') # Fit transitions self.transition2.compile(optimizer=optimizer2, loss='mse') # Fit accurate transitions without encoders self.full_R.compile(optimizer=optimizer3, loss='mse') # Fit rewards self.encoder.compile(optimizer=optimizer4, - loss=mean_squared_error_div10) + loss=mean_squared_error) self.encoder_diff.compile(optimizer=optimizer5, loss=exp_dec_error) #metrics=['accuracy']) @@ -405,6 +413,9 @@ def _compile(self): loss=loss_diff_s_s_)#exp_dec_error)#'mse') #metrics=['accuracy']) + self.diff_sa_sa.compile(optimizer=optimizer7, + loss=loss_diff_s_s_) + # self.diff_Tx.compile(optimizer=optimizer, # loss=mean_squared_error) # #metrics=['accuracy']) @@ -430,8 +441,10 @@ def setLearningRate(self, lr): K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) K.set_value(self.transition2.optimizer.lr, self._lr/10.) 
- #K.set_value(self.encoder.optimizer.lr, self._lr/100.) + + K.set_value(self.encoder.optimizer.lr, self._lr/100.) K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) + K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 7c4fee9e..19d28bcf 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -39,7 +39,7 @@ class Defaults: # ---------------------- UPDATE_RULE = 'rmsprop' LEARNING_RATE = 0.002 - LEARNING_RATE_DECAY = 0.98 + LEARNING_RATE_DECAY = 0.99 DISCOUNT = 0.9 DISCOUNT_INC = 1 DISCOUNT_MAX = 0.99 From 1a87f0ddfccd7a44a444934df153c89cc856f115 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 12 Jan 2018 13:40:44 -0500 Subject: [PATCH 22/96] fix --- deer/q_networks/q_net_keras_lp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index fb88fb93..456abf6e 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -104,7 +104,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de # self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) # used to disentangle actions - self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) + #self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) layers=self.full_Q.layers # Grab all the parameters together. @@ -413,8 +413,8 @@ def _compile(self): loss=loss_diff_s_s_)#exp_dec_error)#'mse') #metrics=['accuracy']) - self.diff_sa_sa.compile(optimizer=optimizer7, - loss=loss_diff_s_s_) +# self.diff_sa_sa.compile(optimizer=optimizer7, +# loss=loss_diff_s_s_) # self.diff_Tx.compile(optimizer=optimizer, # loss=mean_squared_error) @@ -446,5 +446,5 @@ def setLearningRate(self, lr): K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) - K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) +# K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
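The separate RMSprop instances introduced in _compile() above exist so that K.set_value can later rescale each model's learning rate independently; a single shared optimizer instance would change all of them at once. A minimal sketch of that mechanism with two toy models (the names, sizes and the 1/10 ratio are illustrative, not the repository's exact settings):

import numpy as np
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import RMSprop

inp = Input(shape=(4,))
model_q = Model(inp, Dense(1)(inp))          # stand-in for the Q network
model_t = Model(inp, Dense(1)(inp))          # stand-in for one of the auxiliary models

# one optimizer object per compiled model, as in _compile()
model_q.compile(optimizer=RMSprop(lr=0.002, rho=0.9, epsilon=1e-06), loss='mse')
model_t.compile(optimizer=RMSprop(lr=0.002, rho=0.9, epsilon=1e-06), loss='mse')

lr = 0.001
K.set_value(model_q.optimizer.lr, lr)        # full rate for the Q fit
K.set_value(model_t.optimizer.lr, lr / 10.)  # slower rate for the auxiliary loss
print([K.get_value(model_q.optimizer.lr), K.get_value(model_t.optimizer.lr)])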
From 6fcf97d82244ff0a12b19d59b2013bde0410f9d9 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 13 Feb 2018 10:19:33 -0500 Subject: [PATCH 23/96] working example simplest_test_PLI --- deer/q_networks/NN_keras_lp.py | 16 ++++---- deer/q_networks/q_net_keras_lp.py | 68 +++++++++++++++++++++++-------- examples/PLE/PLE_env.py | 18 ++++---- 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 1e1c071e..ad024777 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -54,11 +54,11 @@ def encoder_model(self): if len(dim) == 3: input = Input(shape=(dim[0],dim[1],dim[2])) inputs.append(input) - x = Conv2D(16, (4, 4), padding='same', activation='relu')(input) + x = Conv2D(8, (4, 4), padding='same', activation='tanh')(input) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(16, (4, 4), padding='same', activation='relu')(x) + x = Conv2D(8, (4, 4), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (4, 4), padding='same', activation='relu')(x) + x = Conv2D(8, (4, 4), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) out = Flatten()(x) @@ -173,9 +173,9 @@ def transition_model(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) - x = Dense(15, activation='tanh')(x) - x = Dense(30, activation='tanh')(x) - x = Dense(15, activation='tanh')(x) + x = Dense(5, activation='tanh')(x) #5,15 + #x = Dense(10, activation='tanh')(x) # ,30 + x = Dense(5, activation='tanh')(x) #5,15 x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([inputs[0],x]) @@ -199,11 +199,11 @@ def transition_model2(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) - x = Dense(20, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) x = BatchNormalization()(x) x = Dense(50, activation='tanh')(x) x = BatchNormalization()(x) - x = Dense(20, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) x = BatchNormalization()(x) x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([inputs[0],x]) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 456abf6e..f914113d 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -12,11 +12,10 @@ from .NN_keras_lp import NN # Default Neural network used def mean_squared_error(y_true, y_pred): - return K.mean(K.square(y_pred - y_true), axis=-1) # tend to reduce the square of the diff between y_pred and y_true + return K.mean(K.square(y_pred - y_true), axis=-1) # = mse error def exp_dec_error(y_true, y_pred): - #return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred - return K.exp( - 2*K.sqrt( K.sum(K.square(y_pred), axis=-1, keepdims=True) + 0.0001 ) ) # tend to increase y_pred + return K.exp( - 2.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred def rms_from_squared_components(y_true, y_pred): return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 @@ -76,6 +75,7 @@ def __init__(self, environment, rho=0.9, 
rms_epsilon=0.0001, momentum=0, clip_de self.loss_disentangle_t=0 self.loss_disentangle_a=0 self.lossR=0 + self.loss_Q=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 @@ -176,17 +176,39 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print ETs,Es_ # Fit transition - self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],next_states_val[0],onehot_actions], np.zeros((32,3))) + for i in range(10): + self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) + self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],next_states_val[0],onehot_actions], np.zeros((32,self.learn_and_plan.internal_dim))) # Fit rewards self.lossR+=self.full_R.train_on_batch([states_val[0],onehot_actions], rewards_val) # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 # reduce the squared value of the abstract features - self.loss_disambiguate1+=self.encoder.train_on_batch([states_val[0]],np.zeros((32,3))) + self.loss_disambiguate1+=self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) # # Increase the entropy in the abstract features of two states - self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,3))) - + rolled=np.roll(states_val[0],1,axis=0) + #print "states_val[0]" + #print states_val[0] + #print "rolled" + #print rolled + for i in range(32): + j=0 + l=0 + while((states_val[0][i]==rolled[i+j-l]).all()): + if(i+j==31): + l=32 + if(j==31): + break + j=j+1 + rolled[i]=rolled[i+j-l] + #print "rolled" + #print rolled + self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((32,self.learn_and_plan.internal_dim))) + #print self.loss_disambiguate2 + #print "states_val[0]" + #print states_val[0] + #print rolled #print self.loss_disambiguate1 #self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)) #np.ones((32,3))*2) @@ -203,24 +225,33 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%100==0): -# print "self.transition" -# print [ K.get_value(param) -# for layer in self.transition.layers -# for param in layer.trainable_weights ] -# print self.transition.layers -# print "losses" - print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." - print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. + print "self.loss_Q" + print self.loss_Q + print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." + print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. 
+ print K.get_value(self.encoder.optimizer.lr) + print K.get_value(self.encoder_diff.optimizer.lr) self.loss_T=0 self.loss_T2=0 self.lossR=0 + self.loss_Q=0 self.loss_disentangle_t=0 self.loss_disentangle_a=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 + + print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" + print self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) + + print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((32,self.learn_and_plan.internal_dim))) + print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" + print self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) if self.update_counter % self._freeze_interval == 0: @@ -258,7 +289,8 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) #print "self.q_vals.optimizer.lr" #print K.eval(self.q_vals.optimizer.lr) - + self.loss_Q+=loss + if(self.update_counter%100==0): print self.update_counter @@ -440,9 +472,9 @@ def setLearningRate(self, lr): K.set_value(self.full_R.optimizer.lr, self._lr/10.) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.) - K.set_value(self.transition2.optimizer.lr, self._lr/10.) + K.set_value(self.transition2.optimizer.lr, self._lr/20.) - K.set_value(self.encoder.optimizer.lr, self._lr/100.) + K.set_value(self.encoder.optimizer.lr, self._lr/50.) K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) 
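The rolled copy of the batch built in train() above pairs every observation with its neighbour in the shuffled batch, so encoder_diff is fitted on pairs of almost always distinct states; the extra while loop only walks past the rare identical pairs. A small NumPy sketch of the basic pairing, with toy shapes and no Keras involved:

import numpy as np

batch = np.random.rand(32, 2, 48, 48)   # toy batch of two stacked 48x48 frames
rolled = np.roll(batch, 1, axis=0)      # pair observation i with observation i-1

# fraction of pairs whose two observations actually differ
distinct = ~np.all(batch.reshape(32, -1) == rolled.reshape(32, -1), axis=1)
print(distinct.mean())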
diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 93a8bbc4..b3c20156 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -128,12 +128,12 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the fitted one-step trajectory from time t=10 for i in range(19): - predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) - predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) - predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:]]), color="1", alpha=0.5) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:]]), color="0.5", alpha=0.5) - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:]]), color="0", alpha=0.5) + predicted1=learning_algo.transition2.predict([abs_states[i:i+1],np.array([[1,0,0]])]) + predicted2=learning_algo.transition2.predict([abs_states[i:i+1],np.array([[0,1,0]])]) + predicted3=learning_algo.transition2.predict([abs_states[i:i+1],np.array([[0,0,1]])]) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="1", alpha=0.5) #white + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.5", alpha=0.5) #grey + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0", alpha=0.5) #black # for xx in [-2,-1.,0, 1., 2.]: # for yy in [-2,-1.,0, 1., 2.]: @@ -171,12 +171,12 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the legend for the dots from matplotlib.patches import Circle from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" Actions (right, left and none) : ", textprops=dict(color="k")) + box1 = TextArea(" Actions (none, left and right) : ", textprops=dict(color="k")) box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="k", edgecolor="k") - el2 = Circle((30, 10), 5, fc="grey", edgecolor="k") el3 = Circle((50, 10), 5, fc="w", edgecolor="k") + el2 = Circle((30, 10), 5, fc="grey", edgecolor="k") + el1 = Circle((10, 10), 5, fc="k", edgecolor="k") box2.add_artist(el1) box2.add_artist(el2) box2.add_artist(el3) From 6de62a0bbd26877ab8337ebd062dd71bd70af6e2 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 14 Feb 2018 11:06:18 -0500 Subject: [PATCH 24/96] some partial cleaning --- deer/q_networks/NN_keras_lp.py | 2 +- deer/q_networks/q_net_keras_lp.py | 63 ++--- examples/simplest_test_PLI/run_test.py | 191 +++++++++++++++ examples/simplest_test_PLI/test_env.py | 311 +++++++++++++++++++++++++ 4 files changed, 540 insertions(+), 27 deletions(-) create mode 100644 examples/simplest_test_PLI/run_test.py create mode 100644 examples/simplest_test_PLI/test_env.py diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index ad024777..6a7b5f3e 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -6,7 +6,7 @@ import numpy as np from keras import backend as K from keras.models import Model -from 
keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization +from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge from keras import regularizers np.random.seed(102912) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index f914113d..0a49f4f5 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -160,8 +160,10 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 #onehot_actions_rand = np.zeros((self._batch_size, self._n_actions)) #onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 - Es_=self.encoder.predict([next_states_val[0]]) - Es=self.encoder.predict([states_val[0]]) + states_val=list(states_val) + next_states_val=list(next_states_val) + Es_=self.encoder.predict(next_states_val) + Es=self.encoder.predict(states_val) ETs=self.transition.predict([Es,onehot_actions]) if(self.update_counter%100==0): @@ -176,59 +178,67 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print ETs,Es_ # Fit transition - for i in range(10): - self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) - self.loss_T+=self.diff_Tx_x_.train_on_batch([states_val[0],next_states_val[0],onehot_actions], np.zeros((32,self.learn_and_plan.internal_dim))) +# for i in range(10): +# l=self.transition2.train_on_batch([Es,onehot_actions], Es_) +# print l +# self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) - # Fit rewards - self.lossR+=self.full_R.train_on_batch([states_val[0],onehot_actions], rewards_val) + l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions], np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + self.loss_T+=l + + + # Fit rewards + self.lossR+=self.full_R.train_on_batch(states_val+[onehot_actions], rewards_val) + # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 # reduce the squared value of the abstract features - self.loss_disambiguate1+=self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) -# # Increase the entropy in the abstract features of two states + self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + + # Increase the entropy in the abstract features of two states + # This is done only when states_val is made up of only one observation --> FIXME rolled=np.roll(states_val[0],1,axis=0) #print "states_val[0]" #print states_val[0] #print "rolled" #print rolled - for i in range(32): + for i in range(self._batch_size): j=0 l=0 while((states_val[0][i]==rolled[i+j-l]).all()): if(i+j==31): - l=32 + l=self._batch_size if(j==31): break j=j+1 rolled[i]=rolled[i+j-l] #print "rolled" #print rolled - self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((32,self.learn_and_plan.internal_dim))) + self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) #print self.loss_disambiguate2 #print "states_val[0]" #print states_val[0] #print rolled #print self.loss_disambiguate1 - 
#self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(32)) #np.ones((32,3))*2) + #self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) # Disentangle actions - #self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch([states_val[0],onehot_actions,onehot_actions_rand], np.ones(32)) + #self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch([states_val[0],onehot_actions,onehot_actions_rand], np.ones(self._batch_size)) # # # Loss to have all s' following s,a with a to a distance 1 of s,a) # tiled_x=np.tile(Es,(self._n_actions,1)) # tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) # tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) -# #self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(32*self._n_actions)) +# #self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(self._batch_size*self._n_actions)) if(self.update_counter%100==0): print "self.loss_Q" print self.loss_Q - print "self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." - print self.loss_T/100.,self.loss_T2/1000.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. + print "self.loss_T/100.,self.loss_T2/100.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." + print self.loss_T/100.,self.loss_T2/100.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. print K.get_value(self.encoder.optimizer.lr) print K.get_value(self.encoder_diff.optimizer.lr) self.loss_T=0 @@ -270,7 +280,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) - q_vals=self.full_Q.predict([states_val[0],np.zeros((32,self.learn_and_plan.internal_dim))]) + q_vals=self.full_Q.predict([states_val[0],np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) @@ -284,8 +294,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # My loss should only take these into account. 
        # Workaround here is that many values are already "exact" in this update
        #if (self.update_counter<10000):
-        noise_to_be_robust=np.random.normal(size=(32,self.learn_and_plan.internal_dim))*0.#25
+        noise_to_be_robust=np.random.normal(size=(self._batch_size,self.learn_and_plan.internal_dim))*0.#25

+        loss=0
         loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals )
         #print "self.q_vals.optimizer.lr"
         #print K.eval(self.q_vals.optimizer.lr)
@@ -357,7 +368,7 @@ def qValues(self, state_val):
         -------
         The q values for the provided belief state
         """
-        return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((32,self.learn_and_plan.internal_dim))])[0]
+        return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0]

     def qValues_planning(self, state_val, d=2.):
         """ Get the q values for one belief state with a planning depth d
@@ -469,14 +480,14 @@ def setLearningRate(self, lr):

         # Changing the learning rates (NB:recompiling seems to lead to memory leaks!)
         K.set_value(self.full_Q.optimizer.lr, self._lr)
-        K.set_value(self.full_R.optimizer.lr, self._lr/10.)
-        K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr/10.)
+        K.set_value(self.full_R.optimizer.lr, self._lr)
+        K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr)

-        K.set_value(self.transition2.optimizer.lr, self._lr/20.)
+        K.set_value(self.transition2.optimizer.lr, self._lr/2.)

-        K.set_value(self.encoder.optimizer.lr, self._lr/50.)
-        K.set_value(self.encoder_diff.optimizer.lr, self._lr/10.)
+        K.set_value(self.encoder.optimizer.lr, self._lr/5.)
+        K.set_value(self.encoder_diff.optimizer.lr, self._lr)

-        K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.)
+        K.set_value(self.diff_s_s_.optimizer.lr, self._lr)

 #        K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.)
 #        K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.)
diff --git a/examples/simplest_test_PLI/run_test.py b/examples/simplest_test_PLI/run_test.py
new file mode 100644
index 00000000..e8925b2f
--- /dev/null
+++ b/examples/simplest_test_PLI/run_test.py
@@ -0,0 +1,191 @@
+"""Test environment launcher. See the docs for more details about this experiment.
+ +Authors: Vincent Francois-Lavet, David Taralla +""" + +import sys +import logging +import numpy as np +from joblib import hash, dump +import os + +from deer.default_parser import process_args +from deer.agent import NeuralAgent +from deer.q_networks.q_net_keras_lp import MyQNetwork +from test_env import MyEnv as test_env +import deer.experiment.base_controllers as bc + +from deer.policies import EpsilonGreedyPolicy + + +class Defaults: + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 500 + EPOCHS = 500 + STEPS_PER_TEST = 20 + PERIOD_BTW_SUMMARY_PERFS = 1 + + # ---------------------- + # Environment Parameters + # ---------------------- + FRAME_SKIP = 2 + + # ---------------------- + # DQN Agent parameters: + # ---------------------- + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.0002 + LEARNING_RATE_DECAY = 0.99 + DISCOUNT = 0.9 + DISCOUNT_INC = 1 + DISCOUNT_MAX = 0.99 + RMS_DECAY = 0.9 + RMS_EPSILON = 0.0001 + MOMENTUM = 0 + CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = .1 + EPSILON_DECAY = 10000 + UPDATE_FREQUENCY = 1 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + FREEZE_INTERVAL = 1000 + DETERMINISTIC = True + + + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # --- Parse parameters --- + parameters = process_args(sys.argv[1:], Defaults) + if parameters.deterministic: + rng = np.random.RandomState(123456) + else: + rng = np.random.RandomState() + + # --- Instantiate environment --- + env = test_env() + + # --- Instantiate qnetwork --- + qnetwork = MyQNetwork( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_delta, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng) + + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) + + # --- Instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # --- Create unique filename for FindBestController --- + h = hash(vars(parameters), hash_name="sha1") + fname = "test_" + h + print("The parameters hash is: {}".format(h)) + print("The parameters are: {}".format(parameters)) + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 
+    agent.attach(bc.LearningRateController(
+        initial_learning_rate=parameters.learning_rate,
+        learning_rate_decay=parameters.learning_rate_decay,
+        periodicity=1))
+
+    # Same for the discount factor.
+    agent.attach(bc.DiscountFactorController(
+        initial_discount_factor=parameters.discount,
+        discount_factor_growth=parameters.discount_inc,
+        discount_factor_max=parameters.discount_max,
+        periodicity=1))
+
+    # As for the discount factor and the learning rate, one can periodically update the parameter of the epsilon-greedy
+    # policy implemented by the agent. This controller has a few more capabilities, as it allows one to choose more
+    # precisely when to update epsilon: after every X actions, episodes or epochs. This parameter can also be reset every
+    # episode or epoch (or never, hence the reset_every='none').
+    agent.attach(bc.EpsilonController(
+        initial_e=parameters.epsilon_start,
+        e_decays=parameters.epsilon_decay,
+        e_min=parameters.epsilon_min,
+        evaluate_on='action',
+        periodicity=1,
+        reset_every='none'))
+
+    # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one
+    # seems to generalize best, thus which one has the highest validation score. Here, we do not care about the
+    # "true generalization score", or "test score".
+    # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochController. It is
+    # important that the validationID is the same as the id argument of the InterleavedTestEpochController.
+    # The FindBestController will dump on disk the validation scores for each and every network, as well as the
+    # structure of the neural network having the best validation score. These dumps can then be used to plot the
+    # evolution of the validation and test scores (see below) or simply to recover the resulting neural network for
+    # your application.
+    agent.attach(bc.FindBestController(
+        validationID=test_env.VALIDATION_MODE,
+        testID=None,
+        unique_fname=fname))
+
+    # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a
+    # "validation epoch" between each training epoch (one out of two epochs, hence the periodicity=2). We do not want
+    # these validation epochs to interfere with the training of the agent, which is well established by the
+    # TrainerController, EpsilonController and the like. Therefore, we will disable these controllers for the whole
+    # duration of the validation epochs interleaved this way, using the controllers_to_disable argument of the
+    # InterleavedTestEpochController. For each validation epoch, we also want to display the sum of all rewards
+    # obtained, hence the show_score=True. Finally, we want to call the summarizePerformance method of the environment
+    # every [parameters.period_btw_summary_perfs] *validation* epochs.
+ agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.run(parameters.epochs, parameters.steps_per_epoch) + + # --- Show results --- + basename = "scores/" + fname + scores = joblib.load(basename + "_scores.jldump") + plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') + plt.legend() + plt.xlabel("Number of epochs") + plt.ylabel("Score") + plt.savefig(basename + "_scores.pdf") + plt.show() diff --git a/examples/simplest_test_PLI/test_env.py b/examples/simplest_test_PLI/test_env.py new file mode 100644 index 00000000..d60bfa2e --- /dev/null +++ b/examples/simplest_test_PLI/test_env.py @@ -0,0 +1,311 @@ +""" Interface with the test environment + +Authors: Vincent Francois-Lavet +""" +import numpy as np +import cv2 + +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self): + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + + self._actions = [0,1] + self._length_chain=10 + + + def reset(self, mode): + if mode == MyEnv.VALIDATION_MODE: + if self._mode != MyEnv.VALIDATION_MODE: + self._mode = MyEnv.VALIDATION_MODE + self._mode_score = 0.0 + self._mode_episode_count = 0 + else: + self._mode_episode_count += 1 + elif self._mode != -1: # and thus mode == -1 + self._mode = -1 + + self.state=np.zeros(self._length_chain) + self.state[0]=1 + + return self.state + + + def act(self, action): + action = self._actions[action] + + self.reward = 0 + if( self.state[-3]==1 and action==0): + self.reward = 1 + +# print self.state, action + + if (self.state[-2]==1): + self.state[-2]=0 + self.state[-1]=1 + elif (self.state[-1]==1): + self.state[-1]=0 + self.state[0]=1 + + for i in range(self._length_chain-2): + if(self.state[i]==1): + if (action==0): + self.state[i]=0 + self.state[i+1]=1 + else: + self.state[i]=0 + self.state[-2]=1 + break + + self._mode_score += self.reward + return self.reward + + def summarizePerformance(self, test_data_set, learning_algo): + #print "test_data_set.observations.shape" + #print test_data_set.observations()[0][0:1] + print "print test_data_set.observations()" + print test_data_set.observations() + n=self._length_chain-1 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) + historics=np.array(historics) + print "historics" + print historics + abs_states=learning_algo.encoder.predict(historics) + print "abs_states" + print abs_states + actions=test_data_set.actions()[0:n] + print "actions" + print actions + + print actions + print "test_data_set.rewards()[0:n]" + print test_data_set.rewards()[0:n] + print "test_data_set.terminals()[0:n]" + print test_data_set.terminals()[0:n] + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + + + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.cm as cm + m = 
cm.ScalarMappable(cmap=cm.jet) + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + z = np.array(abs_states)[:,2] + + #Colors + #onehot_actions = np.zeros((n, 4)) + #onehot_actions[np.arange(n), actions] = 1 + + fig = plt.figure() + ax = fig.add_subplot(111,projection='3d') + for j in range(3): + # Plot the trajectory + for i in xrange(n-1): + ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) + + # Plot the fitted one-step trajectory from time t=10 + for i in range(n-1): + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="1", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.5", alpha=0.5) + +# for xx in [-2,-1.,0, 1., 2.]: +# for yy in [-2,-1.,0, 1., 2.]: +# for zz in [-2,-1.,0, 1., 2.]: +# predicted1=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[1,0,0]])]) +# predicted2=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,1,0]])]) +# predicted3=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,0,1]])]) +# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted3[0,:1]]), np.concatenate([np.array([yy]),predicted3[0,1:2]]), np.concatenate([np.array([zz]),predicted3[0,2:]]), color="0", alpha=0.5) + #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") + + + # Plot the colorbar for the trajectory + fig.subplots_adjust(right=0.7) + ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) + # Set the colormap and norm to correspond to the data for which the colorbar will be used. + cmap = matplotlib.cm.cool + norm = matplotlib.colors.Normalize(vmin=0, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has + # everything needed for a standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks and labels. 
+ cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, + norm=norm, + orientation='vertical') + cb1.set_label('Beginning to end of trajectory') + + + # Plot the dots at each time step depending on the action taken + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + zrange=axes_lims[2][1]-axes_lims[2][0] + + # Plot the legend for the dots + from matplotlib.patches import Circle + from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker + box1 = TextArea(" Actions (action 0, action 1) : ", textprops=dict(color="k")) + + box2 = DrawingArea(60, 20, 0, 0) + el1 = Circle((10, 10), 5, fc="1", edgecolor="k") + el2 = Circle((30, 10), 5, fc="0.5", edgecolor="k") + #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") + box2.add_artist(el1) + box2.add_artist(el2) + #box2.add_artist(el3) + + box = HPacker(children=[box1, box2], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=box, pad=0., + frameon=True, + bbox_to_anchor=(0., 1.02), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + + + # Plot the Q_vals + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + #print "actions,C" + #print actions + #print c + #c=np.max(c,axis=1) + m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + + #plt.colorbar(m3) + ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.RdYlGn + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') + + + # fig_visuV + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + c=np.max(c,axis=1) + #print "c" + #print c + + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. 
+ cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') + + + # fig_visuR + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + + coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) + repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) + identity_matrix = np.diag(np.ones(self.nActions())) + tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) + + c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) + c=np.max(np.reshape(c,(125,self.nActions())),axis=1) + #print "c" + #print c + #mini=np.min(c) + #maxi=np.max(c) + + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + + matplotlib.pyplot.close("all") # avoids memory leaks + + def inputDimensions(self): + return [(1,self._length_chain)] + + def observationType(self, subject): + return np.float32 + + def nActions(self): + return len(self._actions) + + def observe(self): + return [np.array(self.state)] + + def inTerminalState(self): + if (self.state[-1]==1): + return True + else: + return False + + + +if __name__ == "__main__": + pass From cf662bd7d6c526babf3d057e8a5e9d5a82e74d37 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 14 Feb 2018 13:53:52 -0500 Subject: [PATCH 25/96] test_env2 --- examples/simplest_test_PLI/run_test2.py | 191 ++++++++++++++ examples/simplest_test_PLI/test_env2.py | 318 ++++++++++++++++++++++++ 2 files changed, 509 insertions(+) create mode 100644 examples/simplest_test_PLI/run_test2.py create mode 100644 examples/simplest_test_PLI/test_env2.py diff --git a/examples/simplest_test_PLI/run_test2.py b/examples/simplest_test_PLI/run_test2.py new file mode 100644 index 00000000..39995196 --- /dev/null +++ b/examples/simplest_test_PLI/run_test2.py @@ -0,0 +1,191 @@ +"""ALE launcher. See Wiki for more details about this experiment. 
+ +Authors: Vincent Francois-Lavet, David Taralla +""" + +import sys +import logging +import numpy as np +from joblib import hash, dump +import os + +from deer.default_parser import process_args +from deer.agent import NeuralAgent +from deer.q_networks.q_net_keras_lp import MyQNetwork +from test_env2 import MyEnv as test_env +import deer.experiment.base_controllers as bc + +from deer.policies import EpsilonGreedyPolicy + + +class Defaults: + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 500 + EPOCHS = 500 + STEPS_PER_TEST = 20 + PERIOD_BTW_SUMMARY_PERFS = 1 + + # ---------------------- + # Environment Parameters + # ---------------------- + FRAME_SKIP = 2 + + # ---------------------- + # DQN Agent parameters: + # ---------------------- + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.0002 + LEARNING_RATE_DECAY = 0.99 + DISCOUNT = 0.9 + DISCOUNT_INC = 1 + DISCOUNT_MAX = 0.99 + RMS_DECAY = 0.9 + RMS_EPSILON = 0.0001 + MOMENTUM = 0 + CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = .1 + EPSILON_DECAY = 10000 + UPDATE_FREQUENCY = 1 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + FREEZE_INTERVAL = 1000 + DETERMINISTIC = True + + + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # --- Parse parameters --- + parameters = process_args(sys.argv[1:], Defaults) + if parameters.deterministic: + rng = np.random.RandomState(123456) + else: + rng = np.random.RandomState() + + # --- Instantiate environment --- + env = test_env() + + # --- Instantiate qnetwork --- + qnetwork = MyQNetwork( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_delta, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng) + + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) + + # --- Instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # --- Create unique filename for FindBestController --- + h = hash(vars(parameters), hash_name="sha1") + fname = "test_" + h + print("The parameters hash is: {}".format(h)) + print("The parameters are: {}".format(parameters)) + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 
+    agent.attach(bc.LearningRateController(
+        initial_learning_rate=parameters.learning_rate,
+        learning_rate_decay=parameters.learning_rate_decay,
+        periodicity=1))
+
+    # Same for the discount factor.
+    agent.attach(bc.DiscountFactorController(
+        initial_discount_factor=parameters.discount,
+        discount_factor_growth=parameters.discount_inc,
+        discount_factor_max=parameters.discount_max,
+        periodicity=1))
+
+    # As for the discount factor and the learning rate, one can periodically update the parameter of the epsilon-greedy
+    # policy implemented by the agent. This controller has a few more capabilities, as it allows one to choose more
+    # precisely when to update epsilon: after every X actions, episodes or epochs. This parameter can also be reset every
+    # episode or epoch (or never, hence the reset_every='none').
+    agent.attach(bc.EpsilonController(
+        initial_e=parameters.epsilon_start,
+        e_decays=parameters.epsilon_decay,
+        e_min=parameters.epsilon_min,
+        evaluate_on='action',
+        periodicity=1,
+        reset_every='none'))
+
+    # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one
+    # seems to generalize best, thus which one has the highest validation score. Here, we do not care about the
+    # "true generalization score", or "test score".
+    # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochController. It is
+    # important that the validationID is the same as the id argument of the InterleavedTestEpochController.
+    # The FindBestController will dump on disk the validation scores for each and every network, as well as the
+    # structure of the neural network having the best validation score. These dumps can then be used to plot the
+    # evolution of the validation and test scores (see below) or simply to recover the resulting neural network for
+    # your application.
+    agent.attach(bc.FindBestController(
+        validationID=test_env.VALIDATION_MODE,
+        testID=None,
+        unique_fname=fname))
+
+    # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a
+    # "validation epoch" between each training epoch (one out of two epochs, hence the periodicity=2). We do not want
+    # these validation epochs to interfere with the training of the agent, which is well established by the
+    # TrainerController, EpsilonController and the like. Therefore, we will disable these controllers for the whole
+    # duration of the validation epochs interleaved this way, using the controllers_to_disable argument of the
+    # InterleavedTestEpochController. For each validation epoch, we also want to display the sum of all rewards
+    # obtained, hence the show_score=True. Finally, we want to call the summarizePerformance method of the environment
+    # every [parameters.period_btw_summary_perfs] *validation* epochs.
+ agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.run(parameters.epochs, parameters.steps_per_epoch) + + # --- Show results --- + basename = "scores/" + fname + scores = joblib.load(basename + "_scores.jldump") + plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') + plt.legend() + plt.xlabel("Number of epochs") + plt.ylabel("Score") + plt.savefig(basename + "_scores.pdf") + plt.show() diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py new file mode 100644 index 00000000..192982a0 --- /dev/null +++ b/examples/simplest_test_PLI/test_env2.py @@ -0,0 +1,318 @@ +""" Interface with the test environment + +Authors: Vincent Francois-Lavet +""" +import numpy as np +import cv2 + +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt +import copy + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self): + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + + self._actions = [0,1] + self._height=8 + self._width=5 #preferably an odd number so that it's symmetrical + + + def reset(self, mode): + if mode == MyEnv.VALIDATION_MODE: + if self._mode != MyEnv.VALIDATION_MODE: + self._mode = MyEnv.VALIDATION_MODE + self._mode_score = 0.0 + self._mode_episode_count = 0 + else: + self._mode_episode_count += 1 + elif self._mode != -1: # and thus mode == -1 + self._mode = -1 + + self.y=self._height-1 + self.x=self._width/2 + + return np.array([[0,0,0,1,0,1,0]]) #[0,0,1]+[0,1,0] + + + def act(self, action): + action = self._actions[action] + + if(action==0): + self.x = max(self.x-1,0) + if(action==1): + self.x = min(self.x+1,self._width-1) + + self.y = self.y-1 + + if(self.y==0 and self.x==self._width/2): + self.reward = 1 + else: + self.reward = 0 + + self._mode_score += self.reward + return self.reward + + def summarizePerformance(self, test_data_set, learning_algo): + #print "test_data_set.observations.shape" + #print test_data_set.observations()[0][0:1] + + possib_y = np.zeros((self._height-1, self._height)) + possib_y[np.arange(self._height-1), 1+np.arange(self._height-1)] = 1 + possib_x=np.diag(np.ones(self._width)) + rep_x=np.repeat(possib_x,self._height-1,axis=0) + rep_y=np.tile(possib_y,(self._width,1)) + all_possib_inp=np.expand_dims(np.concatenate((rep_y,rep_x),axis=1),axis=1) + all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) + print "learning_algo.encoder.predict(all_possib_inp)" + print all_possib_abs_states + + print "print test_data_set.observations()" + print test_data_set.observations() + n=self._height-1 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) + historics=np.array(historics) + print "historics" + print historics + abs_states=learning_algo.encoder.predict(historics) + print "abs_states" + print abs_states + actions=test_data_set.actions()[0:n] + print "actions" + print actions + + print actions + print "test_data_set.rewards()[0:n]" + print test_data_set.rewards()[0:n] + print "test_data_set.terminals()[0:n]" 
+ print test_data_set.terminals()[0:n] + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + + + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.cm as cm + m = cm.ScalarMappable(cmap=cm.jet) + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + z = np.array(abs_states)[:,2] + + #Colors + #onehot_actions = np.zeros((n, 4)) + #onehot_actions[np.arange(n), actions] = 1 + + fig = plt.figure() + ax = fig.add_subplot(111,projection='3d') + for j in range(3): + # Plot the trajectory + for i in xrange(n-1): + ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) + + # Plot the estimated transitions + for i in range(n-1): + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.5) + +# for xx in [-2,-1.,0, 1., 2.]: +# for yy in [-2,-1.,0, 1., 2.]: +# for zz in [-2,-1.,0, 1., 2.]: +# predicted1=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[1,0,0]])]) +# predicted2=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,1,0]])]) +# predicted3=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,0,1]])]) +# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted3[0,:1]]), np.concatenate([np.array([yy]),predicted3[0,1:2]]), np.concatenate([np.array([zz]),predicted3[0,2:]]), color="0", alpha=0.5) + #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") + + + # Plot the colorbar for the trajectory + fig.subplots_adjust(right=0.7) + ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) + # Set the colormap and norm to correspond to the data for which the colorbar will be used. + cmap = matplotlib.cm.cool + norm = matplotlib.colors.Normalize(vmin=0, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has + # everything needed for a standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks and labels. 
+ cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, + norm=norm, + orientation='vertical') + cb1.set_label('Beginning to end of trajectory') + + + # Plot the dots at each time step depending on the action taken + #line3 = ax.scatter(all_possib_abs_states[:,0], all_possib_abs_states[:,1] ,all_possib_abs_states[:,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.5) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=1.) + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + zrange=axes_lims[2][1]-axes_lims[2][0] + + # Plot the legend for the dots + from matplotlib.patches import Circle + from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker + box1 = TextArea(" Actions (action 0, action 1) : ", textprops=dict(color="k")) + + box2 = DrawingArea(60, 20, 0, 0) + el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k") + el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k") + #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") + box2.add_artist(el1) + box2.add_artist(el2) + #box2.add_artist(el3) + + box = HPacker(children=[box1, box2], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=box, pad=0., + frameon=True, + bbox_to_anchor=(0., 1.02), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + + + # Plot the Q_vals + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + #print "actions,C" + #print actions + #print c + #c=np.max(c,axis=1) + m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + + #plt.colorbar(m3) + ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.RdYlGn + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') + + + # fig_visuV + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + c=np.max(c,axis=1) + #print "c" + #print c + + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. 
There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') + + + # fig_visuR + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + + coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) + repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) + identity_matrix = np.diag(np.ones(self.nActions())) + tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) + + c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) + c=np.max(np.reshape(c,(125,self.nActions())),axis=1) + #print "c" + #print c + #mini=np.min(c) + #maxi=np.max(c) + + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + + matplotlib.pyplot.close("all") # avoids memory leaks + + def inputDimensions(self): + return [(1,self._height+self._width)] + + def observationType(self, subject): + return np.float32 + + def nActions(self): + return len(self._actions) + + def observe(self): + one_hot_x=np.zeros(self._width) + one_hot_x[self.x]=1 + one_hot_y=np.zeros(self._height) + one_hot_y[self.y]=1 + return [np.array(list(one_hot_y)+list(one_hot_x))] + + def inTerminalState(self): + if (self.y==0): + return True + else: + return False + + + +if __name__ == "__main__": + pass From 71969aca1fb129d369461f796fa9c5e0d935e6ae Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 14 Feb 2018 14:56:47 -0500 Subject: [PATCH 26/96] improve plot visibility --- examples/simplest_test_PLI/test_env.py | 38 +++++++++++++++++----- examples/simplest_test_PLI/test_env2.py | 42 ++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 13 deletions(-) diff --git a/examples/simplest_test_PLI/test_env.py b/examples/simplest_test_PLI/test_env.py index d60bfa2e..631f2591 100644 --- a/examples/simplest_test_PLI/test_env.py +++ b/examples/simplest_test_PLI/test_env.py @@ -125,8 +125,8 @@ def summarizePerformance(self, test_data_set, learning_algo): for i in range(n-1): predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), 
color="1", alpha=0.5) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.5", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.5) # for xx in [-2,-1.,0, 1., 2.]: # for yy in [-2,-1.,0, 1., 2.]: @@ -157,18 +157,18 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] # Plot the legend for the dots - from matplotlib.patches import Circle + from matplotlib.patches import Circle, Rectangle from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" Actions (action 0, action 1) : ", textprops=dict(color="k")) + box1 = TextArea(" State (action 0, action 1) : ", textprops=dict(color="k")) box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="1", edgecolor="k") - el2 = Circle((30, 10), 5, fc="0.5", edgecolor="k") + el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k") + el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k") #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") box2.add_artist(el1) box2.add_artist(el2) @@ -181,12 +181,33 @@ def summarizePerformance(self, test_data_set, learning_algo): anchored_box = AnchoredOffsetbox(loc=3, child=box, pad=0., frameon=True, - bbox_to_anchor=(0., 1.02), + bbox_to_anchor=(0., 1.07), bbox_transform=ax.transAxes, borderpad=0., ) ax.add_artist(anchored_box) + # Plot the legend for transition estimates + box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) + box2b = DrawingArea(60, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.75") + el2b = Rectangle((25, 10), 15,2, fc="0.25") + box2b.add_artist(el1b) + box2b.add_artist(el2b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') @@ -243,6 +264,7 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') cb1.set_label('Estimated expected return') + #plt.show() plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py index 192982a0..dd315d90 100644 --- a/examples/simplest_test_PLI/test_env2.py +++ b/examples/simplest_test_PLI/test_env2.py @@ -160,14 +160,14 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken #line3 = ax.scatter(all_possib_abs_states[:,0], all_possib_abs_states[:,1] ,all_possib_abs_states[:,2], 
s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.5) - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=1.) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.5, depthshade=True) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] # Plot the legend for the dots - from matplotlib.patches import Circle + from matplotlib.patches import Circle, Rectangle from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" Actions (action 0, action 1) : ", textprops=dict(color="k")) + box1 = TextArea(" State (action 0, action 1): ", textprops=dict(color="k")) box2 = DrawingArea(60, 20, 0, 0) el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k") @@ -176,7 +176,8 @@ def summarizePerformance(self, test_data_set, learning_algo): box2.add_artist(el1) box2.add_artist(el2) #box2.add_artist(el3) - + + box = HPacker(children=[box1, box2], align="center", pad=0, sep=5) @@ -184,12 +185,43 @@ def summarizePerformance(self, test_data_set, learning_algo): anchored_box = AnchoredOffsetbox(loc=3, child=box, pad=0., frameon=True, - bbox_to_anchor=(0., 1.02), + bbox_to_anchor=(0., 1.07), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + + # Plot the legend for transition estimates +# #Create custom artists +# simArtist = plt.Line2D((0,1),(0,0), color='0.75') +# anyArtist = plt.Line2D((0,1),(0,0), color='0.25') +# +# #Create legend from custom artist/label lists +# ax.legend([simArtist,anyArtist], +# ['est. tr. action 0', 'est. tr. action 1']) + box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) + box2b = DrawingArea(60, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.75") + el2b = Rectangle((25, 10), 15,2, fc="0.25") + box2b.add_artist(el1b) + box2b.add_artist(el2b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), bbox_transform=ax.transAxes, borderpad=0., ) ax.add_artist(anchored_box) + + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') From 98753c501e7bc10c1909e470b36791bf8ef6eb3c Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 14 Feb 2018 15:06:41 -0500 Subject: [PATCH 27/96] improve figs --- examples/simplest_test_PLI/test_env.py | 3 +++ examples/simplest_test_PLI/test_env2.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/examples/simplest_test_PLI/test_env.py b/examples/simplest_test_PLI/test_env.py index 631f2591..8f68406f 100644 --- a/examples/simplest_test_PLI/test_env.py +++ b/examples/simplest_test_PLI/test_env.py @@ -208,6 +208,9 @@ def summarizePerformance(self, test_data_set, learning_algo): ) ax.add_artist(anchored_box) + ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py index dd315d90..76905ca6 100644 --- a/examples/simplest_test_PLI/test_env2.py +++ b/examples/simplest_test_PLI/test_env2.py @@ -222,6 +222,9 @@ def summarizePerformance(self, test_data_set, learning_algo): + ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + 
ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') From 84d177b68506b1feb3717fbdfe9fba3c0b85c8a7 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 14 Feb 2018 21:45:29 -0500 Subject: [PATCH 28/96] improve plots --- deer/q_networks/NN_keras_lp.py | 6 +++--- examples/simplest_test_PLI/test_env.py | 22 +++++++++++----------- examples/simplest_test_PLI/test_env2.py | 20 ++++++++++---------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 6a7b5f3e..d8b4144e 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -173,9 +173,9 @@ def transition_model(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) - x = Dense(5, activation='tanh')(x) #5,15 - #x = Dense(10, activation='tanh')(x) # ,30 - x = Dense(5, activation='tanh')(x) #5,15 + #x = Dense(5, activation='tanh')(x) #5,15 + x = Dense(10, activation='tanh')(x) # ,30 + #x = Dense(5, activation='tanh')(x) #5,15 x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([inputs[0],x]) diff --git a/examples/simplest_test_PLI/test_env.py b/examples/simplest_test_PLI/test_env.py index 8f68406f..4239f4de 100644 --- a/examples/simplest_test_PLI/test_env.py +++ b/examples/simplest_test_PLI/test_env.py @@ -23,7 +23,7 @@ def __init__(self): self._mode_episode_count = 0 self._actions = [0,1] - self._length_chain=10 + self._length_chain=11 def reset(self, mode): @@ -50,8 +50,8 @@ def act(self, action): if( self.state[-3]==1 and action==0): self.reward = 1 -# print self.state, action - + # self.state[-2] is the end state + # at self.state[-1] the env is reset if (self.state[-2]==1): self.state[-2]=0 self.state[-1]=1 @@ -125,8 +125,8 @@ def summarizePerformance(self, test_data_set, learning_algo): for i in range(n-1): predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.5) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) # for xx in [-2,-1.,0, 1., 2.]: # for yy in [-2,-1.,0, 1., 2.]: @@ -164,11 +164,11 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the legend for the dots from matplotlib.patches import Circle, Rectangle from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" State (action 0, action 1) : ", textprops=dict(color="k")) + box1 = TextArea(" State representation (action 0, action 1) : ", textprops=dict(color="k")) box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k") - el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k") + el1 = Circle((10, 
10), 5, fc="0.75", edgecolor="k", alpha=0.75) + el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k", alpha=0.75) #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") box2.add_artist(el1) box2.add_artist(el2) @@ -190,8 +190,8 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the legend for transition estimates box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) box2b = DrawingArea(60, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.75") - el2b = Rectangle((25, 10), 15,2, fc="0.25") + el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.25", alpha=0.75) box2b.add_artist(el1b) box2b.add_artist(el2b) @@ -237,7 +237,7 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1.set_label('Estimated expected return') plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') - + #plt.show() # fig_visuV fig = plt.figure() diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py index 76905ca6..ee1ee49d 100644 --- a/examples/simplest_test_PLI/test_env2.py +++ b/examples/simplest_test_PLI/test_env2.py @@ -24,8 +24,8 @@ def __init__(self): self._mode_episode_count = 0 self._actions = [0,1] - self._height=8 - self._width=5 #preferably an odd number so that it's symmetrical + self._height=18 + self._width=11 #preferably an odd number so that it's symmetrical def reset(self, mode): @@ -127,8 +127,8 @@ def summarizePerformance(self, test_data_set, learning_algo): for i in range(n-1): predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.5) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.5) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) # for xx in [-2,-1.,0, 1., 2.]: # for yy in [-2,-1.,0, 1., 2.]: @@ -160,18 +160,18 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken #line3 = ax.scatter(all_possib_abs_states[:,0], all_possib_abs_states[:,1] ,all_possib_abs_states[:,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.5) - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.5, depthshade=True) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] # Plot the legend for the dots from matplotlib.patches import Circle, Rectangle from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" State (action 0, action 1): ", textprops=dict(color="k")) + box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) box2 = DrawingArea(60, 20, 0, 0) - el1 = 
Circle((10, 10), 5, fc="0.75", edgecolor="k") - el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k") + el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k", alpha=0.75) + el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k", alpha=0.75) #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") box2.add_artist(el1) box2.add_artist(el2) @@ -202,8 +202,8 @@ def summarizePerformance(self, test_data_set, learning_algo): # ['est. tr. action 0', 'est. tr. action 1']) box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) box2b = DrawingArea(60, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.75") - el2b = Rectangle((25, 10), 15,2, fc="0.25") + el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.25", alpha=0.75) box2b.add_artist(el1b) box2b.add_artist(el2b) From 46f4a40113d1fb0dd7dc80de9bf3b16c013210fb Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 14 Feb 2018 23:05:17 -0500 Subject: [PATCH 29/96] modif exploration --- examples/simplest_test_PLI/run_test2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simplest_test_PLI/run_test2.py b/examples/simplest_test_PLI/run_test2.py index 39995196..4d293e7c 100644 --- a/examples/simplest_test_PLI/run_test2.py +++ b/examples/simplest_test_PLI/run_test2.py @@ -46,7 +46,7 @@ class Defaults: MOMENTUM = 0 CLIP_DELTA = 1.0 EPSILON_START = 1.0 - EPSILON_MIN = .1 + EPSILON_MIN = .3 EPSILON_DECAY = 10000 UPDATE_FREQUENCY = 1 REPLAY_MEMORY_SIZE = 1000000 From 99488c1648d232f465744f4ac25a38fa38715a91 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 15 Feb 2018 11:51:14 -0500 Subject: [PATCH 30/96] modif PLE (with PLI) --- examples/PLE/PLE_env.py | 54 +++++++++++++++++++------ examples/PLE/run_PLE.py | 4 +- examples/simplest_test_PLI/test_env2.py | 15 +++---- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index b3c20156..b545affa 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -128,12 +128,12 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the fitted one-step trajectory from time t=10 for i in range(19): - predicted1=learning_algo.transition2.predict([abs_states[i:i+1],np.array([[1,0,0]])]) - predicted2=learning_algo.transition2.predict([abs_states[i:i+1],np.array([[0,1,0]])]) - predicted3=learning_algo.transition2.predict([abs_states[i:i+1],np.array([[0,0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="1", alpha=0.5) #white - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.5", alpha=0.5) #grey - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0", alpha=0.5) #black + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) + predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) #white + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), 
np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.5", alpha=0.75) #grey + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.25", alpha=0.75) #black # for xx in [-2,-1.,0, 1., 2.]: # for yy in [-2,-1.,0, 1., 2.]: @@ -164,19 +164,21 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(actions/2.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) + print np.tile(np.expand_dims(actions,axis=1),(1,3)) + print np.tile(np.expand_dims(0.75-actions/4.,axis=1),(1,3)) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(0.75-actions/4.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] # Plot the legend for the dots - from matplotlib.patches import Circle + from matplotlib.patches import Circle, Rectangle from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" Actions (none, left and right) : ", textprops=dict(color="k")) + box1 = TextArea(" State representation (action 0, 1 or 2) : ", textprops=dict(color="k")) #none, left and right box2 = DrawingArea(60, 20, 0, 0) - el3 = Circle((50, 10), 5, fc="w", edgecolor="k") - el2 = Circle((30, 10), 5, fc="grey", edgecolor="k") - el1 = Circle((10, 10), 5, fc="k", edgecolor="k") + el1 = Circle((10, 10), 5, fc="0.75", alpha=0.75, edgecolor="k") + el2 = Circle((30, 10), 5, fc="0.5", alpha=0.75, edgecolor="k") + el3 = Circle((50, 10), 5, fc="0.25", alpha=0.75, edgecolor="k") box2.add_artist(el1) box2.add_artist(el2) box2.add_artist(el3) @@ -188,12 +190,38 @@ def summarizePerformance(self, test_data_set, learning_algo): anchored_box = AnchoredOffsetbox(loc=3, child=box, pad=0., frameon=True, - bbox_to_anchor=(0., 1.02), + bbox_to_anchor=(0., 1.07), bbox_transform=ax.transAxes, borderpad=0., ) ax.add_artist(anchored_box) + # Plot the legend for transition estimates + box1b = TextArea(" Estimated transitions (action 0, 1 or 2): ", textprops=dict(color="k")) + box2b = DrawingArea(70, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.5", alpha=0.75) + el3b = Rectangle((45, 10), 15,2, fc="0.25", alpha=0.75) + box2b.add_artist(el1b) + box2b.add_artist(el2b) + box2b.add_artist(el3b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 19d28bcf..2407d411 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -38,7 +38,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.002 + LEARNING_RATE = 0.0001 LEARNING_RATE_DECAY = 0.99 DISCOUNT = 0.9 DISCOUNT_INC = 1 @@ -48,7 +48,7 @@ class Defaults: MOMENTUM = 0 CLIP_DELTA = 1.0 EPSILON_START = 1.0 - EPSILON_MIN = .1 + EPSILON_MIN = .3 EPSILON_DECAY = 10000 
UPDATE_FREQUENCY = 1 REPLAY_MEMORY_SIZE = 1000000 diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py index ee1ee49d..1e609b0b 100644 --- a/examples/simplest_test_PLI/test_env2.py +++ b/examples/simplest_test_PLI/test_env2.py @@ -25,7 +25,7 @@ def __init__(self): self._actions = [0,1] self._height=18 - self._width=11 #preferably an odd number so that it's symmetrical + self._width=7 #preferably an odd number so that it's symmetrical def reset(self, mode): @@ -40,7 +40,7 @@ def reset(self, mode): self._mode = -1 self.y=self._height-1 - self.x=self._width/2 + self.x=self._width//2 return np.array([[0,0,0,1,0,1,0]]) #[0,0,1]+[0,1,0] @@ -55,8 +55,10 @@ def act(self, action): self.y = self.y-1 - if(self.y==0 and self.x==self._width/2): + if(self.y==0 and self.x==self._width//2): self.reward = 1 + elif(self.y==0): + self.reward = -1 else: self.reward = 0 @@ -193,13 +195,6 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the legend for transition estimates -# #Create custom artists -# simArtist = plt.Line2D((0,1),(0,0), color='0.75') -# anyArtist = plt.Line2D((0,1),(0,0), color='0.25') -# -# #Create legend from custom artist/label lists -# ax.legend([simArtist,anyArtist], -# ['est. tr. action 0', 'est. tr. action 1']) box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) box2b = DrawingArea(60, 20, 0, 0) el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) From fb46fb32547e433931014b4ad79aec7cd849a055 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 15 Feb 2018 12:04:28 -0500 Subject: [PATCH 31/96] improve plot --- examples/PLE/PLE_env.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index b545affa..8099ef69 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -131,9 +131,9 @@ def summarizePerformance(self, test_data_set, learning_algo): predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) #white - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.5", alpha=0.75) #grey - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.25", alpha=0.75) #black + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.23", alpha=0.75) #black + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.57", alpha=0.75) #grey + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) #white # for xx in [-2,-1.,0, 1., 2.]: # for yy in [-2,-1.,0, 1., 2.]: @@ -166,7 +166,7 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken print 
np.tile(np.expand_dims(actions,axis=1),(1,3)) print np.tile(np.expand_dims(0.75-actions/4.,axis=1),(1,3)) - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(0.75-actions/4.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(0.9-actions/3.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] @@ -176,9 +176,9 @@ def summarizePerformance(self, test_data_set, learning_algo): box1 = TextArea(" State representation (action 0, 1 or 2) : ", textprops=dict(color="k")) #none, left and right box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="0.75", alpha=0.75, edgecolor="k") - el2 = Circle((30, 10), 5, fc="0.5", alpha=0.75, edgecolor="k") - el3 = Circle((50, 10), 5, fc="0.25", alpha=0.75, edgecolor="k") + el1 = Circle((10, 10), 5, fc="0.9", alpha=0.75, edgecolor="k") + el2 = Circle((30, 10), 5, fc="0.57", alpha=0.75, edgecolor="k") + el3 = Circle((50, 10), 5, fc="0.23", alpha=0.75, edgecolor="k") box2.add_artist(el1) box2.add_artist(el2) box2.add_artist(el3) @@ -199,9 +199,9 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the legend for transition estimates box1b = TextArea(" Estimated transitions (action 0, 1 or 2): ", textprops=dict(color="k")) box2b = DrawingArea(70, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) - el2b = Rectangle((25, 10), 15,2, fc="0.5", alpha=0.75) - el3b = Rectangle((45, 10), 15,2, fc="0.25", alpha=0.75) + el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.57", alpha=0.75) + el3b = Rectangle((45, 10), 15,2, fc="0.23", alpha=0.75) box2b.add_artist(el1b) box2b.add_artist(el2b) box2b.add_artist(el3b) From 04639a4dae761ee2b181aceb8e340b96bb46c78a Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 15 Feb 2018 19:55:48 -0500 Subject: [PATCH 32/96] implementation planning d --- deer/q_networks/q_net_keras_lp.py | 57 +++++++++++++++---------- examples/simplest_test_PLI/run_test2.py | 2 +- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 0a49f4f5..cb5a0019 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -370,7 +370,7 @@ def qValues(self, state_val): """ return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] - def qValues_planning(self, state_val, d=2.): + def qValues_planning(self, state_val, d=2): """ Get the q values for one belief state with a planning depth d Arguments @@ -382,31 +382,42 @@ def qValues_planning(self, state_val, d=2.): ------- The q values with planning depth d for the provided belief state """ - identity_matrix = np.diag(np.ones(self._n_actions)) - encoded_x = self.encoder.predict([np.expand_dims(state,axis=0) for state in state_val]) - - q_vals_d0=self.Q.predict([encoded_x])[0] - #print "q_vals_d0" - #print q_vals_d0 - #tile3_encoded_x=np.array([enc for enc in encoded_x for i in range(self._n_actions)]) - tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) - #print tile3_encoded_x - r_vals_d0=np.array(self.R.predict([tile3_encoded_x,identity_matrix])).reshape((self._n_actions)) + QD_plan=0 + for i in range(d+1): #TO DO: improve planning algorithm + Qd=self.qValues_planning_abstr(encoded_x, d=i) + QD_plan+=Qd + print "Qd,i" + 
print Qd,i + QD_plan=QD_plan/(d+1) - #tile3_state_val=np.array([state for state in state_val for i in range(self._n_actions)]) - #tile3_state_val=np.tile(state_val,(3,1,1,1)) + print "QD_plan" + print QD_plan + + return QD_plan + + def qValues_planning_abstr(self, state_abstr_val, d): + """ + """ + #print "qValues_planning_abstr d" + #print d + n=len(state_abstr_val) + identity_matrix = np.diag(np.ones(self._n_actions)) + if (d==0): + #print self.Q.predict([state_abstr_val]) + return self.Q.predict([state_abstr_val]) + else: + tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1)) + repeat_identity=np.repeat(identity_matrix,len(state_abstr_val),axis=0) + #print tile3_encoded_x + #print repeat_identity + r_vals_d0=np.array(self.R.predict([tile3_encoded_x,repeat_identity])) + #print r_vals_d0 + r_vals_d0=r_vals_d0.flatten() + next_x_predicted=self.transition.predict([tile3_encoded_x,repeat_identity]) + return r_vals_d0+self._df*np.amax(self.qValues_planning_abstr(next_x_predicted,d=d-1).reshape(len(state_abstr_val)*self._n_actions,self._n_actions),axis=1).flatten() - next_x_predicted=self.transition2.predict([tile3_encoded_x,identity_matrix]) - q_vals_d1=self.Q.predict([next_x_predicted]) - #print q_vals_d1 - #print [np.max(vals) for vals in q_vals_d1] - #print r_vals_d0 - #print (1-1/d)+(1-1/d)**2 - #print "r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1])" - #print r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1]) - #print ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*np.array([np.max(vals) for vals in q_vals_d1]) - return ((1-1/d)+(1-1/d)**2)*np.array(q_vals_d0)+((1-1/d)**2)*(r_vals_d0+self._df*np.array([np.max(vals) for vals in q_vals_d1])) + def chooseBestAction(self, state): """ Get the best action for a belief state diff --git a/examples/simplest_test_PLI/run_test2.py b/examples/simplest_test_PLI/run_test2.py index 4d293e7c..507a122d 100644 --- a/examples/simplest_test_PLI/run_test2.py +++ b/examples/simplest_test_PLI/run_test2.py @@ -24,7 +24,7 @@ class Defaults: # ---------------------- STEPS_PER_EPOCH = 500 EPOCHS = 500 - STEPS_PER_TEST = 20 + STEPS_PER_TEST = 100 PERIOD_BTW_SUMMARY_PERFS = 1 # ---------------------- From dc3006d7d4b793ed72b9c70f2e315d685f3a5128 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 15 Feb 2018 19:57:21 -0500 Subject: [PATCH 33/96] removing useless trick roll --- deer/q_networks/q_net_keras_lp.py | 33 +++++++++++-------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index cb5a0019..fd92d0c7 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -198,28 +198,19 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Increase the entropy in the abstract features of two states # This is done only when states_val is made up of only one observation --> FIXME rolled=np.roll(states_val[0],1,axis=0) - #print "states_val[0]" - #print states_val[0] - #print "rolled" - #print rolled - for i in range(self._batch_size): - j=0 - l=0 - while((states_val[0][i]==rolled[i+j-l]).all()): - if(i+j==31): - l=self._batch_size - if(j==31): - break - j=j+1 - rolled[i]=rolled[i+j-l] - #print "rolled" - #print rolled +# for i in range(self._batch_size): +# j=0 +# l=0 +# while((states_val[0][i]==rolled[i+j-l]).all()): +# if(i+j==31): +# l=self._batch_size +# if(j==31): +# break +# j=j+1 +# rolled[i]=rolled[i+j-l] 
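The qValues_planning / qValues_planning_abstr pair introduced above averages Q estimates obtained by expanding the learned transition and reward models over depths 0..d in the abstract state space; the recursion is Q_d(x,a) = r(x,a) + gamma * max_a' Q_{d-1}(T(x,a), a'). A minimal NumPy sketch of that recursion and of the depth averaging, with toy linear stand-ins for the Keras models (W_T, W_R, W_Q and all helper names here are assumptions, not code from the repository):

    import numpy as np

    n_actions = 3
    gamma = 0.9
    rng = np.random.RandomState(0)
    W_T = rng.normal(scale=0.1, size=(3 + n_actions, 3))   # toy transition weights
    W_R = rng.normal(scale=0.1, size=(3 + n_actions,))     # toy reward weights
    W_Q = rng.normal(scale=0.1, size=(3, n_actions))       # toy Q weights

    def transition(x, a_onehot):
        # residual update of the abstract state, mirroring the Add() in the transition model
        return x + np.concatenate([x, a_onehot], axis=1).dot(W_T)

    def reward(x, a_onehot):
        return np.concatenate([x, a_onehot], axis=1).dot(W_R)

    def q_values(x):
        return x.dot(W_Q)

    def q_plan(x, d):
        # Q_0 is the model-free estimate; deeper estimates expand the model d times
        if d == 0:
            return q_values(x)
        eye = np.eye(n_actions)
        x_rep = np.tile(x, (n_actions, 1))        # each abstract state repeated per action block
        a_rep = np.repeat(eye, len(x), axis=0)    # matching one-hot actions
        r = reward(x_rep, a_rep)
        next_x = transition(x_rep, a_rep)
        backup = np.amax(q_plan(next_x, d - 1), axis=1)
        return (r + gamma * backup).reshape(n_actions, len(x)).T   # back to (batch, n_actions)

    x = rng.normal(size=(1, 3))                   # one abstract state
    d = 2
    q_avg = np.mean([q_plan(x, i) for i in range(d + 1)], axis=0)  # average over depths 0..d
    print(q_avg)

Averaging over the depths keeps the purely model-free estimate (d=0) in the mix, so errors in the learned transition and reward models are only partially propagated into the planned values.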
self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - #print self.loss_disambiguate2 - #print "states_val[0]" - #print states_val[0] - #print rolled - #print self.loss_disambiguate1 + + #self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) # Disentangle actions From cfaf66e1fbe40bdf8122a89cb3c8d781af6800ae Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 16 Feb 2018 09:20:02 -0500 Subject: [PATCH 34/96] uniformize env so that test22 and PLE are similar --- deer/q_networks/NN_keras_lp.py | 1 + deer/q_networks/q_net_keras_lp.py | 8 ++++---- examples/PLE/PLE_env.py | 4 ++-- examples/PLE/run_PLE.py | 2 +- examples/simplest_test_PLI/run_test2.py | 2 +- examples/simplest_test_PLI/test_env2.py | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index d8b4144e..89433bcf 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -175,6 +175,7 @@ def transition_model(self): x = Concatenate()(inputs)#,axis=-1) #x = Dense(5, activation='tanh')(x) #5,15 x = Dense(10, activation='tanh')(x) # ,30 + x = Dense(10, activation='tanh')(x) # ,30 #x = Dense(5, activation='tanh')(x) #5,15 x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' x = Add()([inputs[0],x]) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index fd92d0c7..a5a41028 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -378,12 +378,12 @@ def qValues_planning(self, state_val, d=2): for i in range(d+1): #TO DO: improve planning algorithm Qd=self.qValues_planning_abstr(encoded_x, d=i) QD_plan+=Qd - print "Qd,i" - print Qd,i + #print "Qd,i" + #print Qd,i QD_plan=QD_plan/(d+1) - print "QD_plan" - print QD_plan + #print "QD_plan" + #print QD_plan return QD_plan diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 8099ef69..7bccb4b7 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -88,7 +88,7 @@ def act(self, action): def summarizePerformance(self, test_data_set, learning_algo): #print "test_data_set.observations.shape" #print test_data_set.observations()[0][0:1] - n=20 + n=14 historics=[] for i,observ in enumerate(test_data_set.observations()[0][0:n+1]): if(i Date: Sun, 18 Feb 2018 18:16:18 -0500 Subject: [PATCH 35/96] working --- deer/q_networks/NN_keras_lp.py | 11 ++++++----- deer/q_networks/q_net_keras_lp.py | 4 ++-- examples/PLE/PLE_env.py | 18 ++++++++++-------- examples/PLE/run_PLE.py | 6 +++--- examples/simplest_test_PLI/test_env2.py | 16 +++++++--------- 5 files changed, 28 insertions(+), 27 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 89433bcf..274f451f 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -54,7 +54,7 @@ def encoder_model(self): if len(dim) == 3: input = Input(shape=(dim[0],dim[1],dim[2])) inputs.append(input) - x = Conv2D(8, (4, 4), padding='same', activation='tanh')(input) + x = Conv2D(4, (4, 4), padding='same', activation='tanh')(input) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(8, (4, 4), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) @@ -111,8 +111,9 @@ def encoder_model(self): x= 
outs_conv [0] # we stack a deep fully-connected network on top - x = Dense(20, activation='relu')(x) - x = Dense(10, activation='relu')(x) + x = Dense(50, activation='tanh')(x) + x = Dense(20, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' @@ -173,8 +174,8 @@ def transition_model(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) - #x = Dense(5, activation='tanh')(x) #5,15 - x = Dense(10, activation='tanh')(x) # ,30 + x = Dense(10, activation='tanh')(x) #5,15 +# x = Dense(30, activation='tanh')(x) # ,30 x = Dense(10, activation='tanh')(x) # ,30 #x = Dense(5, activation='tanh')(x) #5,15 x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index a5a41028..a4ce5ee1 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -12,7 +12,7 @@ from .NN_keras_lp import NN # Default Neural network used def mean_squared_error(y_true, y_pred): - return K.mean(K.square(y_pred - y_true), axis=-1) # = mse error + return K.clip(K.mean(K.square(y_pred - y_true), axis=-1)-1,0.,100.) # = mse error def exp_dec_error(y_true, y_pred): return K.exp( - 2.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred @@ -487,7 +487,7 @@ def setLearningRate(self, lr): K.set_value(self.transition2.optimizer.lr, self._lr/2.) - K.set_value(self.encoder.optimizer.lr, self._lr/5.) + K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) K.set_value(self.diff_s_s_.optimizer.lr, self._lr) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 7bccb4b7..8b7e9bc9 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -62,7 +62,7 @@ def reset(self, mode): self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) - return [2 * [48 * [48 * [0]]]] + return [1 * [48 * [48 * [0]]]] def act(self, action): @@ -90,16 +90,18 @@ def summarizePerformance(self, test_data_set, learning_algo): #print test_data_set.observations()[0][0:1] n=14 historics=[] - for i,observ in enumerate(test_data_set.observations()[0][0:n+1]): - if(i0): - historics[i-1]=np.concatenate([historics[i-1],np.expand_dims(observ,axis=0)], axis=0) + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) +# for i,observ in enumerate(test_data_set.observations()[0][0:n+1]): +# if(i0): +# historics[i-1]=np.concatenate([historics[i-1],np.expand_dims(observ,axis=0)], axis=0) historics=np.array(historics) #print historics abs_states=learning_algo.encoder.predict(historics) print abs_states - actions=test_data_set.actions()[1:n+1] #instead of 0:n because history of 2 time steps considered + actions=test_data_set.actions()[0:n] #instead of 0:n because history of 2 time steps considered print actions print test_data_set.rewards()[0:n] if self.inTerminalState() == False: @@ -324,7 +326,7 @@ def summarizePerformance(self, test_data_set, learning_algo): matplotlib.pyplot.close("all") # avoids memory leaks def inputDimensions(self): - return [(2, 48, 48)] + return [(1, 48, 48)] def observationType(self, subject): return np.float32 diff --git a/examples/PLE/run_PLE.py 
b/examples/PLE/run_PLE.py index f97528ff..46700242 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -39,7 +39,7 @@ class Defaults: # ---------------------- UPDATE_RULE = 'rmsprop' LEARNING_RATE = 0.0001 - LEARNING_RATE_DECAY = 0.99 + LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 DISCOUNT_MAX = 0.99 @@ -48,13 +48,13 @@ class Defaults: MOMENTUM = 0 CLIP_DELTA = 1.0 EPSILON_START = 1.0 - EPSILON_MIN = .3 + EPSILON_MIN = .8 EPSILON_DECAY = 10000 UPDATE_FREQUENCY = 1 REPLAY_MEMORY_SIZE = 1000000 BATCH_SIZE = 32 FREEZE_INTERVAL = 1000 - DETERMINISTIC = True + DETERMINISTIC = False diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py index 81aaf57c..56e6998c 100644 --- a/examples/simplest_test_PLI/test_env2.py +++ b/examples/simplest_test_PLI/test_env2.py @@ -132,16 +132,13 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) -# for xx in [-2,-1.,0, 1., 2.]: -# for yy in [-2,-1.,0, 1., 2.]: -# for zz in [-2,-1.,0, 1., 2.]: -# predicted1=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[1,0,0]])]) -# predicted2=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,1,0]])]) -# predicted3=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,0,1]])]) +# for xx in np.arange(self._width)-self._width//2: +# for yy in np.arange(self._width)-self._width//2: +# for zz in np.arange(self._width)-self._width//2: +# predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) +# predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) # ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) # ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted3[0,:1]]), np.concatenate([np.array([yy]),predicted3[0,1:2]]), np.concatenate([np.array([zz]),predicted3[0,2:]]), color="0", alpha=0.5) - #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") # Plot the colorbar for the trajectory @@ -161,7 +158,7 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken - #line3 = ax.scatter(all_possib_abs_states[:,0], all_possib_abs_states[:,1] ,all_possib_abs_states[:,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.5) + line3 = ax.scatter(all_possib_abs_states[:,0], all_possib_abs_states[:,1] ,all_possib_abs_states[:,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.5) line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] @@ -245,6 +242,7 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1 = 
matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') cb1.set_label('Estimated expected return') + plt.show() plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') From f356a4fdb1fb2025a46d685ae1fb6dcb2a3406a7 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 7 Mar 2018 17:41:41 -0500 Subject: [PATCH 36/96] agent gathering_data option and first draft sticky_action --- deer/agent.py | 54 +++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index d551a849..287614c2 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -86,6 +86,8 @@ def __init__(self, environment, q_network, replay_memory_size=1000000, replay_st self._test_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.) else: self._test_policy = test_policy + self.gathering_data=True # Whether the agent is gathering data or not + self.sticky_action=1 # Number of times the agent is forced to take the same action as part of one actual time step def setControllersActive(self, toDisable, active): """ Activate controller @@ -311,24 +313,27 @@ def _runEpisode(self, maxSteps): self._state[i][1:] = initState[i][1:] self._Vs_on_last_episode = [] + is_terminal=False + reward=0 while maxSteps > 0: maxSteps -= 1 - - obs = self._environment.observe() - - for i in range(len(obs)): - self._state[i][0:-1] = self._state[i][1:] - self._state[i][-1] = obs[i] - - V, action, reward = self._step() - - self._Vs_on_last_episode.append(V) - if self._mode != -1: - self._total_mode_reward += reward - - is_terminal = self._environment.inTerminalState() + if(self.gathering_data==True or self._mode!=-1): + obs = self._environment.observe() + + for i in range(len(obs)): + self._state[i][0:-1] = self._state[i][1:] + self._state[i][-1] = obs[i] + + V, action, reward = self._step() - self._addSample(obs, action, reward, is_terminal) + self._Vs_on_last_episode.append(V) + if self._mode != -1: + self._total_mode_reward += reward + + is_terminal = self._environment.inTerminalState() + + self._addSample(obs, action, reward, is_terminal) + for c in self._controllers: c.onActionTaken(self) if is_terminal: @@ -359,8 +364,10 @@ def _step(self): Estimated value function of current state. """ - action, V = self._chooseAction() - reward = self._environment.act(action) + action, V = self._chooseAction() + reward=0 + for i in range(self.sticky_action): + reward += self._environment.act(action) return V, action, reward @@ -448,6 +455,7 @@ def __init__(self, env, random_state=None, max_size=1000, use_priority=False, on self._random_state = random_state self.n_elems = 0 + self.sticky_action=1 # Number of times the agent is forced to take the same action as part of one actual time step def actions(self): """Get all actions currently in the replay memory, ordered by time where they were taken.""" @@ -521,7 +529,7 @@ def randomBatch(self, size, use_priority): trajectories are too short). """ - if (self._max_history_size - 1 >= self.n_elems): + if (self._max_history_size - self.sticky_action >= self.n_elems): raise SliceError( "Not enough elements in the dataset to create a " "complete state. {} elements in dataset; requires {}" @@ -536,10 +544,10 @@ def randomBatch(self, size, use_priority): rndValidIndices = np.zeros(size, dtype='int32') if (self._only_full_history): for i in range(size): # TODO: multithread this loop? 
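The sticky-action draft in this agent.py diff makes _step repeat the chosen action self.sticky_action times and accumulate the environment reward over those repetitions, which is also why the replay-memory indexing in randomBatch is shifted by sticky_action. A minimal sketch of the stepping part only (the toy environment and function names are assumptions, not the deer API):

    class ToyEnv(object):
        def __init__(self):
            self.t = 0
        def act(self, action):
            self.t += 1
            return 1.0 if action == 1 else 0.0

    def sticky_step(env, action, sticky_action=4):
        # same action applied at every micro-step, rewards summed into one transition
        reward = 0.0
        for _ in range(sticky_action):
            reward += env.act(action)
        return reward

    env = ToyEnv()
    print(sticky_step(env, action=1))   # 4.0 with the toy environment above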
- rndValidIndices[i] = self._randomValidStateIndex(self._max_history_size) + rndValidIndices[i] = self._randomValidStateIndex(self._max_history_size+self.sticky_action-1) else: for i in range(size): # TODO: multithread this loop? - rndValidIndices[i] = self._randomValidStateIndex(minimum_without_terminal=1) + rndValidIndices[i] = self._randomValidStateIndex(minimum_without_terminal=self.sticky_action) actions = np.vstack( self._actions.getSliceBySeq(rndValidIndices) ) @@ -549,11 +557,11 @@ def randomBatch(self, size, use_priority): states = np.zeros(len(self._batch_dimensions), dtype='object') next_states = np.zeros_like(states) # We calculate the first terminal index backward in time and set it - # at maximum to the value self._max_history_size + # at maximum to the value self._max_history_size+self.sticky_action-1 first_terminals=[] for rndValidIndex in rndValidIndices: first_terminal=1 - while first_terminalrndValidIndex): break first_terminal+=1 @@ -563,7 +571,7 @@ def randomBatch(self, size, use_priority): states[input] = np.zeros((size,) + self._batch_dimensions[input], dtype=self._observations[input].dtype) next_states[input] = np.zeros_like(states[input]) for i in range(size): - slice=self._observations[input].getSlice(rndValidIndices[i]+1-min(self._batch_dimensions[input][0],first_terminals[i]), rndValidIndices[i]+1) + slice=self._observations[input].getSlice(rndValidIndices[i]-self.sticky_action+2-min(self._batch_dimensions[input][0],first_terminals[i]+self.sticky_action-1), rndValidIndices[i]-self.sticky_action+2) if (len(slice)==len(states[input][i])): states[input][i] = slice else: From 01f65330322ca96794020819d71df451cf577621 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 7 Mar 2018 17:49:24 -0500 Subject: [PATCH 37/96] a few modifs --- deer/q_networks/NN_keras_lp.py | 124 ++++++------ deer/q_networks/q_net_keras_lp.py | 65 ++++--- examples/ALE/ALE_env.py | 8 +- examples/PLE/PLE_env.py | 238 +++++++++++++----------- examples/PLE/run_PLE.py | 10 +- examples/simplest_test_PLI/run_test2.py | 2 +- examples/simplest_test_PLI/test_env2.py | 42 ++++- 7 files changed, 274 insertions(+), 215 deletions(-) diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py index 274f451f..abcc0312 100644 --- a/deer/q_networks/NN_keras_lp.py +++ b/deer/q_networks/NN_keras_lp.py @@ -54,12 +54,13 @@ def encoder_model(self): if len(dim) == 3: input = Input(shape=(dim[0],dim[1],dim[2])) inputs.append(input) - x = Conv2D(4, (4, 4), padding='same', activation='tanh')(input) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (4, 4), padding='same', activation='tanh')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (4, 4), padding='same', activation='tanh')(x) + x=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' + x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) out = Flatten()(x) @@ -111,8 +112,9 @@ def encoder_model(self): x= outs_conv [0] # we stack a deep fully-connected network on top + x = Dense(200, activation='tanh')(x) + x = Dense(100, activation='tanh')(x) x = Dense(50, activation='tanh')(x) - x = Dense(20, 
activation='tanh')(x) x = Dense(10, activation='tanh')(x) x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' @@ -175,7 +177,8 @@ def transition_model(self): x = Concatenate()(inputs)#,axis=-1) x = Dense(10, activation='tanh')(x) #5,15 -# x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(30, activation='tanh')(x) # ,30 x = Dense(10, activation='tanh')(x) # ,30 #x = Dense(5, activation='tanh')(x) #5,15 x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' @@ -253,7 +256,6 @@ def diff_Tx_x_(self,encoder_model,transition_model): Tx= transition_model([enc_x,inputs[-1]]) x = Subtract()([Tx,enc_x_]) -# x = Dot(axes=-1, normalize=False)([x,x]) model = Model(inputs=inputs, outputs=x ) @@ -300,50 +302,50 @@ def diff_s_s_(self,encoder_model): return model -# def diff_sa_sa(self,encoder_model,transition_model): -# """ -# -# Parameters -# ----------- -# s -# a -# rand_a -# -# Returns -# ------- -# model with output Tx (= model estimate of x') -# -# """ -# inputs=[] -# -# for i, dim in enumerate(self._input_dimensions): -# if len(dim) == 3: -# input = Input(shape=(dim[0],dim[1],dim[2])) -# inputs.append(input) -# -# elif len(dim) == 2: -# input = Input(shape=(dim[0],dim[1])) -# inputs.append(input) -# -# else: -# input = Input(shape=(dim[0],)) -# inputs.append(input) -# -# input = Input(shape=(self._n_actions,)) -# inputs.append(input) -# input = Input(shape=(self._n_actions,)) -# inputs.append(input) -# -# enc_x = encoder_model(inputs[:-2]) #s --> x -# Tx= transition_model([enc_x,inputs[-2]]) -# rand_Tx= transition_model([enc_x,inputs[-1]]) -# -# x = Subtract()([Tx,rand_Tx]) -# x = Dot(axes=-1, normalize=False)([x,x]) -# -# model = Model(inputs=inputs, outputs=x ) -# -# return model + def diff_sa_sa(self,encoder_model,transition_model): + """ + + Parameters + ----------- + s + a + rand_a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + input = Input(shape=(self._n_actions,)) + inputs.append(input) + input = Input(shape=(self._n_actions,)) + inputs.append(input) + + enc_x = encoder_model(inputs[:-2]) #s --> x + Tx= transition_model([enc_x,inputs[-2]]) + rand_Tx= transition_model([enc_x,inputs[-1]]) + + x = Subtract()([Tx,rand_Tx]) + x = Dot(axes=-1, normalize=False)([x,x]) + + model = Model(inputs=inputs, outputs=x ) + + return model def diff_Tx(self,transition_model): """ @@ -392,9 +394,10 @@ def R_model(self): inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x - x = Concatenate()(inputs[:1]+inputs[1:])#,axis=-1) - x = Dense(20, activation='relu')(inputs[0]) - #x = Dense(10, activation='relu')(inputs[0]) + x = Concatenate()(inputs)#,axis=-1) + x = Dense(10, activation='tanh')(x) + x = Dense(20, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) out = Dense(1)(x) @@ -457,8 +460,9 @@ def Q_model(self): #x = Add()([x,inputs[-1]]) #???? 
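The diff_sa_sa model re-enabled above encodes a state once, pushes the abstract state through the transition model with the taken action and with a second, randomly drawn action, and outputs the distance between the two predicted next abstract states; trained with the exponential loss, it discourages different actions from collapsing onto the same prediction. A rough NumPy sketch of the penalty shape only (a simplification of the Keras graph; the function name and the exact use of the distance are assumptions):

    import numpy as np

    def action_disentangle_penalty(tx_a, tx_a_rand, beta=5.0):
        # tx_a, tx_a_rand: predicted next abstract states for two different actions, shape (batch, dim)
        dist = np.sqrt(np.clip(np.sum((tx_a - tx_a_rand) ** 2, axis=-1), 1e-6, 10.0))
        return np.exp(-beta * dist)        # small when the two predictions are well separated

    tx_a = np.array([[0.2, 0.0, 0.1]])
    print(action_disentangle_penalty(tx_a, np.array([[0.2, 0.0, 0.1]])))   # ~1: predictions collapsed
    print(action_disentangle_penalty(tx_a, np.array([[1.0, -0.5, 0.4]])))  # ~0: predictions separated

The clip on the squared distance mirrors the one inside exp_dec_error and keeps the gradient finite when the two predictions coincide exactly.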
# we stack a deep fully-connected network on top - x = Dense(50, activation='relu')(inputs[0]) - x = Dense(20, activation='relu')(x) + x = Dense(20, activation='tanh')(inputs[0]) + x = Dense(50, activation='tanh')(x) + x = Dense(20, activation='tanh')(x) #if (self._action_as_input==False): # if ( isinstance(self._n_actions,int)): @@ -500,12 +504,10 @@ def full_Q_model(self, encoder_model, Q_model): else: input = Input(shape=(dim[0],)) inputs.append(input) - - + + out = encoder_model(inputs) input = Input(shape=(self.internal_dim,)) inputs.append(input) - - out = encoder_model(inputs[:-1]) x=Add()([out,inputs[-1]]) # adding noise in the abstract state space diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index a4ce5ee1..f9669332 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -9,13 +9,14 @@ from keras.optimizers import SGD,RMSprop from keras import backend as K from ..base_classes import QNetwork -from .NN_keras_lp import NN # Default Neural network used +from .NN_keras_lp_high_int_dim import NN # Default Neural network used def mean_squared_error(y_true, y_pred): - return K.clip(K.mean(K.square(y_pred - y_true), axis=-1)-1,0.,100.) # = mse error + return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error + #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) ) , axis=-1 ) # = mse error def exp_dec_error(y_true, y_pred): - return K.exp( - 2.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred + return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred def rms_from_squared_components(y_true, y_pred): return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 @@ -80,7 +81,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.loss_disambiguate2=0 - self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=True) self.encoder = self.learn_and_plan.encoder_model() self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) @@ -104,7 +105,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de # self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) # used to disentangle actions - #self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) + self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) layers=self.full_Q.layers # Grab all the parameters together. 
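The redefined mean_squared_error above subtracts 1 from the per-sample MSE before clipping at zero, which turns it into a deadzone loss: samples whose MSE stays below 1 contribute nothing, only larger errors are penalized. A short NumPy illustration of that behaviour (a plain mirror of the Keras expression; the threshold argument is an assumption added for readability):

    import numpy as np

    def deadzone_mse(y_true, y_pred, threshold=1.0):
        # NumPy mirror of: K.clip(K.mean(K.square(y_pred - y_true), axis=-1) - 1, 0., 100.)
        mse = np.mean((y_pred - y_true) ** 2, axis=-1)
        return np.clip(mse - threshold, 0.0, 100.0)

    y_true = np.zeros((2, 3))
    print(deadzone_mse(y_true, np.full((2, 3), 0.5)))   # per-sample MSE = 0.25 -> loss 0 (inside deadzone)
    print(deadzone_mse(y_true, np.full((2, 3), 2.0)))   # per-sample MSE = 4.0  -> loss 3

The commented-out variant on the next line of the diff applies the same idea per component (clipping the absolute error before squaring) rather than per sample.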
@@ -158,13 +159,14 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions = np.zeros((self._batch_size, self._n_actions)) onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 - #onehot_actions_rand = np.zeros((self._batch_size, self._n_actions)) - #onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 + onehot_actions_rand = np.zeros((self._batch_size, self._n_actions)) + onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 states_val=list(states_val) next_states_val=list(next_states_val) Es_=self.encoder.predict(next_states_val) Es=self.encoder.predict(states_val) ETs=self.transition.predict([Es,onehot_actions]) + R=self.R.predict([Es,onehot_actions]) if(self.update_counter%100==0): print states_val[0][0] @@ -172,10 +174,13 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print len(states_val) print next_states_val[0][0] print actions_val, rewards_val, terminals_val - print "Es" - print Es - print "ETs,Es_" - print ETs,Es_ + print "Es,ETs,Es_" + if(Es.ndim==4): + print np.transpose(Es, (0, 3, 1, 2)),np.transpose(ETs, (0, 3, 1, 2)),np.transpose(Es_, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + else: + print Es,ETs,Es_ + print "R" + print R # Fit transition # for i in range(10): @@ -183,7 +188,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # print l # self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) - l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions], np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) self.loss_T+=l @@ -193,7 +198,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 # reduce the squared value of the abstract features - self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) # Increase the entropy in the abstract features of two states # This is done only when states_val is made up of only one observation --> FIXME @@ -208,13 +213,13 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # break # j=j+1 # rolled[i]=rolled[i+j-l] - self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - #self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) # Disentangle actions - #self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch([states_val[0],onehot_actions,onehot_actions_rand], np.ones(self._batch_size)) + 
self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch([states_val[0],onehot_actions,onehot_actions_rand], np.ones(self._batch_size)) # # # Loss to have all s' following s,a with a to a distance 1 of s,a) @@ -244,21 +249,21 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disambiguate2=0 print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) - print self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((32,self.learn_and_plan.internal_dim))) - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((32,self.learn_and_plan.internal_dim))) print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) if self.update_counter % self._freeze_interval == 0: self._resetQHat() - next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros((32,self.learn_and_plan.internal_dim))]) + next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros_like(Es)]) #np.zeros((32,self.learn_and_plan.internal_dim))]) if(self._double_Q==True): next_q_vals_current_qnet=self.full_Q.predict(next_states_val.tolist()) @@ -271,7 +276,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) - q_vals=self.full_Q.predict([states_val[0],np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) + q_vals=self.full_Q.predict([states_val[0],np.zeros_like(Es)]) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) @@ -285,7 +290,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # My loss should only take these into account. 
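The target computation in this hunk follows the usual Double-DQN backup: when _double_Q is set, the online network (full_Q) selects the argmax action on the next state while the frozen copy (next_full_Q) evaluates it, and terminal transitions bootstrap to zero. A compact NumPy sketch of that backup (array names are assumptions):

    import numpy as np

    def double_dqn_target(rewards, terminals, next_q_online, next_q_target, gamma=0.9):
        # online net picks the action, target net evaluates it
        argmax_online = np.argmax(next_q_online, axis=1)
        bootstrapped = next_q_target[np.arange(len(rewards)), argmax_online]
        return rewards + (1.0 - terminals) * gamma * bootstrapped

    rewards = np.array([0.0, 1.0])
    terminals = np.array([0.0, 1.0])                 # second transition ends the episode
    next_q_online = np.array([[0.2, 0.5], [0.1, 0.3]])
    next_q_target = np.array([[0.1, 0.4], [0.0, 0.2]])
    print(double_dqn_target(rewards, terminals, next_q_online, next_q_target))
    # expected: 0.36 for the first sample, 1.0 for the terminal one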
# Workaround here is that many values are already "exact" in this update #if (self.update_counter<10000): - noise_to_be_robust=np.random.normal(size=(self._batch_size,self.learn_and_plan.internal_dim))*0.#25 + noise_to_be_robust=np.zeros_like(Es) #np.random.normal(size=(self._batch_size,self.learn_and_plan.internal_dim))*0.#25 loss=0 loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) @@ -455,11 +460,11 @@ def _compile(self): #metrics=['accuracy']) self.diff_s_s_.compile(optimizer=optimizer6, - loss=loss_diff_s_s_)#exp_dec_error)#'mse') + loss=exp_dec_error)#'mse')#loss_diff_s_s_) #metrics=['accuracy']) -# self.diff_sa_sa.compile(optimizer=optimizer7, -# loss=loss_diff_s_s_) + self.diff_sa_sa.compile(optimizer=optimizer7, + loss=exp_dec_error)#loss_diff_s_s_) # self.diff_Tx.compile(optimizer=optimizer, # loss=mean_squared_error) @@ -480,7 +485,7 @@ def setLearningRate(self, lr): self._lr = lr print "modif lr" # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) - K.set_value(self.full_Q.optimizer.lr, self._lr) + K.set_value(self.full_Q.optimizer.lr, self._lr*2) K.set_value(self.full_R.optimizer.lr, self._lr) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr) @@ -490,6 +495,6 @@ def setLearningRate(self, lr): K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr) -# K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) + K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) diff --git a/examples/ALE/ALE_env.py b/examples/ALE/ALE_env.py index 782ad36d..2c063473 100644 --- a/examples/ALE/ALE_env.py +++ b/examples/ALE/ALE_env.py @@ -67,11 +67,9 @@ def reset(self, mode): def act(self, action): action = self._actions[action] - reward = 0 - for _ in range(self._frame_skip): - reward += self._ale.act(action) - if self.inTerminalState(): - break + reward = self._ale.act(action) + if self.inTerminalState(): + break self._ale.getScreenGrayscale(self._screen) cv2.resize(self._screen, (84, 84), self._reduced_screen, interpolation=cv2.INTER_NEAREST) diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 8b7e9bc9..64ed37d9 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -14,11 +14,14 @@ from mpl_toolkits.axes_grid1 import host_subplot import mpl_toolkits.axisartist as AA import matplotlib.pyplot as plt +import sys +sys.path.insert(0, 'all_frames') +import plot_all_frames class MyEnv(Environment): VALIDATION_MODE = 0 - def __init__(self, rng, game=None, frame_skip=2, + def __init__(self, rng, game=None, frame_skip=2, width=64, height=64, ple_options={"display_screen": True, "force_fps":True, "fps":15}): self._mode = -1 @@ -27,6 +30,8 @@ def __init__(self, rng, game=None, frame_skip=2, self._frame_skip = frame_skip if frame_skip >= 1 else 1 self._random_state = rng + self.width=width + self.height=height if game is None: raise ValueError("Game must be provided") @@ -37,7 +42,7 @@ def __init__(self, rng, game=None, frame_skip=2, w, h = self._ple.getScreenDims() self._screen = np.empty((h, w), dtype=np.uint8) - self._reduced_screen = np.empty((48, 48), dtype=np.uint8) + self._reduced_screen = np.empty((32, 32), dtype=np.uint8) self._actions = self._ple.getActionSet() @@ -49,7 +54,7 @@ def reset(self, mode): self._mode_episode_count = 0 # fix the seed for every new validation. 
It potentially removes one source of variance and # it allows to show some illustration of the learning for the same setting in validation - self._ple.game.rng = np.random.RandomState(23) # 23:left, center, right, ... + #self._ple.game.rng = np.random.RandomState(23) # 23:left, center, right, ... else: self._mode_episode_count += 1 elif self._mode != -1: # and thus mode == -1 @@ -60,9 +65,9 @@ def reset(self, mode): #for _ in range(self._ple.rng.randint(15)): # self._ple.act(self._ple.NOOP) self._screen = self._ple.getScreenGrayscale() - cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) + cv2.resize(self._screen, (32, 32), self._reduced_screen, interpolation=cv2.INTER_NEAREST) - return [1 * [48 * [48 * [0]]]] + return [1 * [32 * [32 * [0]]]] def act(self, action): @@ -80,17 +85,31 @@ def act(self, action): break self._screen = self._ple.getScreenGrayscale() - cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) + cv2.resize(self._screen, (32, 32), self._reduced_screen, interpolation=cv2.INTER_NEAREST) self._mode_score += self.reward return np.sign(self.reward) def summarizePerformance(self, test_data_set, learning_algo): + all_possib_inp=np.expand_dims(np.array(plot_all_frames.get_all_possib_inp(self.width,self.height)),axis=1)/256. + #print "all_possib_inp[0]" + print "all_possib_inp.shape" + print all_possib_inp.shape + #print all_possib_inp[0] + #print all_possib_inp[224] + #print all_possib_inp[225] + #print "all_possib_inp[449]" + #print all_possib_inp[449] + #print all_possib_inp[450] + all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) + print "np.array(all_possib_abs_states).shape" + print np.array(all_possib_abs_states).shape + #print all_possib_abs_states[0,0] #print "test_data_set.observations.shape" #print test_data_set.observations()[0][0:1] n=14 historics=[] - for i,observ in enumerate(test_data_set.observations()[0][0:n]): + for i,observ in enumerate(test_data_set.observations()[0][1:n]): historics.append(np.expand_dims(observ,axis=0)) # for i,observ in enumerate(test_data_set.observations()[0][0:n+1]): # if(i0): # historics[i-1]=np.concatenate([historics[i-1],np.expand_dims(observ,axis=0)], axis=0) historics=np.array(historics) - #print historics + #print "historics[0]" + #print historics.shape + #print historics[0] abs_states=learning_algo.encoder.predict(historics) print abs_states - actions=test_data_set.actions()[0:n] #instead of 0:n because history of 2 time steps considered + actions=test_data_set.actions()[1:n] #instead of 0:n because history of 2 time steps considered print actions - print test_data_set.rewards()[0:n] + print test_data_set.rewards()[1:n] if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) @@ -166,6 +187,10 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken + self._nx_block=3 + length_block=15*8 + for i in range(self._nx_block): + line3 = ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.2) print np.tile(np.expand_dims(actions,axis=1),(1,3)) print np.tile(np.expand_dims(0.75-actions/4.,axis=1),(1,3)) line2 = ax.scatter(x, y ,z , 
c=np.tile(np.expand_dims(0.9-actions/3.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) @@ -224,109 +249,110 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') - - - # Plot the Q_vals - c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - #print "actions,C" - #print actions - #print c - #c=np.max(c,axis=1) - m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - m3=ax.scatter(x, y, z+zrange/10, c=c[:,2], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - - #plt.colorbar(m3) - ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.RdYlGn - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. - cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') - - - # fig_visuV - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] - y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] - z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - - c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - c=np.max(c,axis=1) - #print "c" - #print c - - m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) - fig.subplots_adjust(right=0.8) - ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. 
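The ColorbarBase comment repeated above can be condensed into a standalone, runnable sketch: a colorbar drawn in its own axes from just a colormap and a norm, with no scatter data attached, which is how the 'Estimated expected return' bars in these figures are produced. Axes position, colormap and label below are placeholders:

    import matplotlib
    matplotlib.use('Agg')  # headless backend, only for this sketch
    import matplotlib.pyplot as plt

    fig = plt.figure()
    fig.subplots_adjust(right=0.8)
    cax = fig.add_axes([0.85, 0.15, 0.025, 0.7])  # [left, bottom, width, height]
    cmap = matplotlib.cm.RdYlGn
    norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
    # ColorbarBase derives from ScalarMappable, so it has everything needed
    # for a standalone continuous colorbar with ticks and labels.
    cb = matplotlib.colorbar.ColorbarBase(cax, cmap=cmap, norm=norm, orientation='vertical')
    cb.set_label('Estimated expected return')
    plt.savefig('fig_colorbar_sketch.pdf')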
- cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - #plt.show() - plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') - - - # fig_visuR - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] - y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] - z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - - coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) - repeat3_coord=np.repeat(coords,3,axis=0) - identity_matrix = np.diag(np.ones(self.nActions())) - tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) - - c = learning_algo.R.predict([repeat3_coord,tile_identity_matrix]) - c=np.max(np.reshape(c,(125,3)),axis=1) - #print "c" - #print c - #mini=np.min(c) - #maxi=np.max(c) - - m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) - fig.subplots_adjust(right=0.8) - ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. - cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - #plt.show() - plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') +# # Plot the Q_vals +# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) +# #print "actions,C" +# #print actions +# #print c +# #c=np.max(c,axis=1) +# m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# m3=ax.scatter(x, y, z+zrange/10, c=c[:,2], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# +# #plt.colorbar(m3) +# ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.RdYlGn +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. 
+# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') +# +# +# # fig_visuV +# fig = plt.figure() +# ax = fig.add_subplot(111, projection='3d') +# +# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] +# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] +# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] +# +# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) +# c=np.max(c,axis=1) +# #print "c" +# #print c +# +# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) +# #plt.colorbar(m) +# fig.subplots_adjust(right=0.8) +# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.hot +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. +# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') +# +# +# # fig_visuR +# fig = plt.figure() +# ax = fig.add_subplot(111, projection='3d') +# +# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] +# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] +# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] +# +# coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) +# repeat3_coord=np.repeat(coords,3,axis=0) +# identity_matrix = np.diag(np.ones(self.nActions())) +# tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) +# +# c = learning_algo.R.predict([repeat3_coord,tile_identity_matrix]) +# c=np.max(np.reshape(c,(125,3)),axis=1) +# #print "c" +# #print c +# #mini=np.min(c) +# #maxi=np.max(c) +# +# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) +# #plt.colorbar(m) +# fig.subplots_adjust(right=0.8) +# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.hot +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. 
+# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') matplotlib.pyplot.close("all") # avoids memory leaks def inputDimensions(self): - return [(1, 48, 48)] + return [(1, 32, 32)] def observationType(self, subject): return np.float32 diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py index 46700242..ac150aea 100644 --- a/examples/PLE/run_PLE.py +++ b/examples/PLE/run_PLE.py @@ -38,7 +38,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0001 + LEARNING_RATE = 0.0002 LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 @@ -48,7 +48,7 @@ class Defaults: MOMENTUM = 0 CLIP_DELTA = 1.0 EPSILON_START = 1.0 - EPSILON_MIN = .8 + EPSILON_MIN = 1.0 EPSILON_DECAY = 10000 UPDATE_FREQUENCY = 1 REPLAY_MEMORY_SIZE = 1000000 @@ -60,7 +60,9 @@ class Defaults: if __name__ == "__main__": - game = Catcher(width=64, height=64) + width=64 + height=64 + game = Catcher(width=width, height=height) logging.basicConfig(level=logging.INFO) # --- Parse parameters --- @@ -71,7 +73,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = PLE_env(rng, game=game, frame_skip=parameters.frame_skip, + env = PLE_env(rng, game=game, frame_skip=parameters.frame_skip,width=width, height=height, ple_options={"display_screen": True, "force_fps":True, "fps":20}) # --- Instantiate qnetwork --- diff --git a/examples/simplest_test_PLI/run_test2.py b/examples/simplest_test_PLI/run_test2.py index c849a6ae..5075e14c 100644 --- a/examples/simplest_test_PLI/run_test2.py +++ b/examples/simplest_test_PLI/run_test2.py @@ -24,7 +24,7 @@ class Defaults: # ---------------------- STEPS_PER_EPOCH = 500 EPOCHS = 500 - STEPS_PER_TEST = 100 + STEPS_PER_TEST = 200 PERIOD_BTW_SUMMARY_PERFS = 1 # ---------------------- diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py index 56e6998c..6c543fe2 100644 --- a/examples/simplest_test_PLI/test_env2.py +++ b/examples/simplest_test_PLI/test_env2.py @@ -26,6 +26,12 @@ def __init__(self): self._actions = [0,1] self._height=15 self._width=7 #preferably an odd number so that it's symmetrical + self._nx_block=3 #number of different x positions of the falling blocks + if(self._nx_block==1): + self._x_block=self._width//2 + else: + rand=np.random.randint(self._nx_block) # random selection of the pos for falling block + self._x_block=rand*((self._width-1)//(self._nx_block-1)) # traduction in a number in [0,self._width] of rand def reset(self, mode): @@ -41,6 +47,11 @@ def reset(self, mode): self.y=self._height-1 self.x=self._width//2 + if(self._nx_block==1): + self._x_block=self._width//2 + else: + rand=np.random.randint(self._nx_block) # random selection of the pos for falling block + self._x_block=rand*((self._width-1)//(self._nx_block-1)) # traduction in a number in [0,self._width] of rand return np.array([[0,0,0,1,0,1,0]]) #[0,0,1]+[0,1,0] @@ -55,7 +66,7 @@ def act(self, action): self.y = self.y-1 - if(self.y==0 and self.x==self._width//2): + if(self.y==0 and self.x==self._x_block): self.reward = 1 elif(self.y==0): self.reward = -1 @@ -72,9 +83,20 @@ def summarizePerformance(self, test_data_set, learning_algo): possib_y = np.zeros((self._height-1, self._height)) possib_y[np.arange(self._height-1), 1+np.arange(self._height-1)] = 1 possib_x=np.diag(np.ones(self._width)) - 
rep_x=np.repeat(possib_x,self._height-1,axis=0) - rep_y=np.tile(possib_y,(self._width,1)) - all_possib_inp=np.expand_dims(np.concatenate((rep_y,rep_x),axis=1),axis=1) + rep_x=np.tile(np.repeat(possib_x,self._height-1,axis=0),(self._nx_block,1)) + rep_y=np.tile(np.tile(possib_y,(self._width,1)),(self._nx_block,1)) + if(self._nx_block==1): + possib_x_block=np.zeros((1,self._width)) + possib_x_block[0,self._width//2]=1 + else: + possib_x_block=[] + for i in range(self._nx_block): + one_hot_x_block=np.zeros((self._width)) + j=i*((self._width-1)//(self._nx_block-1)) + one_hot_x_block[j]=1 + possib_x_block.append(one_hot_x_block) + rep_x_block=np.repeat(np.array(possib_x_block),(self._height-1)*self._width,axis=0) + all_possib_inp=np.expand_dims(np.concatenate((rep_y,rep_x,rep_x_block),axis=1),axis=1) all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) print "learning_algo.encoder.predict(all_possib_inp)" print all_possib_abs_states @@ -158,7 +180,9 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken - line3 = ax.scatter(all_possib_abs_states[:,0], all_possib_abs_states[:,1] ,all_possib_abs_states[:,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.5) + length_block=(self._height-1)*self._width + for i in range(self._nx_block): + line3 = ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.2) line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] @@ -217,6 +241,7 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + #plt.show() plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') @@ -242,7 +267,6 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') cb1.set_label('Estimated expected return') - plt.show() plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') @@ -319,7 +343,7 @@ def summarizePerformance(self, test_data_set, learning_algo): matplotlib.pyplot.close("all") # avoids memory leaks def inputDimensions(self): - return [(1,self._height+self._width)] + return [(1,self._height+self._width+self._width)] def observationType(self, subject): return np.float32 @@ -332,7 +356,9 @@ def observe(self): one_hot_x[self.x]=1 one_hot_y=np.zeros(self._height) one_hot_y[self.y]=1 - return [np.array(list(one_hot_y)+list(one_hot_x))] + one_hot_x_block=np.zeros(self._width) + one_hot_x_block[self._x_block]=1 + return [np.array(list(one_hot_y)+list(one_hot_x)+list(one_hot_x_block))] def inTerminalState(self): if (self.y==0): From a53809fc95d0fe7f4af851e22d8b2d061cd6e3d9 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 7 Mar 2018 17:52:38 -0500 Subject: [PATCH 38/96] adding files --- deer/q_networks/NN_keras_lp_high_int_dim.py | 648 ++++++++++++++++++++ examples/simplest_test_PLI/run_test3.py | 191 ++++++ examples/simplest_test_PLI/run_test4.py | 195 ++++++ examples/simplest_test_PLI/test_env3.py | 377 ++++++++++++ 
examples/simplest_test_PLI/test_env4.py | 432 +++++++++++++ 5 files changed, 1843 insertions(+) create mode 100644 deer/q_networks/NN_keras_lp_high_int_dim.py create mode 100644 examples/simplest_test_PLI/run_test3.py create mode 100644 examples/simplest_test_PLI/run_test4.py create mode 100644 examples/simplest_test_PLI/test_env3.py create mode 100644 examples/simplest_test_PLI/test_env4.py diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py new file mode 100644 index 00000000..04997aa8 --- /dev/null +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -0,0 +1,648 @@ +""" +Neural network using Keras (called by q_net_keras) +.. Author: Vincent Francois-Lavet +""" + +import numpy as np +from keras import backend as K +from keras.models import Model +from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge, RepeatVector +from keras import regularizers +np.random.seed(102912) + +class NN(): + """ + Deep Q-learning network using Keras + + Parameters + ----------- + batch_size : int + Number of tuples taken into account for each iteration of gradient descent + input_dimensions : + n_actions : + random_state : numpy random number generator + action_as_input : Boolean + Whether the action is given as input or as output + high_int_dim : Boolean + Whether the abstract state should be high dimensional in the form of frames/vectors or whether it should be low-dimensional + """ + def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False, high_int_dim=True): + self._input_dimensions=input_dimensions + self._batch_size=batch_size + self._random_state=random_state + self._n_actions=n_actions + self._action_as_input=action_as_input + self._high_int_dim=high_int_dim + if(high_int_dim==True): + self.internal_dim=input_dimensions[0][-2]*input_dimensions[0][-1] # In the case where the observation is a frame (or an history of frames) + else: + self.internal_dim=3 + + def encoder_model(self): + """ + + Parameters + ----------- + s + + Returns + ------- + model with output x (= encoding of s) + + """ + layers=[] + outs_conv=[] + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + # - observation[i] is a FRAME + print "dim enc" + print dim + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + x=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' + #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + #x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + + if(self._high_int_dim==True): + x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) + out = x + else: + out = Flatten()(x) + + # - observation[i] is a VECTOR + elif len(dim) == 2: + if dim[0] > 3: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) #data_format='channels_last' + x = Conv2D(16, (2, 1), activation='relu', 
border_mode='valid')(reshaped) #Conv on the history + x = Conv2D(16, (2, 2), activation='relu', border_mode='valid')(x) #Conv on the history & features + + if(self._high_int_dim==True): + out = x + else: + out = Flatten()(x) + else: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + out = Flatten()(input) + + # - observation[i] is a SCALAR - + else: + if dim[0] > 3: + # this returns a tensor + input = Input(shape=(dim[0],)) + inputs.append(input) + reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) #data_format='channels_last' + x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(reshaped) #Conv on the history + x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(x) #Conv on the history + + if(self._high_int_dim==True): + out = x + else: + out = Flatten()(x) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + out=input + + outs_conv.append(out) + + if (self._high_int_dim==True): + if ( isinstance(self._n_actions,int)): + print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") + else: + input = Input(shape=(len(self._n_actions),)) + inputs.append(input) + outs_conv.append(input) + + if(self._high_int_dim==False): + if len(outs_conv)>1: + x = merge(outs_conv, mode='concat') + else: + x= outs_conv [0] + + # we stack a deep fully-connected network on top + x = Dense(200, activation='tanh')(x) + x = Dense(100, activation='tanh')(x) + x = Dense(50, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) + + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + + model = Model(inputs=inputs, outputs=x) + + return model + + def encoder_diff_model(self,encoder_model): + """ + + Parameters + ----------- + s + + Returns + ------- + model with output x (= encoding of s) + + """ + inputs=[] + + for j in range(2): + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + half = len(inputs)/2 + x1 = encoder_model(inputs[:half]) + x2 = encoder_model(inputs[half:]) + + if (self._high_int_dim==True): + x1=Flatten()(x1) + x2=Flatten()(x2) + x = Subtract()([x1,x2]) #([Tx,enc_x_]) + else: + x = Subtract()([Tx,enc_x_]) + x = Subtract()([x1,x2]) + model = Model(inputs=inputs, outputs=x) + + return model + + def transition_model(self): + """ + + Parameters + ----------- + x + a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + if(self._high_int_dim==True): + dim=self._input_dimensions[0] #FIXME + inputs = [ Input(shape=(dim[1],dim[2],dim[0])), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' + print inputs[0]._keras_shape + print inputs[1]._keras_shape + + layers_action=inputs[1] + layers_action=RepeatVector(dim[1]*dim[2])(layers_action)#K.repeat_elements(layers_action,rep=dim[1]*dim[2],axis=1) + layers_action=Reshape((self._n_actions,dim[1],dim[2]))(layers_action) + layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,dim[1],dim[2]))(layers_action) #data_format='channels_last' + + x = Concatenate(axis=-1)([layers_action,inputs[0]]) + + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), 
strides=None, padding='same')(x) + x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) + x = Add()([inputs[0],x]) + else: + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] # x + + x = Concatenate()(inputs)#,axis=-1) + x = Dense(10, activation='tanh')(x) #5,15 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(10, activation='tanh')(x) # ,30 + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Add()([inputs[0],x]) + + model = Model(inputs=inputs, outputs=x) + + return model + + def transition_model2(self): + """ + + Parameters + ----------- + x + a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x + + x = Concatenate()(inputs)#,axis=-1) + x = Dense(10, activation='tanh')(x) + x = BatchNormalization()(x) + x = Dense(50, activation='tanh')(x) + x = BatchNormalization()(x) + x = Dense(10, activation='tanh')(x) + x = BatchNormalization()(x) + x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Add()([inputs[0],x]) + + model = Model(inputs=inputs, outputs=x) + + return model + + def diff_Tx_x_(self,encoder_model,transition_model): + """ + + Parameters + ----------- + s + a + s' + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs=[] + for j in range(2): + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + half = len(inputs)/2 + enc_x = encoder_model(inputs[:half]) #s --> x + enc_x_ = encoder_model(inputs[half:]) #s --> x + + input = Input(shape=(self._n_actions,)) + inputs.append(input) + + Tx= transition_model([enc_x,inputs[-1]]) + print "Tx._keras_shape" + print Tx._keras_shape + print enc_x_._keras_shape + + if (self._high_int_dim==True): + #Tx=Flatten()(Tx) + #enc_x_=Flatten()(enc_x_) + x = Subtract()([Tx,enc_x_]) #([Tx,enc_x_]) + else: + x = Subtract()([Tx,enc_x_]) + + model = Model(inputs=inputs, outputs=x ) + + return model + + def diff_s_s_(self,encoder_model): + """ + + Parameters + ----------- + s + a + random z + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs=[] + + for j in range(2): + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + half = len(inputs)/2 + enc_x = encoder_model(inputs[:half]) #s --> x #FIXME + enc_x_ = encoder_model(inputs[half:]) #s --> x + + if (self._high_int_dim==True): + x = Subtract()([Flatten()(enc_x),Flatten()(enc_x_)]) + else: + x = Subtract()([enc_x,enc_x_]) + x = Dot(axes=-1, normalize=False)([x,x]) + + model = Model(inputs=inputs, outputs=x ) + + return model + + def diff_sa_sa(self,encoder_model,transition_model): + """ + + Parameters + ----------- + s + a + rand_a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + 
inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + input = Input(shape=(self._n_actions,)) + inputs.append(input) + input = Input(shape=(self._n_actions,)) + inputs.append(input) + + enc_x = encoder_model(inputs[:-2]) #s --> x + Tx= transition_model([enc_x,inputs[-2]]) + rand_Tx= transition_model([enc_x,inputs[-1]]) + + if (self._high_int_dim==True): + Tx=Flatten()(Tx) + rand_Tx=Flatten()(rand_Tx) + x = Subtract()([Tx,rand_Tx]) + else: + x = Subtract()([Tx,rand_Tx]) + print "x._keras_shape" + print x._keras_shape + x = Dot(axes=-1, normalize=False)([x,x]) + print "x._keras_shape" + print x._keras_shape + + model = Model(inputs=inputs, outputs=x ) + + return model + + def diff_Tx(self,transition_model): + """ + + Parameters + ----------- + x + a + x + a + + Returns + ------- + model with output Tx (= model estimate of x') + + """ + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ), Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) )] #x,a,x,a + + #identity_mat=inputs[2]#K.constant(np.diag(np.ones(self._n_actions)), name="identity_mat") + Tx = transition_model(inputs[:2]) + Tx2 = transition_model(inputs[2:]) + + #tile_x=K.tile(inputs[0],(self._n_actions,1)) + #Tx_ = transition_model([tile_x]+[identity_mat]) + + x = Subtract()([Tx,Tx2]) + x = Dot(axes=-1, normalize=False)([x,x]) + + model = Model(inputs=inputs, outputs=x ) + + return model + + def R_model(self): + """ + Build a network consistent with each type of inputs + + Parameters + ----------- + x + a + + Returns + ------- + r + """ + + if(self._high_int_dim==True): + dim=self._input_dimensions[0] #FIXME + inputs = [ Input(shape=(dim[1],dim[2],dim[0])), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' + + layers_action=inputs[1] + layers_action=RepeatVector(dim[1]*dim[2])(layers_action) + layers_action=Reshape((self._n_actions,dim[1],dim[2]))(layers_action) + layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,dim[1],dim[2]))(layers_action) #data_format='channels_last' + + x = Concatenate(axis=-1)([layers_action,inputs[0]]) + x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) + x = Flatten()(x) + else: + inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x + x = Concatenate()(inputs)#,axis=-1) + x = Dense(10, activation='tanh')(x) + + x = Dense(20, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) + + out = Dense(1)(x) + + model = Model(inputs=inputs, outputs=out) + + return model + + def full_R_model(self,encoder_model,R_model): + """ + Maps internal state to immediate rewards + + Parameters + ----------- + s + a + (noise in abstract state space) : FIXME + + Returns + ------- + r + """ + + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + input = Input(shape=(self._n_actions,)) + inputs.append(input) + + enc_x = encoder_model(inputs[:-1]) #s --> x + + out = R_model([enc_x]+inputs[-1:]) + + model = Model(inputs=inputs, outputs=out) + + return model + + def Q_model(self): + 
if(self._high_int_dim==True): + inputs=[] + outs_conv=[] + for i, dim in enumerate(self._input_dimensions): + # - observation[i] is a FRAME + print "dim Q mod" + print dim + if len(dim) == 3: + input = Input(shape=(dim[1],dim[2],dim[0])) #data_format is already 'channels_last' + inputs.append(input) + #reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) + x = input #data_format is already 'channels_last' + print x._keras_shape + + x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) + x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(1, (2, 2), padding='same', activation='tanh')(x) + out = (x) + else: + print ("FIXME") + + outs_conv.append(out) + + if (self._action_as_input==True): + if ( isinstance(self._n_actions,int)): + print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") + else: + input = Input(shape=(len(self._n_actions),)) + inputs.append(input) + outs_conv.append(input) + + if len(outs_conv)>1: + x = merge(outs_conv, mode='concat') + else: + x= outs_conv [0] + + # we stack a deep fully-connected network on top + x = Flatten()(x) + x = Dense(200, activation='tanh')(x) + else: + inputs = [ Input( shape=(self.internal_dim,) ) ] #x + x = Dense(20, activation='tanh')(inputs[0]) + + + #if (self._action_as_input==True): + # if ( isinstance(self._n_actions,int)): + # print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") + # else: + # input = Input(shape=(len(self._n_actions),)) + # inputs.append(input) + + #x = Add()([x,inputs[-1]]) #???? + + # we stack a deep fully-connected network on top + x = Dense(50, activation='tanh')(x) + x = Dense(20, activation='tanh')(x) + + #if (self._action_as_input==False): + # if ( isinstance(self._n_actions,int)): + out = Dense(self._n_actions)(x) + # else: + # out = Dense(len(self._n_actions))(x) + #else: + # out = Dense(1)(x) + + model = Model(inputs=inputs, outputs=out) + + return model + + + def full_Q_model(self, encoder_model, Q_model): + """ + Build a network consistent with each type of inputs + + Parameters + ----------- + s + noise in abstract state space + + Returns + ------- + model with output Tx (= model estimate of x') + """ + inputs=[] + + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) + + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + out = encoder_model(inputs) + print out._keras_shape + + if(self._high_int_dim==True): + input = Input(shape=(dim[1],dim[2],dim[0])) + inputs.append(input) + else: + input = Input(shape=(self.internal_dim,)) + inputs.append(input) + + x=Add()([out,inputs[-1]]) # adding noise in the abstract state space + + out = Q_model(out) + + model = Model(inputs=inputs, outputs=out) + + return model + +if __name__ == '__main__': + pass + \ No newline at end of file diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_test3.py new file mode 100644 index 00000000..41538620 --- /dev/null +++ b/examples/simplest_test_PLI/run_test3.py @@ -0,0 +1,191 @@ +"""ALE launcher. See Wiki for more details about this experiment. 
+ +Authors: Vincent Francois-Lavet, David Taralla +""" + +import sys +import logging +import numpy as np +from joblib import hash, dump +import os + +from deer.default_parser import process_args +from deer.agent import NeuralAgent +from deer.q_networks.q_net_keras_lp import MyQNetwork +from test_env3 import MyEnv as test_env +import deer.experiment.base_controllers as bc + +from deer.policies import EpsilonGreedyPolicy + + +class Defaults: + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 5000 + EPOCHS = 500 + STEPS_PER_TEST = 500 + PERIOD_BTW_SUMMARY_PERFS = 1 + + # ---------------------- + # Environment Parameters + # ---------------------- + FRAME_SKIP = 2 + + # ---------------------- + # DQN Agent parameters: + # ---------------------- + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.0005 + LEARNING_RATE_DECAY = 0.98 + DISCOUNT = 0.9 + DISCOUNT_INC = 1 + DISCOUNT_MAX = 0.99 + RMS_DECAY = 0.9 + RMS_EPSILON = 0.0001 + MOMENTUM = 0 + CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = .3 + EPSILON_DECAY = 10000 + UPDATE_FREQUENCY = 1 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + FREEZE_INTERVAL = 1000 + DETERMINISTIC = False + + + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # --- Parse parameters --- + parameters = process_args(sys.argv[1:], Defaults) + if parameters.deterministic: + rng = np.random.RandomState(123456) + else: + rng = np.random.RandomState() + + # --- Instantiate environment --- + env = test_env() + + # --- Instantiate qnetwork --- + qnetwork = MyQNetwork( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_delta, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng) + + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) + + # --- Instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # --- Create unique filename for FindBestController --- + h = hash(vars(parameters), hash_name="sha1") + fname = "test_" + h + print("The parameters hash is: {}".format(h)) + print("The parameters are: {}".format(parameters)) + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 
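With EPSILON_START=1.0, EPSILON_MIN=.3 and EPSILON_DECAY=10000, the exploration rate is annealed over roughly the first 10000 training actions by the EpsilonController attached below. A rough sketch of that schedule, assuming a linear per-action decay (the exact rule is the one implemented by deer's EpsilonController; this is only an approximation for intuition):

    def epsilon_at(step, e_start=1.0, e_min=0.3, e_decays=10000):
        # Linear anneal from e_start to e_min over e_decays actions, then constant.
        frac = min(float(step) / e_decays, 1.0)
        return e_start - (e_start - e_min) * frac

    for step in [0, 2500, 5000, 10000, 50000]:
        print("step %5d -> epsilon %.2f" % (step, epsilon_at(step)))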
+ agent.attach(bc.LearningRateController( + initial_learning_rate=parameters.learning_rate, + learning_rate_decay=parameters.learning_rate_decay, + periodicity=1)) + + # Same for the discount factor. + agent.attach(bc.DiscountFactorController( + initial_discount_factor=parameters.discount, + discount_factor_growth=parameters.discount_inc, + discount_factor_max=parameters.discount_max, + periodicity=1)) + + # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy + # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more + # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every + # episode or epoch (or never, hence the resetEvery='none'). + agent.attach(bc.EpsilonController( + initial_e=parameters.epsilon_start, + e_decays=parameters.epsilon_decay, + e_min=parameters.epsilon_min, + evaluate_on='action', + periodicity=1, + reset_every='none')) + + # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one + # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the + # "true generalization score", or "test score". + # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is + # important that the validationID is the same than the id argument of the InterleavedTestEpochController. + # The FindBestController will dump on disk the validation scores for each and every network, as well as the + # structure of the neural network having the best validation score. These dumps can then used to plot the evolution + # of the validation and test scores (see below) or simply recover the resulting neural network for your + # application. + agent.attach(bc.FindBestController( + validationID=test_env.VALIDATION_MODE, + testID=None, + unique_fname=fname)) + + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a + # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want + # these validation epoch to interfere with the training of the agent, which is well established by the + # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole + # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the + # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards + # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every + # [parameters.period_btw_summary_perfs] *validation* epochs. 
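The LearningRateController and DiscountFactorController comments above describe per-epoch updates of the two hyperparameters. A small sketch of one plausible reading of those parameters (geometric decay of the learning rate, geometric growth of the discount factor capped at DISCOUNT_MAX); the exact update rules are those of the deer controllers, and note that with DISCOUNT_INC=1 the discount simply stays at 0.9:

    def lr_at_epoch(epoch, lr0=0.0005, decay=0.98):
        # learning rate multiplied by `decay` at the end of every training epoch
        return lr0 * decay ** epoch

    def discount_at_epoch(epoch, d0=0.9, growth=1.0, d_max=0.99):
        # discount factor grown geometrically but never beyond d_max
        return min(d0 * growth ** epoch, d_max)

    for epoch in [0, 50, 200, 499]:
        print("epoch %3d: lr=%.6f discount=%.3f"
              % (epoch, lr_at_epoch(epoch), discount_at_epoch(epoch)))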
+ agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.run(parameters.epochs, parameters.steps_per_epoch) + + # --- Show results --- + basename = "scores/" + fname + scores = joblib.load(basename + "_scores.jldump") + plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') + plt.legend() + plt.xlabel("Number of epochs") + plt.ylabel("Score") + plt.savefig(basename + "_scores.pdf") + plt.show() diff --git a/examples/simplest_test_PLI/run_test4.py b/examples/simplest_test_PLI/run_test4.py new file mode 100644 index 00000000..d4c0d0d6 --- /dev/null +++ b/examples/simplest_test_PLI/run_test4.py @@ -0,0 +1,195 @@ +"""ALE launcher. See Wiki for more details about this experiment. + +Authors: Vincent Francois-Lavet, David Taralla +""" + +import sys +import logging +import numpy as np +from joblib import hash, dump +import os + +from deer.default_parser import process_args +from deer.agent import NeuralAgent +from deer.q_networks.q_net_keras_lp import MyQNetwork +from test_env4 import MyEnv as test_env +import deer.experiment.base_controllers as bc + +from deer.policies import EpsilonGreedyPolicy + + +class Defaults: + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 5000 + EPOCHS = 50 + STEPS_PER_TEST = 500 + PERIOD_BTW_SUMMARY_PERFS = 1 + + # ---------------------- + # Environment Parameters + # ---------------------- + FRAME_SKIP = 2 + + # ---------------------- + # DQN Agent parameters: + # ---------------------- + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.0002 + LEARNING_RATE_DECAY = 0.98 + DISCOUNT = 0.9 + DISCOUNT_INC = 1 + DISCOUNT_MAX = 0.99 + RMS_DECAY = 0.9 + RMS_EPSILON = 0.0001 + MOMENTUM = 0 + CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = 1.0 + EPSILON_DECAY = 10000 + UPDATE_FREQUENCY = 1 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + FREEZE_INTERVAL = 1000 + DETERMINISTIC = False + + + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # --- Parse parameters --- + parameters = process_args(sys.argv[1:], Defaults) + if parameters.deterministic: + rng = np.random.RandomState(123456) + else: + rng = np.random.RandomState() + + # --- Instantiate environment --- + env = test_env() + + # --- Instantiate qnetwork --- + qnetwork = MyQNetwork( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_delta, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng) + + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) 
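run_test4.py sets test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.), i.e. epsilon=1, so every action taken with this policy is uniformly random and the agent only gathers data. A generic sketch of the epsilon-greedy rule itself (not deer's exact implementation; the Q-values below are placeholders):

    import numpy as np

    def epsilon_greedy(q_values, epsilon, rng):
        # With probability epsilon pick a uniformly random action,
        # otherwise pick the action with the highest estimated Q-value.
        if rng.rand() < epsilon:
            return rng.randint(len(q_values))
        return int(np.argmax(q_values))

    rng = np.random.RandomState(0)
    q = np.array([0.1, -0.4, 0.7, 0.2])
    print(epsilon_greedy(q, epsilon=1.0, rng=rng))   # always random: pure data gathering
    print(epsilon_greedy(q, epsilon=0.05, rng=rng))  # mostly greedy: usually action 2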
+ + # --- Instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # --- Create unique filename for FindBestController --- + h = hash(vars(parameters), hash_name="sha1") + fname = "test_" + h + print("The parameters hash is: {}".format(h)) + print("The parameters are: {}".format(parameters)) + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. + agent.attach(bc.LearningRateController( + initial_learning_rate=parameters.learning_rate, + learning_rate_decay=parameters.learning_rate_decay, + periodicity=1)) + + # Same for the discount factor. + agent.attach(bc.DiscountFactorController( + initial_discount_factor=parameters.discount, + discount_factor_growth=parameters.discount_inc, + discount_factor_max=parameters.discount_max, + periodicity=1)) + + # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy + # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more + # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every + # episode or epoch (or never, hence the resetEvery='none'). + agent.attach(bc.EpsilonController( + initial_e=parameters.epsilon_start, + e_decays=parameters.epsilon_decay, + e_min=parameters.epsilon_min, + evaluate_on='action', + periodicity=1, + reset_every='none')) + + agent.run(10, 200) + print("end gathering data") + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one + # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the + # "true generalization score", or "test score". + # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is + # important that the validationID is the same than the id argument of the InterleavedTestEpochController. + # The FindBestController will dump on disk the validation scores for each and every network, as well as the + # structure of the neural network having the best validation score. These dumps can then used to plot the evolution + # of the validation and test scores (see below) or simply recover the resulting neural network for your + # application. 
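Because the TrainerController is only attached after agent.run(10, 200), the first ten epochs of 200 steps are collected with the fully random policy and stored in the replay memory before any gradient step. A toy stand-in for that fill-then-sample pattern (deer's actual DataSet class is richer; the class below is only a sketch):

    import numpy as np
    from collections import deque

    class ToyReplayBuffer(object):
        def __init__(self, capacity, rng):
            self._buf = deque(maxlen=capacity)
            self._rng = rng

        def add(self, obs, action, reward, next_obs, terminal):
            self._buf.append((obs, action, reward, next_obs, terminal))

        def sample(self, batch_size):
            idx = self._rng.randint(len(self._buf), size=batch_size)
            return [self._buf[i] for i in idx]

    rng = np.random.RandomState(0)
    buf = ToyReplayBuffer(capacity=1000000, rng=rng)
    for t in range(10 * 200):   # mimic the 10 epochs x 200 random steps gathered first
        buf.add(obs=None, action=rng.randint(4), reward=0.0, next_obs=None, terminal=False)
    print(len(buf._buf))        # 2000 transitions before training starts
    print(len(buf.sample(32)))  # one mini-batch of BATCH_SIZE=32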
+ agent.attach(bc.FindBestController( + validationID=test_env.VALIDATION_MODE, + testID=None, + unique_fname=fname)) + + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a + # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want + # these validation epoch to interfere with the training of the agent, which is well established by the + # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole + # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the + # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards + # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every + # [parameters.period_btw_summary_perfs] *validation* epochs. + agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.gathering_data=False + agent.run(parameters.epochs, parameters.steps_per_epoch) + + # --- Show results --- + basename = "scores/" + fname + scores = joblib.load(basename + "_scores.jldump") + plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') + plt.legend() + plt.xlabel("Number of epochs") + plt.ylabel("Score") + plt.savefig(basename + "_scores.pdf") + plt.show() diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/test_env3.py new file mode 100644 index 00000000..c4da5122 --- /dev/null +++ b/examples/simplest_test_PLI/test_env3.py @@ -0,0 +1,377 @@ +""" Interface with the test environment + +Authors: Vincent Francois-Lavet +""" +import numpy as np +import cv2 + +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt +import copy + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self): + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + + self._actions = [0,1] + self._height=15 + self._width=7 #preferably an odd number so that it's symmetrical + self._width_paddle=1 + self._nx_block=3 #number of different x positions of the falling blocks + if(self._nx_block==1): + self._x_block=self._width//2 + else: + rand=np.random.randint(self._nx_block) # random selection of the pos for falling block + self._x_block=rand*((self._width-1)//(self._nx_block-1)) # traduction in a number in [0,self._width] of rand + + + def reset(self, mode): + if mode == MyEnv.VALIDATION_MODE: + if self._mode != MyEnv.VALIDATION_MODE: + self._mode = MyEnv.VALIDATION_MODE + self._mode_score = 0.0 + self._mode_episode_count = 0 + np.random.seed(seed=11) #Seed the generator so that the sequence of falling blocks is the same in validation + else: + self._mode_episode_count += 1 + elif self._mode != -1: # and thus mode == -1 + self._mode = -1 + + self.y=self._height-1 + self.x=np.random.randint(self._width-self._width_paddle+1) #self._width//2 + if(self._nx_block==1): + self._x_block=self._width//2 + else: + rand=np.random.randint(self._nx_block) # random selection of 
the pos for falling block + self._x_block=rand*((self._width-1)//(self._nx_block-1)) # traduction in a number in [0,self._width] of rand + + return [1 * [self._height * [self._width * [0]]]] #[0,0,1]+[0,1,0] + + + def act(self, action): + action = self._actions[action] + + if(action==0): + self.x = max(self.x-1,0) + if(action==1): + self.x = min(self.x+1,self._width-self._width_paddle) + + self.y = self.y-1 + + if(self.y==0 and self.x>self._x_block-self._width_paddle and self.x<=self._x_block): + self.reward = 1 + elif(self.y==0): + self.reward = -1 + else: + self.reward = 0 + + self._mode_score += self.reward + return self.reward + + def summarizePerformance(self, test_data_set, learning_algo): + #print "test_data_set.observations.shape" + #print test_data_set.observations()[0][0:1] + + all_possib_inp=[] + for x_b in range(self._nx_block):#[1]:#range(self._nx_block): + for y_b in range(self._height): + for x_p in range(self._width-self._width_paddle+1): + state=np.zeros((self._height,self._width)) + state[y_b,x_b*((self._width-1)//(self._nx_block-1))]=0.5 + state[0,x_p-self._width_paddle+1:x_p+1]=1. + all_possib_inp.append(state) + + all_possib_inp=np.expand_dims(all_possib_inp,axis=1) + print "all_possib_inp" + print all_possib_inp[0] + print all_possib_inp[self._height*(self._width-self._width_paddle+1)-1] + print all_possib_inp[self._height*(self._width-self._width_paddle+1)] + print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)-1] + print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)] + print all_possib_inp[3*self._height*(self._width-self._width_paddle+1)-1] + print "all_possib_inp.shape" + print all_possib_inp.shape + #print all_possib_inp[self._height*self._width] + #print "all_possib_inp[2*self._height*self._width]" + #print all_possib_inp[2*self._height*self._width] + #print all_possib_inp[2*self._height*self._width-1] + all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) + print "learning_algo.encoder.predict(all_possib_inp)" + print all_possib_abs_states + + print "print test_data_set.observations()" + print test_data_set.observations() + n=self._height-1 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) + historics=np.array(historics) + print "historics" + print historics + abs_states=learning_algo.encoder.predict(historics) + print "abs_states" + print abs_states + actions=test_data_set.actions()[0:n] + print "actions" + print actions + + print actions + print "test_data_set.rewards()[0:n]" + print test_data_set.rewards()[0:n] + print "test_data_set.terminals()[0:n]" + print test_data_set.terminals()[0:n] + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + + + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.cm as cm + m = cm.ScalarMappable(cmap=cm.jet) + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + z = np.array(abs_states)[:,2] + + #Colors + #onehot_actions = np.zeros((n, 4)) + #onehot_actions[np.arange(n), actions] = 1 + + fig = plt.figure() + ax = fig.add_subplot(111,projection='3d') + for j in range(3): + # Plot the trajectory + for i in xrange(n-1): + ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) + + # Plot the estimated transitions + for i in 
range(n-1): + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) + +# for xx in np.arange(self._width)-self._width//2: +# for yy in np.arange(self._width)-self._width//2: +# for zz in np.arange(self._width)-self._width//2: +# predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) +# predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) +# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) + + + # Plot the colorbar for the trajectory + fig.subplots_adjust(right=0.7) + ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) + # Set the colormap and norm to correspond to the data for which the colorbar will be used. + cmap = matplotlib.cm.cool + norm = matplotlib.colors.Normalize(vmin=0, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has + # everything needed for a standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks and labels. 
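The transition estimates above are obtained by feeding the current abstract state together with a one-hot action vector ([[1,0]] or [[0,1]] for the two Catcher actions) into learning_algo.transition. A numpy sketch of how the same call can be batched over all actions at once; the linear stand-in below replaces the Keras model purely so the snippet runs on its own:

    import numpy as np

    n_actions, internal_dim = 2, 3
    abs_state = np.array([[0.2, -0.1, 0.5]])                    # one abstract state, shape (1, 3)

    one_hot_actions = np.eye(n_actions)                         # [[1,0],[0,1]], one row per action
    states_repeated = np.repeat(abs_state, n_actions, axis=0)   # shape (2, 3)

    # Placeholder transition x' = x + W.a; the real call is learning_algo.transition.predict(...)
    W = np.zeros((n_actions, internal_dim))
    W[0, 0], W[1, 0] = -0.1, 0.1
    predicted_next = states_repeated + one_hot_actions.dot(W)
    print(predicted_next.shape)   # (2, 3): one predicted next abstract state per action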
+ cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, + norm=norm, + orientation='vertical') + cb1.set_label('Beginning to end of trajectory') + + + # Plot the dots at each time step depending on the action taken + length_block=self._height*(self._width-self._width_paddle+1) + for i in range(self._nx_block): + line3 = ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.2) + line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + zrange=axes_lims[2][1]-axes_lims[2][0] + + # Plot the legend for the dots + from matplotlib.patches import Circle, Rectangle + from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker + box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) + + box2 = DrawingArea(60, 20, 0, 0) + el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k", alpha=0.75) + el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k", alpha=0.75) + #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") + box2.add_artist(el1) + box2.add_artist(el2) + #box2.add_artist(el3) + + + box = HPacker(children=[box1, box2], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=box, pad=0., + frameon=True, + bbox_to_anchor=(0., 1.07), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + + # Plot the legend for transition estimates + box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) + box2b = DrawingArea(60, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.25", alpha=0.75) + box2b.add_artist(el1b) + box2b.add_artist(el2b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + + + ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + + + # Plot the Q_vals + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + #print "actions,C" + #print actions + #print c + #c=np.max(c,axis=1) + m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) + + #plt.colorbar(m3) + ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.RdYlGn + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. 
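The slices of size length_block used when scattering all_possib_abs_states follow directly from how every possible Catcher frame is enumerated in summarizePerformance: for each of the _nx_block block columns, every (block height, paddle position) pair yields one frame. A condensed, self-contained version of that enumeration with the default sizes (height=15, width=7, width_paddle=1, nx_block=3):

    import numpy as np

    height, width, width_paddle, nx_block = 15, 7, 1, 3

    all_possib_inp = []
    for x_b in range(nx_block):                          # block column: x = 0, 3 or 6
        for y_b in range(height):                        # block height
            for x_p in range(width - width_paddle + 1):  # paddle position
                frame = np.zeros((height, width))
                frame[y_b, x_b * ((width - 1) // (nx_block - 1))] = 0.5
                frame[0, x_p - width_paddle + 1:x_p + 1] = 1.
                all_possib_inp.append(frame)

    all_possib_inp = np.expand_dims(all_possib_inp, axis=1)
    length_block = height * (width - width_paddle + 1)
    print(all_possib_inp.shape)   # (315, 1, 15, 7): nx_block * length_block frames
    print(length_block)           # 105 frames per block column, hence the slicing above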
+ cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + plt.show() + plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') + + + # fig_visuV + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + + c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) + c=np.max(c,axis=1) + #print "c" + #print c + + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. + cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') + + + # fig_visuR + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] + y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] + z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] + + coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) + repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) + identity_matrix = np.diag(np.ones(self.nActions())) + tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) + + c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) + c=np.max(np.reshape(c,(125,self.nActions())),axis=1) + #print "c" + #print c + #mini=np.min(c) + #maxi=np.max(c) + + m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) + #plt.colorbar(m) + fig.subplots_adjust(right=0.8) + ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) + cmap = matplotlib.cm.hot + norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) + + # ColorbarBase derives from ScalarMappable and puts a colorbar + # in a specified axes, so it has everything needed for a + # standalone colorbar. There are many more kwargs, but the + # following gives a basic continuous colorbar with ticks + # and labels. 
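+        # (Compatibility note: on recent Matplotlib releases ColorbarBase is deprecated; an
+        # assumed equivalent is fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), cax=ax2).
+        # The call below targets the Matplotlib API available when this example was written.)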
+ cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') + cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + + matplotlib.pyplot.close("all") # avoids memory leaks + + def inputDimensions(self): + return [(1,self._height,self._width)] + + def observationType(self, subject): + return np.float32 + + def nActions(self): + return len(self._actions) + + def observe(self): + obs=np.zeros((self._height,self._width)) + obs[self.y,self._x_block]=0.5 + obs[0,self.x-self._width_paddle+1:self.x+1]=1 + return [obs] + + def inTerminalState(self): + if (self.y==0): + return True + else: + return False + + + +if __name__ == "__main__": + pass diff --git a/examples/simplest_test_PLI/test_env4.py b/examples/simplest_test_PLI/test_env4.py new file mode 100644 index 00000000..d5618c22 --- /dev/null +++ b/examples/simplest_test_PLI/test_env4.py @@ -0,0 +1,432 @@ +""" Interface with the test environment + +Authors: Vincent Francois-Lavet + +def encoder_model(self): + +def transition_model(self): + x = Dense(10, activation='tanh')(x) #5,15 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(10, activation='tanh')(x) # ,30 + +""" +import numpy as np +import cv2 + +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt +import copy + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self): + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + self._actions = [0,1,2,3] + self._height=7 + self._width=9 + self.create_map() + self.intern_dim=3 + + def create_map(self): + self._map=np.zeros((self._height,self._width)) + self._map[-1,:]=1 + self._map[0,:]=1 + self._map[:,0]=1 + self._map[:,-1]=1 + self._map[:,self._width//2]=1 + #self._map[:,self._width//3]=1 + #self._map[-2,self._width//3]=0 + #self._map[:,2*self._width//3]=1 + #self._map[2,2*self._width//3]=0 + self._pos_agent=[2,2] + #self._pos_goal=[2,6] + #self._map[3,6]=0.66 + + + def reset(self, mode): + self.create_map() + + if mode == -1: + i=np.random.randint(2) + if(i==0): + self._map[self._height//2-1,self._width//2]=0 + if(i==1): + self._map[self._height//2+1,self._width//2]=0 + else: + self._map[self._height//2+1,self._width//2]=0 + + if mode == MyEnv.VALIDATION_MODE: + if self._mode != MyEnv.VALIDATION_MODE: + self._mode = MyEnv.VALIDATION_MODE + self._mode_score = 0.0 + self._mode_episode_count = 0 + + else: + self._mode_episode_count += 1 + elif self._mode != -1: + self._mode = -1 + + self._pos_agent=[2,2] + print "reset mode" + print mode + print "self._map" + print self._map + + return [1 * [self._height * [self._width * [0]]]] + + + def act(self, action): + action = self._actions[action] + + if(action==0): + if(self._map[self._pos_agent[0]-1,self._pos_agent[1]]==0): + self._pos_agent[0]=self._pos_agent[0]-1 + elif(action==1): + if(self._map[self._pos_agent[0]+1,self._pos_agent[1]]==0): + self._pos_agent[0]=self._pos_agent[0]+1 + elif(action==2): + if(self._map[self._pos_agent[0],self._pos_agent[1]-1]==0): + self._pos_agent[1]=self._pos_agent[1]-1 + elif(action==3): + if(self._map[self._pos_agent[0],self._pos_agent[1]+1]==0): + self._pos_agent[1]=self._pos_agent[1]+1 + + self.reward = 0 + #if (self._pos_agent==self._pos_goal): + # self.reward = 1 + + self._mode_score += self.reward + return self.reward 
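+    # Usage sketch for this environment's interface (illustrative only; `rng` is an
+    # assumed numpy.random.RandomState, not defined in this file):
+    #
+    #     env = MyEnv()
+    #     env.reset(mode=-1)
+    #     for _ in range(20):
+    #         obs = env.observe()[0]                    # current map, agent marked as 0.5
+    #         r = env.act(rng.randint(env.nActions()))  # apply one of the 4 actions, get the reward
+    #         if env.inTerminalState():
+    #             break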
+ + def summarizePerformance(self, test_data_set, learning_algo): + #print "test_data_set.observations.shape" + #print test_data_set.observations()[0][0:1] + + for i in range(3): + all_possib_inp=[] + self.create_map() + for x_a in range(self._width): + for y_a in range(self._height): + state=copy.deepcopy(self._map) + state[self._height//2+i-1,self._width//2]=0 + + if(state[y_a,x_a]==0): + state[y_a,x_a]=0.5 + all_possib_inp.append(state) + + all_possib_inp=np.expand_dims(all_possib_inp,axis=1) + print "all_possib_inp[0:10]" + print all_possib_inp[0:10] + print "all_possib_inp.shape" + print all_possib_inp.shape + all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) + if(all_possib_abs_states.ndim==4): + all_possib_abs_states=np.transpose(all_possib_abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + print "learning_algo.encoder.predict(all_possib_inp)[0:10]" + print all_possib_abs_states[0:10] + + print "print test_data_set.observations()[0:10]" + print test_data_set.observations()[0:10] + n=500 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) + historics=np.array(historics) + #print "historics[0:10]" + #print historics[0:10] + abs_states=learning_algo.encoder.predict(historics) + if(abs_states.ndim==4): + abs_states=np.transpose(abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + print "abs_states[0:10]" + print abs_states[0:10] + actions=test_data_set.actions()[0:n] + print "actions[0:10]" + print actions[0:10] + + print "test_data_set.rewards()[0:10]" + print test_data_set.rewards()[0:10] + print "test_data_set.terminals()[0:10]" + print test_data_set.terminals()[0:10] + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + + +# import matplotlib.pyplot as plt +# from mpl_toolkits.mplot3d import Axes3D +# import matplotlib.cm as cm +# m = cm.ScalarMappable(cmap=cm.jet) +# +# x = np.array(abs_states)[:,0] +# y = np.array(abs_states)[:,1] +# if(self.intern_dim>2): +# z = np.array(abs_states)[:,2] +# +# #Colors +# #onehot_actions = np.zeros((n, 4)) +# #onehot_actions[np.arange(n), actions] = 1 +# +# fig = plt.figure() +# if(self.intern_dim==2): +# ax = fig.add_subplot(111) +# else: +# ax = fig.add_subplot(111,projection='3d') +# +# #for j in range(3): +# # # Plot the trajectory +# # for i in xrange(n-1): +# # #ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) +# # ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) +# +# # Plot the estimated transitions +# for i in range(n-1): +# predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) +# predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0,0]])]) +# predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) +# predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) +# if(self.intern_dim==2): +# ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.15", alpha=0.75) +# ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.4", alpha=0.75) +# ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), 
np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.65", alpha=0.75) +# ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.9", alpha=0.75) +# else: +# ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.15", alpha=0.75) +# ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.4", alpha=0.75) +# ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.65", alpha=0.75) +# ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.9", alpha=0.75) +# +## for xx in np.arange(self._width)-self._width//2: +## for yy in np.arange(self._width)-self._width//2: +## for zz in np.arange(self._width)-self._width//2: +## predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) +## predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) +## ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) +## ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) +# +# +# ## Plot the colorbar for the trajectory +# #fig.subplots_adjust(right=0.7) +# #ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) +# ## Set the colormap and norm to correspond to the data for which the colorbar will be used. +# #cmap = matplotlib.cm.cool +# #norm = matplotlib.colors.Normalize(vmin=0, vmax=1) +# # +# ## ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has +# ## everything needed for a standalone colorbar. There are many more kwargs, but the +# ## following gives a basic continuous colorbar with ticks and labels. 
+# #cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, +# # norm=norm, +# # orientation='vertical') +# #cb1.set_label('Beginning to end of trajectory') +# +# +# # Plot the dots at each time step depending on the action taken +# length_block=[[0,15],[15,16],[16,31]] +# for i in range(3): +# if(self.intern_dim==2): +# line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) +# else: +# line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) +# #line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) +# #line2 = ax.scatter(x, y, c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75) +# if(self.intern_dim==2): +# axes_lims=[ax.get_xlim(),ax.get_ylim()] +# else: +# axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] +# +# #zrange=axes_lims[2][1]-axes_lims[2][0] +# +# # Plot the legend for the dots +# from matplotlib.patches import Circle, Rectangle +# from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker +## box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) +## +## box2 = DrawingArea(80, 20, 0, 0) +## el1 = Circle((10, 10), 5, fc="0.9", edgecolor="k", alpha=0.75) +## el2 = Circle((25, 10), 5, fc="0.65", edgecolor="k", alpha=0.75) +## el3 = Circle((40, 10), 5, fc="0.4", edgecolor="k", alpha=0.75) +## el4 = Circle((55, 10), 5, fc="0.15", edgecolor="k", alpha=0.75) +## #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") +## box2.add_artist(el1) +## box2.add_artist(el2) +## box2.add_artist(el3) +## box2.add_artist(el4) +## +## +## box = HPacker(children=[box1, box2], +## align="center", +## pad=0, sep=5) +## +## anchored_box = AnchoredOffsetbox(loc=3, +## child=box, pad=0., +## frameon=True, +## bbox_to_anchor=(0., 1.07), +## bbox_transform=ax.transAxes, +## borderpad=0., +## ) +## ax.add_artist(anchored_box) +# +# +# # Plot the legend for transition estimates +# box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) +# box2b = DrawingArea(90, 20, 0, 0) +# el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) +# el2b = Rectangle((25, 10), 15,2, fc="0.65", alpha=0.75) +# el3b = Rectangle((45, 10), 15,2, fc="0.4", alpha=0.75) +# el4b = Rectangle((65, 10), 15,2, fc="0.15", alpha=0.75) +# box2b.add_artist(el1b) +# box2b.add_artist(el2b) +# box2b.add_artist(el3b) +# box2b.add_artist(el4b) +# +# boxb = HPacker(children=[box1b, box2b], +# align="center", +# pad=0, sep=5) +# +# anchored_box = AnchoredOffsetbox(loc=3, +# child=boxb, pad=0., +# frameon=True, +# bbox_to_anchor=(0., 0.98), +# bbox_transform=ax.transAxes, +# borderpad=0., +# ) +# ax.add_artist(anchored_box) +# +# +# +# #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) +# #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) +# #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) +# plt.show() +# plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + + +# # Plot the Q_vals +# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) +# #print "actions,C" +# 
#print actions +# #print c +# #c=np.max(c,axis=1) +# m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) +# +# #plt.colorbar(m3) +# ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.RdYlGn +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. +# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') +# +# +# # fig_visuV +# fig = plt.figure() +# ax = fig.add_subplot(111, projection='3d') +# +# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] +# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] +# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] +# +# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) +# c=np.max(c,axis=1) +# #print "c" +# #print c +# +# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) +# #plt.colorbar(m) +# fig.subplots_adjust(right=0.8) +# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.hot +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. 
+# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') +# +# #plt.show() +# plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') +# +# +# # fig_visuR +# fig = plt.figure() +# ax = fig.add_subplot(111, projection='3d') +# +# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] +# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] +# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] +# +# coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) +# repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) +# identity_matrix = np.diag(np.ones(self.nActions())) +# tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) +# +# c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) +# c=np.max(np.reshape(c,(125,self.nActions())),axis=1) +# #print "c" +# #print c +# #mini=np.min(c) +# #maxi=np.max(c) +# +# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) +# #plt.colorbar(m) +# fig.subplots_adjust(right=0.8) +# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) +# cmap = matplotlib.cm.hot +# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) +# +# # ColorbarBase derives from ScalarMappable and puts a colorbar +# # in a specified axes, so it has everything needed for a +# # standalone colorbar. There are many more kwargs, but the +# # following gives a basic continuous colorbar with ticks +# # and labels. +# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') +# cb1.set_label('Estimated expected return') + + #plt.show() + plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + + matplotlib.pyplot.close("all") # avoids memory leaks + + def inputDimensions(self): + return [(1,self._height,self._width)] + + def observationType(self, subject): + return np.float32 + + def nActions(self): + return len(self._actions) + + def observe(self): + obs=copy.deepcopy(self._map) + obs[self._pos_agent[0],self._pos_agent[1]]=0.5 + return [obs] + + def inTerminalState(self): + return False + + + +if __name__ == "__main__": + pass From 6967c4a16a7641a148896b5be400e1be129f490e Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 7 Mar 2018 18:15:33 -0500 Subject: [PATCH 39/96] fix --- deer/q_networks/NN_keras_lp_high_int_dim.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index 04997aa8..e666a506 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -178,9 +178,6 @@ def encoder_diff_model(self,encoder_model): if (self._high_int_dim==True): x1=Flatten()(x1) x2=Flatten()(x2) - x = Subtract()([x1,x2]) #([Tx,enc_x_]) - else: - x = Subtract()([Tx,enc_x_]) x = Subtract()([x1,x2]) model = Model(inputs=inputs, outputs=x) From cf77bc98040632f0201c9f74cad99d6d7ef689a8 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 23 Mar 2018 14:53:25 -0400 Subject: [PATCH 40/96] nstep learning (work on maze with size5 with low int dim) and a few minor changes --- deer/agent.py | 126 +++++++- deer/base_classes/Policy.py | 4 +- deer/policies/EpsilonGreedyPolicy.py | 4 +- deer/q_networks/NN_keras_lp_high_int_dim.py | 149 ++++++---- 
deer/q_networks/q_net_keras_lp.py | 148 ++++++--- examples/simplest_test_PLI/run_test4.py | 9 +- examples/simplest_test_PLI/test_env4.py | 313 ++++++++++---------- 7 files changed, 498 insertions(+), 255 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index 287614c2..cec332af 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -195,8 +195,13 @@ def train(self): return try: - states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(self._batch_size, self._exp_priority) - loss, loss_ind = self._network.train(states, actions, rewards, next_states, terminals) + if hasattr(self._network, 'nstep'): + observations, actions, rewards, terminals, rndValidIndices = self._dataset.randomBatch_nstep(self._batch_size, self._network.nstep, self._exp_priority) + loss, loss_ind = self._network.train(observations, actions, rewards, terminals) + else: + states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(self._batch_size, self._exp_priority) + loss, loss_ind = self._network.train(states, actions, rewards, next_states, terminals) + self._training_loss_averages.append(loss) if (self._exp_priority): self._dataset.updatePriorities(pow(loss_ind,self._exp_priority)+0.0001, rndValidIndices[1]) @@ -382,7 +387,7 @@ def _chooseAction(self): if self._mode != -1: # Act according to the test policy if not in training mode - action, V = self._test_policy.action(self._state) + action, V = self._test_policy.action(self._state, mode=self._mode) else: if self._dataset.n_elems > self._replay_start_size: # follow the train policy @@ -495,14 +500,17 @@ def updatePriorities(self, priorities, rndValidIndices): self._prioritiy_tree.update(rndValidIndices[i], priorities[i]) def randomBatch(self, size, use_priority): - """Return corresponding states, actions, rewards, terminal status, and next_states for size randomly - chosen transitions. Note that if terminal[i] == True, then next_states[s][i] == np.zeros_like(states[s][i]) for + """Return corresponding states, actions, rewards, terminal status, and next_states for size randomly + chosen transitions. Note that if terminal[i] == True, then next_states[s][i] == np.zeros_like(states[s][i]) for each subject s. Parameters ----------- size : int Number of transitions to return. + use_priority : Boolean + Whether to use prioritized replay or not + Returns ------- states : ndarray @@ -522,6 +530,7 @@ def randomBatch(self, size, use_priority): terminals : ndarray An ndarray(size=number_of_subjects, dtype='bool') where terminals[i] is True if actions[i] lead to terminal states and False otherwise + Throws ------- SliceError @@ -529,7 +538,7 @@ def randomBatch(self, size, use_priority): trajectories are too short). """ - if (self._max_history_size - self.sticky_action >= self.n_elems): + if (self._max_history_size + self.sticky_action - 1 >= self.n_elems): raise SliceError( "Not enough elements in the dataset to create a " "complete state. 
{} elements in dataset; requires {}" @@ -571,7 +580,7 @@ def randomBatch(self, size, use_priority): states[input] = np.zeros((size,) + self._batch_dimensions[input], dtype=self._observations[input].dtype) next_states[input] = np.zeros_like(states[input]) for i in range(size): - slice=self._observations[input].getSlice(rndValidIndices[i]-self.sticky_action+2-min(self._batch_dimensions[input][0],first_terminals[i]+self.sticky_action-1), rndValidIndices[i]-self.sticky_action+2) + slice=self._observations[input].getSlice(rndValidIndices[i]-self.sticky_action+2-min(self._batch_dimensions[input][0],first_terminals[i]+self.sticky_action-1), rndValidIndices[i]+1) if (len(slice)==len(states[input][i])): states[input][i] = slice else: @@ -594,6 +603,109 @@ def randomBatch(self, size, use_priority): else: return states, actions, rewards, next_states, terminals, rndValidIndices + def randomBatch_nstep(self, size, nstep, use_priority): + """Return corresponding states, actions, rewards, terminal status, and next_states for size randomly + chosen transitions. Note that if terminal[i] == True, then next_states[s][i] == np.zeros_like(states[s][i]) for + each subject s. + + Parameters + ----------- + size : int + Batch size + nstep : int + Number of transitions to be considered for each element + use_priority : Boolean + Whether to use prioritized replay or not + + Returns + ------- + states : ndarray + An ndarray(size=number_of_subjects, dtype='object), where states[s] is a 2+D matrix of dimensions + size x s.memorySize x "shape of a given observation for this subject". States were taken randomly in + the data with the only constraint that they are complete regarding the histories for each observed + subject. + actions : ndarray + An ndarray(size=number_of_subjects, dtype='int32') where actions[i] is the action taken after + having observed states[:][i]. + rewards : ndarray + An ndarray(size=number_of_subjects, dtype='float32') where rewards[i] is the reward obtained for + taking actions[i-1]. + next_states : ndarray + Same structure than states, but next_states[s][i] is guaranteed to be the information + concerning the state following the one described by states[s][i] for each subject s. + terminals : ndarray + An ndarray(size=number_of_subjects, dtype='bool') where terminals[i] is True if actions[i] lead + to terminal states and False otherwise + + Throws + ------- + SliceError + If a batch of this size could not be built based on current data set (not enough data or all + trajectories are too short). + """ + + if (self._max_history_size + self.sticky_action - 1 >= self.n_elems): + raise SliceError( + "Not enough elements in the dataset to create a " + "complete state. {} elements in dataset; requires {}" + .format(self.n_elems, self._max_history_size)) + + if (self._use_priority): + #FIXME : take into account the case where self._only_full_history is false + rndValidIndices, rndValidIndices_tree = self._randomPrioritizedBatch(size) + if (rndValidIndices.size == 0): + raise SliceError("Could not find a state with full histories") + else: + rndValidIndices = np.zeros(size, dtype='int32') + if (self._only_full_history): + for i in range(size): # TODO: multithread this loop? + rndValidIndices[i] = self._randomValidStateIndex(self._max_history_size+self.sticky_action*nstep-1) + else: + for i in range(size): # TODO: multithread this loop? 
+ rndValidIndices[i] = self._randomValidStateIndex(minimum_without_terminal=self.sticky_action*nstep) + + + actions=np.zeros((size,(nstep)*self.sticky_action), dtype=int) + rewards=np.zeros((size,(nstep)*self.sticky_action)) + terminals=np.zeros((size,(nstep)*self.sticky_action)) + for i in range(size): + actions[i] = self._actions.getSlice(rndValidIndices[i]-self.sticky_action*nstep+1,rndValidIndices[i]+self.sticky_action) + rewards[i] = self._rewards.getSlice(rndValidIndices[i]-self.sticky_action*nstep+1,rndValidIndices[i]+self.sticky_action) + terminals[i] = self._terminals.getSlice(rndValidIndices[i]-self.sticky_action*nstep+1,rndValidIndices[i]+self.sticky_action) + + observations = np.zeros(len(self._batch_dimensions), dtype='object') + # We calculate the first terminal index backward in time and set it + # at maximum to the value self._max_history_size+self.sticky_action-1 + first_terminals=[] + for rndValidIndex in rndValidIndices: + first_terminal=1 + while first_terminalrndValidIndex): + break + first_terminal+=1 + first_terminals.append(first_terminal) + + batch_dimensions=copy.deepcopy(self._batch_dimensions) + for input in range(len(self._batch_dimensions)): + batch_dimensions[input]=tuple( x + y for x, y in zip(self._batch_dimensions[input],(self.sticky_action*(nstep+1)-1,0,0)) ) + observations[input] = np.zeros((size,) + batch_dimensions[input], dtype=self._observations[input].dtype) + for i in range(size): + slice=self._observations[input].getSlice(rndValidIndices[i]-self.sticky_action*nstep+2-min(self._batch_dimensions[input][0],first_terminals[i]-self.sticky_action*nstep+1), rndValidIndices[i]+self.sticky_action+1) + if (len(slice)==len(observations[input][i])): + observations[input][i] = slice + else: + for j in range(len(slice)): + observations[input][i][-j-1]=slice[-j-1] + # If transition leads to terminal, we don't care about next state + if terminals[i][-1]:#rndValidIndices[i] >= self.n_elems - 1 or terminals[i]: + observations[input][rndValidIndices[i]:rndValidIndices[i]+self.sticky_action+1] = 0 + + if (self._use_priority): + return observations, actions, rewards, terminals, [rndValidIndices, rndValidIndices_tree] + else: + return observations, actions, rewards, terminals, rndValidIndices + + def _randomValidStateIndex(self, minimum_without_terminal): """ Returns the index corresponding to a timestep that is valid """ diff --git a/deer/base_classes/Policy.py b/deer/base_classes/Policy.py index b64b3c40..2797f2af 100644 --- a/deer/base_classes/Policy.py +++ b/deer/base_classes/Policy.py @@ -19,10 +19,10 @@ def __init__(self, q_network, n_actions,random_state): pass - def bestAction(self, state): + def bestAction(self, state, mode=None): """ Returns the best Action for the given state. This is an additional encapsulation for q-network. 
""" - action,V = self.q_network.chooseBestAction(state) + action,V = self.q_network.chooseBestAction(state, mode) return action, V def randomAction(self): diff --git a/deer/policies/EpsilonGreedyPolicy.py b/deer/policies/EpsilonGreedyPolicy.py index f9386923..547142ae 100644 --- a/deer/policies/EpsilonGreedyPolicy.py +++ b/deer/policies/EpsilonGreedyPolicy.py @@ -14,11 +14,11 @@ def __init__(self, q_network, n_actions, random_state, epsilon): Policy.__init__(self, q_network, n_actions, random_state) self._epsilon = epsilon - def action(self, state): + def action(self, state, mode=None): if self.random_state.rand() < self._epsilon: action, V = self.randomAction() else: - action, V = self.bestAction(state) + action, V = self.bestAction(state, mode) return action, V diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index e666a506..49c9c4b9 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -26,7 +26,7 @@ class NN(): high_int_dim : Boolean Whether the abstract state should be high dimensional in the form of frames/vectors or whether it should be low-dimensional """ - def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False, high_int_dim=True): + def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False, high_int_dim=False): self._input_dimensions=input_dimensions self._batch_size=batch_size self._random_state=random_state @@ -62,17 +62,25 @@ def encoder_model(self): input = Input(shape=(dim[0],dim[1],dim[2])) inputs.append(input) x=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' - #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - #x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - + if(dim[1]>8 and dim[2]>8): + self._pooling_encoder=6 + #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = MaxPooling2D(pool_size=(3, 3), strides=None, padding='same')(x) + #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + else: + self._pooling_encoder=1 + x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) + x = MaxPooling2D(pool_size=(self._pooling_encoder, self._pooling_encoder), strides=None, padding='same')(x) + if(self._high_int_dim==True): - x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(dim[0], (1, 1), padding='same')(x) out = x else: out = Flatten()(x) @@ -198,22 +206,22 @@ def transition_model(self): """ if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME - inputs = [ Input(shape=(dim[1],dim[2],dim[0])), 
Input( shape=(self._n_actions,) ) ] # data_format='channels_last' + inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),dim[0])), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' print inputs[0]._keras_shape print inputs[1]._keras_shape layers_action=inputs[1] - layers_action=RepeatVector(dim[1]*dim[2])(layers_action)#K.repeat_elements(layers_action,rep=dim[1]*dim[2],axis=1) - layers_action=Reshape((self._n_actions,dim[1],dim[2]))(layers_action) - layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,dim[1],dim[2]))(layers_action) #data_format='channels_last' + layers_action=RepeatVector(-(-dim[1] // self._pooling_encoder)*-(-dim[2] // self._pooling_encoder))(layers_action)#K.repeat_elements(layers_action,rep=dim[1]*dim[2],axis=1) + layers_action=Reshape((self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) + layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' x = Concatenate(axis=-1)([layers_action,inputs[0]]) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) - x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) # Try to keep locality as much as possible --> FIXME + x = Conv2D(64, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(dim[0], (1, 1), padding='same', activation='tanh')(x) x = Add()([inputs[0],x]) else: inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] # x @@ -259,7 +267,7 @@ def transition_model2(self): return model - def diff_Tx_x_(self,encoder_model,transition_model): + def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): """ Parameters @@ -292,20 +300,20 @@ def diff_Tx_x_(self,encoder_model,transition_model): enc_x = encoder_model(inputs[:half]) #s --> x enc_x_ = encoder_model(inputs[half:]) #s --> x - input = Input(shape=(self._n_actions,)) - inputs.append(input) + Tx= enc_x + for d in range(plan_depth+1): + inputs.append(Input(shape=(self._n_actions,))) + Tx= transition_model([Tx,inputs[-1]]) - Tx= transition_model([enc_x,inputs[-1]]) print "Tx._keras_shape" print Tx._keras_shape print enc_x_._keras_shape + + x = Subtract()([Tx,enc_x_]) - if (self._high_int_dim==True): - #Tx=Flatten()(Tx) - #enc_x_=Flatten()(enc_x_) - x = Subtract()([Tx,enc_x_]) #([Tx,enc_x_]) - else: - x = Subtract()([Tx,enc_x_]) + input = Input(shape=(1,)) # 1-terminals (0 if transition is terminal) + inputs.append(input) + x = Multiply()([x,inputs[-1]])# set to 0 if terminal because we don't care about fitting that transition model = Model(inputs=inputs, outputs=x ) @@ -456,15 +464,21 @@ def R_model(self): if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME - inputs = [ Input(shape=(dim[1],dim[2],dim[0])), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' + inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),dim[0])), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' layers_action=inputs[1] - layers_action=RepeatVector(dim[1]*dim[2])(layers_action) - 
layers_action=Reshape((self._n_actions,dim[1],dim[2]))(layers_action) - layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,dim[1],dim[2]))(layers_action) #data_format='channels_last' + layers_action=RepeatVector(-(-dim[1] // self._pooling_encoder)*-(-dim[2] // self._pooling_encoder))(layers_action) + print layers_action._keras_shape + layers_action=Reshape((self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) + layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' + print layers_action._keras_shape + x = Concatenate(axis=-1)([layers_action,inputs[0]]) - x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) + x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) x = Flatten()(x) @@ -473,8 +487,8 @@ def R_model(self): x = Concatenate()(inputs)#,axis=-1) x = Dense(10, activation='tanh')(x) + x = Dense(50, activation='tanh')(x) x = Dense(20, activation='tanh')(x) - x = Dense(10, activation='tanh')(x) out = Dense(1)(x) @@ -482,7 +496,7 @@ def R_model(self): return model - def full_R_model(self,encoder_model,R_model): + def full_R_model(self,encoder_model,R_model,plan_depth=0,transition_model=None): """ Maps internal state to immediate rewards @@ -490,7 +504,6 @@ def full_R_model(self,encoder_model,R_model): ----------- s a - (noise in abstract state space) : FIXME Returns ------- @@ -512,12 +525,17 @@ def full_R_model(self,encoder_model,R_model): input = Input(shape=(dim[0],)) inputs.append(input) + enc_x = encoder_model(inputs[:]) #s --> x + + Tx= enc_x + for d in range(plan_depth): + inputs.append(Input(shape=(self._n_actions,))) + Tx= transition_model([Tx,inputs[-1]]) + input = Input(shape=(self._n_actions,)) inputs.append(input) - enc_x = encoder_model(inputs[:-1]) #s --> x - - out = R_model([enc_x]+inputs[-1:]) + out = R_model([Tx]+inputs[-1:]) model = Model(inputs=inputs, outputs=out) @@ -532,15 +550,17 @@ def Q_model(self): print "dim Q mod" print dim if len(dim) == 3: - input = Input(shape=(dim[1],dim[2],dim[0])) #data_format is already 'channels_last' + input = Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),dim[0])) #data_format is already 'channels_last' inputs.append(input) #reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) x = input #data_format is already 'channels_last' print x._keras_shape - x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) - x = Conv2D(1, (2, 2), padding='same', activation='tanh')(x) + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(1, (3, 3), padding='same', activation='tanh')(x) out = (x) else: print ("FIXME") @@ -594,7 +614,7 @@ def Q_model(self): return model - def full_Q_model(self, encoder_model, Q_model): + def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=None, R_model=None, discount_model=None): """ Build a network 
consistent with each type of inputs @@ -623,20 +643,41 @@ def full_Q_model(self, encoder_model, Q_model): inputs.append(input) out = encoder_model(inputs) - print out._keras_shape - if(self._high_int_dim==True): - input = Input(shape=(dim[1],dim[2],dim[0])) - inputs.append(input) - else: - input = Input(shape=(self.internal_dim,)) - inputs.append(input) + disc_plan = None + disc_rewards=[] + for d in range(plan_depth): + inputs.append(Input(shape=(self._n_actions,))) + print inputs[-1:] + reward=R_model([out]+inputs[-1:]) + if(disc_plan == None): + disc_rewards.append(reward) + else: + disc_rewards.append(Multiply()([disc_plan,reward])) + discount=discount_model([out]+inputs[-1:]) + if(disc_plan == None): + disc_plan=discount + else: + disc_plan=Multiply()([disc_plan,discount]) #disc_model([out]+inputs[-1:]) + + out=transition_model([out]+inputs[-1:]) - x=Add()([out,inputs[-1]]) # adding noise in the abstract state space + #if(self._high_int_dim==True): + # input = Input(shape=(dim[1],dim[2],dim[0])) + # inputs.append(input) + #else: + # input = Input(shape=(self.internal_dim,)) + # inputs.append(input) + # + #x=Add()([out,inputs[-1]]) # adding noise in the abstract state space - out = Q_model(out) + if(plan_depth==0): + Q_estim=Q_model(out) + else: + Q_estim = Multiply()([disc_plan,Q_model(out)]) + Q_estim = Add()([Q_estim]+disc_rewards) - model = Model(inputs=inputs, outputs=out) + model = Model(inputs=inputs, outputs=Q_estim) return model diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index f9669332..72675445 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -81,7 +81,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.loss_disambiguate2=0 - self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=True) + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=False) self.encoder = self.learn_and_plan.encoder_model() self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) @@ -91,7 +91,9 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.transition = self.learn_and_plan.transition_model() self.transition2 = self.learn_and_plan.transition_model2() - self.full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) + self.full_Qs=[] + for i in range(1): + self.full_Qs.append(self.learn_and_plan.full_Q_model(self.encoder,self.Q,i,self._df)) # used to fit rewards self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) @@ -107,7 +109,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de # used to disentangle actions self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) - layers=self.full_Q.layers + layers=self.full_Qs[0].layers # Grab all the parameters together. 
self.params = [ param for layer in layers @@ -115,7 +117,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._compile() - self.next_full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) + self.next_full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) # FIXME self.next_full_Q.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q layers=self.next_full_Q.layers @@ -173,14 +175,14 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print "len(states_val)" print len(states_val) print next_states_val[0][0] - print actions_val, rewards_val, terminals_val - print "Es,ETs,Es_" + print actions_val[0], rewards_val[0], terminals_val[0] + print "Es[0],ETs[0],Es_[0]" if(Es.ndim==4): - print np.transpose(Es, (0, 3, 1, 2)),np.transpose(ETs, (0, 3, 1, 2)),np.transpose(Es_, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + print np.transpose(Es, (0, 3, 1, 2))[0],np.transpose(ETs, (0, 3, 1, 2))[0],np.transpose(Es_, (0, 3, 1, 2))[0] # data_format='channels_last' --> 'channels_first' else: - print Es,ETs,Es_ - print "R" - print R + print Es[0],ETs[0],Es_[0] + print "R[0]" + print R[0] # Fit transition # for i in range(10): @@ -188,7 +190,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # print l # self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) - l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) + l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) self.loss_T+=l @@ -263,10 +265,12 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if self.update_counter % self._freeze_interval == 0: self._resetQHat() - next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros_like(Es)]) #np.zeros((32,self.learn_and_plan.internal_dim))]) + #next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros_like(Es)]) #np.zeros((32,self.learn_and_plan.internal_dim))]) + next_q_vals = self.next_full_Q.predict([next_states_val[0]]) if(self._double_Q==True): - next_q_vals_current_qnet=self.full_Q.predict(next_states_val.tolist()) + #next_q_vals_current_qnet=self.full_Qs[0].predict(next_states_val+[np.zeros_like(Es)]) + next_q_vals_current_qnet=self.full_Qs[0].predict(next_states_val) argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) else: @@ -276,8 +280,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) - q_vals=self.full_Q.predict([states_val[0],np.zeros_like(Es)]) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) - + #q_vals=self.full_Q.predict([states_val[0],np.zeros_like(Es)]) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) + q_vals=self.full_Qs[0].predict([states_val[0]]) + # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) diff = - q_val + target @@ -293,7 +298,8 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals 
noise_to_be_robust=np.zeros_like(Es) #np.random.normal(size=(self._batch_size,self.learn_and_plan.internal_dim))*0.#25 loss=0 - loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) + #loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) + loss=self.full_Qs[0].train_on_batch([states_val[0]] , q_vals ) #print "self.q_vals.optimizer.lr" #print K.eval(self.q_vals.optimizer.lr) self.loss_Q+=loss @@ -364,9 +370,10 @@ def qValues(self, state_val): ------- The q values for the provided belief state """ - return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] + #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] + return self.full_Qs[0].predict([np.expand_dims(state,axis=0) for state in state_val])[0] - def qValues_planning(self, state_val, d=2): + def qValues_planning(self, state_val, d=5): """ Get the q values for one belief state with a planning depth d Arguments @@ -379,43 +386,116 @@ def qValues_planning(self, state_val, d=2): The q values with planning depth d for the provided belief state """ encoded_x = self.encoder.predict([np.expand_dims(state,axis=0) for state in state_val]) + print encoded_x[0] + + ## DEBUG PURPOSES + identity_matrix = np.diag(np.ones(self._n_actions)) + if(encoded_x.ndim==2): + tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) + elif(encoded_x.ndim==4): + tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) + else: + print ("error") + + repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) + #print tile3_encoded_x + #print repeat_identity + r_vals_d0=np.array(self.R.predict([tile3_encoded_x,repeat_identity])) + #print "r_vals_d0" + #print r_vals_d0 + r_vals_d0=r_vals_d0.flatten() + print "r_vals_d0" + print r_vals_d0 + next_x_predicted=self.transition.predict([tile3_encoded_x,repeat_identity]) + print "next_x_predicted" + print next_x_predicted + next_x_predicted=self.transition.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + print "next_x_predicted action 0 t2" + print next_x_predicted + next_x_predicted=self.transition.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + print "next_x_predicted action 0 t3" + print next_x_predicted + next_x_predicted=self.transition.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + print "next_x_predicted action 0 t4" + print next_x_predicted + ## END DEBUG PURPOSES + QD_plan=0 for i in range(d+1): #TO DO: improve planning algorithm - Qd=self.qValues_planning_abstr(encoded_x, d=i) + Qd=self.qValues_planning_abstr(encoded_x, d=i, branching_factor=2) + print Qd QD_plan+=Qd - #print "Qd,i" - #print Qd,i + print "Qd,i" + print Qd,i QD_plan=QD_plan/(d+1) - #print "QD_plan" - #print QD_plan + print "QD_plan" + print QD_plan return QD_plan - def qValues_planning_abstr(self, state_abstr_val, d): - """ + def qValues_planning_abstr(self, state_abstr_val, d, branching_factor=None): """ + """ + if(branching_factor==None or branching_factor>self._n_actions): + branching_factor=self._n_actions #print "qValues_planning_abstr d" #print d n=len(state_abstr_val) identity_matrix = np.diag(np.ones(self._n_actions)) + + if (n==1): + this_branching_factor=self._n_actions + else: + this_branching_factor=branching_factor + if (d==0): - #print self.Q.predict([state_abstr_val]) - return self.Q.predict([state_abstr_val]) + if(this_branching_factor2): -# z = 
np.array(abs_states)[:,2] -# -# #Colors -# #onehot_actions = np.zeros((n, 4)) -# #onehot_actions[np.arange(n), actions] = 1 -# -# fig = plt.figure() -# if(self.intern_dim==2): -# ax = fig.add_subplot(111) -# else: -# ax = fig.add_subplot(111,projection='3d') -# -# #for j in range(3): -# # # Plot the trajectory -# # for i in xrange(n-1): -# # #ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) -# # ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) -# -# # Plot the estimated transitions -# for i in range(n-1): -# predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) -# predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0,0]])]) -# predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) -# predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) -# if(self.intern_dim==2): -# ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.15", alpha=0.75) -# ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.4", alpha=0.75) -# ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.65", alpha=0.75) -# ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.9", alpha=0.75) -# else: -# ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.15", alpha=0.75) -# ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.4", alpha=0.75) -# ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.65", alpha=0.75) -# ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.9", alpha=0.75) -# -## for xx in np.arange(self._width)-self._width//2: -## for yy in np.arange(self._width)-self._width//2: -## for zz in np.arange(self._width)-self._width//2: -## predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) -## predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) -## ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) -## ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) -# -# -# ## Plot the colorbar for the trajectory -# #fig.subplots_adjust(right=0.7) -# #ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) -# ## Set the colormap and norm to correspond to the data for which the colorbar will be used. -# #cmap = matplotlib.cm.cool -# #norm = matplotlib.colors.Normalize(vmin=0, vmax=1) -# # -# ## ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has -# ## everything needed for a standalone colorbar. There are many more kwargs, but the -# ## following gives a basic continuous colorbar with ticks and labels. 
-# #cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, -# # norm=norm, -# # orientation='vertical') -# #cb1.set_label('Beginning to end of trajectory') -# -# -# # Plot the dots at each time step depending on the action taken -# length_block=[[0,15],[15,16],[16,31]] -# for i in range(3): -# if(self.intern_dim==2): -# line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) -# else: -# line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) -# #line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) -# #line2 = ax.scatter(x, y, c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75) -# if(self.intern_dim==2): -# axes_lims=[ax.get_xlim(),ax.get_ylim()] -# else: -# axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] -# -# #zrange=axes_lims[2][1]-axes_lims[2][0] -# -# # Plot the legend for the dots -# from matplotlib.patches import Circle, Rectangle -# from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker -## box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) -## -## box2 = DrawingArea(80, 20, 0, 0) -## el1 = Circle((10, 10), 5, fc="0.9", edgecolor="k", alpha=0.75) -## el2 = Circle((25, 10), 5, fc="0.65", edgecolor="k", alpha=0.75) -## el3 = Circle((40, 10), 5, fc="0.4", edgecolor="k", alpha=0.75) -## el4 = Circle((55, 10), 5, fc="0.15", edgecolor="k", alpha=0.75) -## #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") -## box2.add_artist(el1) -## box2.add_artist(el2) -## box2.add_artist(el3) -## box2.add_artist(el4) -## -## -## box = HPacker(children=[box1, box2], -## align="center", -## pad=0, sep=5) -## -## anchored_box = AnchoredOffsetbox(loc=3, -## child=box, pad=0., -## frameon=True, -## bbox_to_anchor=(0., 1.07), -## bbox_transform=ax.transAxes, -## borderpad=0., -## ) -## ax.add_artist(anchored_box) -# -# -# # Plot the legend for transition estimates -# box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) -# box2b = DrawingArea(90, 20, 0, 0) -# el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) -# el2b = Rectangle((25, 10), 15,2, fc="0.65", alpha=0.75) -# el3b = Rectangle((45, 10), 15,2, fc="0.4", alpha=0.75) -# el4b = Rectangle((65, 10), 15,2, fc="0.15", alpha=0.75) -# box2b.add_artist(el1b) -# box2b.add_artist(el2b) -# box2b.add_artist(el3b) -# box2b.add_artist(el4b) + import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.cm as cm + m = cm.ScalarMappable(cmap=cm.jet) + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + if(self.intern_dim>2): + z = np.array(abs_states)[:,2] + + #Colors + #onehot_actions = np.zeros((n, 4)) + #onehot_actions[np.arange(n), actions] = 1 + + fig = plt.figure() + if(self.intern_dim==2): + ax = fig.add_subplot(111) + else: + ax = fig.add_subplot(111,projection='3d') + + #for j in range(3): + # # Plot the trajectory + # for i in xrange(n-1): + # #ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) + # ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], 
color=plt.cm.cool(255*i/n), alpha=0.5) + + # Plot the estimated transitions + for i in range(n-1): + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0,0]])]) + predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) + predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) + if(self.intern_dim==2): + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.15", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.9", alpha=0.75) + else: + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.15", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.9", alpha=0.75) + +# for xx in np.arange(self._width)-self._width//2: +# for yy in np.arange(self._width)-self._width//2: +# for zz in np.arange(self._width)-self._width//2: +# predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) +# predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) +# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) +# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) + + + ## Plot the colorbar for the trajectory + #fig.subplots_adjust(right=0.7) + #ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) + ## Set the colormap and norm to correspond to the data for which the colorbar will be used. + #cmap = matplotlib.cm.cool + #norm = matplotlib.colors.Normalize(vmin=0, vmax=1) + # + ## ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has + ## everything needed for a standalone colorbar. There are many more kwargs, but the + ## following gives a basic continuous colorbar with ticks and labels. 
+ #cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, + # norm=norm, + # orientation='vertical') + #cb1.set_label('Beginning to end of trajectory') + + + # Plot the dots at each time step depending on the action taken + length_block=[[0,15],[15,16],[16,31]] + for i in range(3): + if(self.intern_dim==2): + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) + else: + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) + #line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) + #line2 = ax.scatter(x, y, c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75) + if(self.intern_dim==2): + axes_lims=[ax.get_xlim(),ax.get_ylim()] + else: + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + + #zrange=axes_lims[2][1]-axes_lims[2][0] + + # Plot the legend for the dots + from matplotlib.patches import Circle, Rectangle + from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker +# box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) # -# boxb = HPacker(children=[box1b, box2b], +# box2 = DrawingArea(80, 20, 0, 0) +# el1 = Circle((10, 10), 5, fc="0.9", edgecolor="k", alpha=0.75) +# el2 = Circle((25, 10), 5, fc="0.65", edgecolor="k", alpha=0.75) +# el3 = Circle((40, 10), 5, fc="0.4", edgecolor="k", alpha=0.75) +# el4 = Circle((55, 10), 5, fc="0.15", edgecolor="k", alpha=0.75) +# #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") +# box2.add_artist(el1) +# box2.add_artist(el2) +# box2.add_artist(el3) +# box2.add_artist(el4) +# +# +# box = HPacker(children=[box1, box2], # align="center", # pad=0, sep=5) # # anchored_box = AnchoredOffsetbox(loc=3, -# child=boxb, pad=0., +# child=box, pad=0., # frameon=True, -# bbox_to_anchor=(0., 0.98), +# bbox_to_anchor=(0., 1.07), # bbox_transform=ax.transAxes, # borderpad=0., -# ) +# ) # ax.add_artist(anchored_box) -# -# -# -# #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) -# #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) -# #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) -# plt.show() -# plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + + + # Plot the legend for transition estimates + box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) + box2b = DrawingArea(90, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.65", alpha=0.75) + el3b = Rectangle((45, 10), 15,2, fc="0.4", alpha=0.75) + el4b = Rectangle((65, 10), 15,2, fc="0.15", alpha=0.75) + box2b.add_artist(el1b) + box2b.add_artist(el2b) + box2b.add_artist(el3b) + box2b.add_artist(el4b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + + + #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) + #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) 
+ plt.show() + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') # # Plot the Q_vals @@ -425,6 +430,10 @@ def observe(self): def inTerminalState(self): return False + #if (self._pos_agent==self._pos_goal): + # return True + #else: + # return False From edb4d1aa3e7d8294818451529fc2c5bee4c913ae Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 5 Apr 2018 15:51:10 -0400 Subject: [PATCH 41/96] working +- distrib laby --- deer/q_networks/NN_keras_lp_high_int_dim.py | 66 ++++---- examples/simplest_test_PLI/run_test3.py | 68 ++++---- examples/simplest_test_PLI/run_test4.py | 30 +++- examples/simplest_test_PLI/test_env3.py | 56 +++++-- examples/simplest_test_PLI/test_env4.py | 169 ++++++++++++++------ 5 files changed, 255 insertions(+), 134 deletions(-) diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index 49c9c4b9..8cc61248 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -6,7 +6,7 @@ import numpy as np from keras import backend as K from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge, RepeatVector +from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge, RepeatVector, AveragePooling2D from keras import regularizers np.random.seed(102912) @@ -34,9 +34,11 @@ def __init__(self, batch_size, input_dimensions, n_actions, random_state, action self._action_as_input=action_as_input self._high_int_dim=high_int_dim if(high_int_dim==True): - self.internal_dim=input_dimensions[0][-2]*input_dimensions[0][-1] # In the case where the observation is a frame (or an history of frames) +# self.internal_dim=input_dimensions[0][-2]*input_dimensions[0][-1] # In the case where the observation is a frame (or an history of frames) + self.n_channels_internal_dim=2#dim[0] else: - self.internal_dim=3 + self.internal_dim=3 #2 for laby + #3 for catcher def encoder_model(self): """ @@ -66,21 +68,22 @@ def encoder_model(self): self._pooling_encoder=6 #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) - x = MaxPooling2D(pool_size=(3, 3), strides=None, padding='same')(x) - #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) + x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) + x = AveragePooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) + x = AveragePooling2D(pool_size=(3, 3), strides=None, padding='same')(x) + #x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) else: self._pooling_encoder=1 - x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) - x = MaxPooling2D(pool_size=(self._pooling_encoder, self._pooling_encoder), strides=None, padding='same')(x) + 
#x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) + #x = MaxPooling2D(pool_size=(self._pooling_encoder, self._pooling_encoder), strides=None, padding='same')(x) if(self._high_int_dim==True): - x = Conv2D(dim[0], (1, 1), padding='same')(x) + x = Conv2D(self.n_channels_internal_dim, (1, 1), padding='same')(x) out = x else: out = Flatten()(x) @@ -206,22 +209,24 @@ def transition_model(self): """ if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME - inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),dim[0])), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' + inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' print inputs[0]._keras_shape print inputs[1]._keras_shape layers_action=inputs[1] layers_action=RepeatVector(-(-dim[1] // self._pooling_encoder)*-(-dim[2] // self._pooling_encoder))(layers_action)#K.repeat_elements(layers_action,rep=dim[1]*dim[2],axis=1) layers_action=Reshape((self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) - layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' + layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' x = Concatenate(axis=-1)([layers_action,inputs[0]]) x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) # Try to keep locality as much as possible --> FIXME + x = Conv2D(32, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(64, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(32, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(dim[0], (1, 1), padding='same', activation='tanh')(x) + x = Conv2D(self.n_channels_internal_dim, (1, 1), padding='same')(x) x = Add()([inputs[0],x]) else: inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] # x @@ -269,7 +274,8 @@ def transition_model2(self): def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): """ - + Used to fit the transitions + Parameters ----------- s @@ -321,7 +327,8 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): def diff_s_s_(self,encoder_model): """ - + Used to force some state representation to be sufficiently different + Parameters ----------- s @@ -464,24 +471,27 @@ def R_model(self): if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME - inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),dim[0])), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' + inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' layers_action=inputs[1] layers_action=RepeatVector(-(-dim[1] // self._pooling_encoder)*-(-dim[2] // self._pooling_encoder))(layers_action) print layers_action._keras_shape layers_action=Reshape((self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) - 
layers_action=Permute((2,3,1), input_shape=(dim[0]+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' + layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' print layers_action._keras_shape x = Concatenate(axis=-1)([layers_action,inputs[0]]) - x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) + x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(dim[0], (3, 3), padding='same', activation='tanh')(x) - x = Flatten()(x) + x = Conv2D(4, (1, 1), padding='same', activation='tanh')(x) + + # we stack a deep fully-connected network on top + x = Flatten()(x) + x = Dense(200, activation='tanh')(x) else: inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x x = Concatenate()(inputs)#,axis=-1) @@ -550,17 +560,17 @@ def Q_model(self): print "dim Q mod" print dim if len(dim) == 3: - input = Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),dim[0])) #data_format is already 'channels_last' + input = Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),self.n_channels_internal_dim)) #data_format is already 'channels_last' inputs.append(input) #reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) x = input #data_format is already 'channels_last' print x._keras_shape - x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) + x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(16, (3, 3), padding='same', activation='tanh')(x) - x = Conv2D(1, (3, 3), padding='same', activation='tanh')(x) + x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) + x = Conv2D(4, (1, 1), padding='same', activation='tanh')(x) out = (x) else: print ("FIXME") diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_test3.py index 41538620..9cd9bf45 100644 --- a/examples/simplest_test_PLI/run_test3.py +++ b/examples/simplest_test_PLI/run_test3.py @@ -23,7 +23,7 @@ class Defaults: # Experiment Parameters # ---------------------- STEPS_PER_EPOCH = 5000 - EPOCHS = 500 + EPOCHS = 50 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 @@ -46,7 +46,7 @@ class Defaults: MOMENTUM = 0 CLIP_DELTA = 1.0 EPSILON_START = 1.0 - EPSILON_MIN = .3 + EPSILON_MIN = 1.0 EPSILON_DECAY = 10000 UPDATE_FREQUENCY = 1 REPLAY_MEMORY_SIZE = 1000000 @@ -68,7 +68,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env() + env = test_env(rng, higher_dim_obs=True) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -82,7 +82,7 @@ class Defaults: parameters.update_rule, rng) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), 
rng, 1.) # --- Instantiate agent --- agent = NeuralAgent( @@ -106,7 +106,28 @@ class Defaults: agent.attach(bc.VerboseController( evaluate_on='epoch', periodicity=1)) + + # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy + # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more + # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every + # episode or epoch (or never, hence the resetEvery='none'). + agent.attach(bc.EpsilonController( + initial_e=parameters.epsilon_start, + e_decays=parameters.epsilon_decay, + e_min=parameters.epsilon_min, + evaluate_on='action', + periodicity=1, + reset_every='none')) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.run(n_epochs=1, epoch_length=20000) + print "end gathering data" # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. # Plus, we also want to display after each training episode (!= than after every training) the average bellman # residual and the average of the V values obtained during the last episode, hence the two last arguments. @@ -115,7 +136,7 @@ class Defaults: periodicity=parameters.update_frequency, show_episode_avg_V_value=True, show_avg_Bellman_residual=True)) - + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. agent.attach(bc.LearningRateController( @@ -129,33 +150,7 @@ class Defaults: discount_factor_growth=parameters.discount_inc, discount_factor_max=parameters.discount_max, periodicity=1)) - - # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy - # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more - # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every - # episode or epoch (or never, hence the resetEvery='none'). - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one - # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the - # "true generalization score", or "test score". - # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is - # important that the validationID is the same than the id argument of the InterleavedTestEpochController. - # The FindBestController will dump on disk the validation scores for each and every network, as well as the - # structure of the neural network having the best validation score. These dumps can then used to plot the evolution - # of the validation and test scores (see below) or simply recover the resulting neural network for your - # application. 
- agent.attach(bc.FindBestController( - validationID=test_env.VALIDATION_MODE, - testID=None, - unique_fname=fname)) - + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want # these validation epoch to interfere with the training of the agent, which is well established by the @@ -171,13 +166,8 @@ class Defaults: periodicity=2, show_score=True, summarize_every=1)) - - # --- Run the experiment --- - try: - os.mkdir("params") - except Exception: - pass - dump(vars(parameters), "params/" + fname + ".jldump") + + agent.gathering_data=False agent.run(parameters.epochs, parameters.steps_per_epoch) # --- Show results --- diff --git a/examples/simplest_test_PLI/run_test4.py b/examples/simplest_test_PLI/run_test4.py index 7a83d695..65556605 100644 --- a/examples/simplest_test_PLI/run_test4.py +++ b/examples/simplest_test_PLI/run_test4.py @@ -22,7 +22,7 @@ class Defaults: # ---------------------- # Experiment Parameters # ---------------------- - STEPS_PER_EPOCH = 2000 + STEPS_PER_EPOCH = 5000 EPOCHS = 50 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 @@ -36,7 +36,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 + LEARNING_RATE = 0.0002 #0.0001 for high_int_dim, 0.00002 for low_int_dim LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 @@ -68,7 +68,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env() + env = test_env(rng, higher_dim_obs=False) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -134,8 +134,30 @@ class Defaults: periodicity=1, reset_every='none')) - agent.run(10, 200) #(5, 50) + agent.run(10, 100) #(5, 50) print("end gathering data") + #print "agent.DataSet.self._terminals" + #print "agent._dataset.terminals()" + #print agent._dataset.terminals() + #print agent._dataset._terminals._data[0:2000] + #print agent._dataset._actions._data[0:2000] +# r=agent._dataset._rewards._data[0:2000] +# print "r before" +# print r +# #print agent._dataset._observations[0]._data[0:10] +# ind=np.argwhere(r>0) +# print "agent._dataset._observations[0]._data[ind[0]]" +# print agent._dataset._observations[0]._data[ind[0]] +# print ind +# agent._dataset._rewards._data=np.delete(agent._dataset._rewards._data,ind) +# agent._dataset._terminals._data=np.delete(agent._dataset._terminals._data,ind) +# agent._dataset._actions._data=np.delete(agent._dataset._actions._data,ind) +# agent._dataset._observations[0]._data=np.delete(agent._dataset._observations[0]._data,ind,axis=0) +# r=agent._dataset._rewards._data[0:2000] +# print "r after" +# print r +# print "agent._dataset._observations[0]._data[ind[0]] after" +# print agent._dataset._observations[0]._data[ind[0]] # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 
# Plus, we also want to display after each training episode (!= than after every training) the average bellman diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/test_env3.py index c4da5122..fcce67e4 100644 --- a/examples/simplest_test_PLI/test_env3.py +++ b/examples/simplest_test_PLI/test_env3.py @@ -17,17 +17,19 @@ class MyEnv(Environment): VALIDATION_MODE = 0 - def __init__(self): + def __init__(self, rng, **kwargs): self._mode = -1 self._mode_score = 0.0 self._mode_episode_count = 0 self._actions = [0,1] - self._height=15 - self._width=7 #preferably an odd number so that it's symmetrical + self._height=10#15 + self._width=10 #preferably an odd number so that it's symmetrical self._width_paddle=1 - self._nx_block=3 #number of different x positions of the falling blocks + self._nx_block=2 #number of different x positions of the falling blocks + self._higher_dim_obs=kwargs["higher_dim_obs"] + if(self._nx_block==1): self._x_block=self._width//2 else: @@ -68,7 +70,7 @@ def act(self, action): self.y = self.y-1 - if(self.y==0 and self.x>self._x_block-self._width_paddle and self.x<=self._x_block): + if(self.y==0 and self.x>self._x_block-1-self._width_paddle and self.x<=self._x_block+1): self.reward = 1 elif(self.y==0): self.reward = -1 @@ -86,9 +88,7 @@ def summarizePerformance(self, test_data_set, learning_algo): for x_b in range(self._nx_block):#[1]:#range(self._nx_block): for y_b in range(self._height): for x_p in range(self._width-self._width_paddle+1): - state=np.zeros((self._height,self._width)) - state[y_b,x_b*((self._width-1)//(self._nx_block-1))]=0.5 - state[0,x_p-self._width_paddle+1:x_p+1]=1. + state=self.get_observation(y_b,x_b*((self._width-1)//(self._nx_block-1)),x_p) all_possib_inp.append(state) all_possib_inp=np.expand_dims(all_possib_inp,axis=1) @@ -97,8 +97,8 @@ def summarizePerformance(self, test_data_set, learning_algo): print all_possib_inp[self._height*(self._width-self._width_paddle+1)-1] print all_possib_inp[self._height*(self._width-self._width_paddle+1)] print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)-1] - print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)] - print all_possib_inp[3*self._height*(self._width-self._width_paddle+1)-1] + #print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)] + #print all_possib_inp[3*self._height*(self._width-self._width_paddle+1)-1] print "all_possib_inp.shape" print all_possib_inp.shape #print all_possib_inp[self._height*self._width] @@ -351,7 +351,11 @@ def summarizePerformance(self, test_data_set, learning_algo): matplotlib.pyplot.close("all") # avoids memory leaks def inputDimensions(self): - return [(1,self._height,self._width)] + if(self._higher_dim_obs==True): + return [(1,(self._height+2)*3,(self._width+2)*3)] + else: + return [(1,self._height,self._width)] + def observationType(self, subject): return np.float32 @@ -360,11 +364,35 @@ def nActions(self): return len(self._actions) def observe(self): - obs=np.zeros((self._height,self._width)) - obs[self.y,self._x_block]=0.5 - obs[0,self.x-self._width_paddle+1:self.x+1]=1 + obs=self.get_observation(self.y,self._x_block,self.x) return [obs] + def get_observation(self,y,x_block,x): + obs=np.zeros((self._height,self._width)) + obs[y,x_block]=0.5 + obs[0,x-self._width_paddle+1:x+1]=1 + + if(self._higher_dim_obs==True): + y_t=(1+y)*3 + x_block_t=(1+x_block)*3 + x_t=(1+x)*3 + obs=np.zeros(( (self._height+2)*3 , (self._width+2)*3 )) + ball=np.array([[0,0,0.6,0.8,0.6,0,0], + [0.,0.6,0.9,1,0.9,0.6,0], 
+ [0.,0.85,1,1,1,0.85,0.], + [0,0.6,0.9,1,0.9,0.6,0], + [0,0,0.6,0.85,0.6,0,0]]) + paddle=np.array([[0.5,0.95,1,1,1,0.95,0.5], + [0.9,1,1,1,1,1,0.9], + [0.,0.,0,0,0,0.,0.]]) + + obs[y_t-2:y_t+3,x_block_t-3:x_block_t+4]=ball + obs[3:6,x_t-3:x_t+4]=paddle + plt.imshow(np.flip(obs,axis=0), cmap='gray_r') + plt.show() + + return obs + def inTerminalState(self): if (self.y==0): return True diff --git a/examples/simplest_test_PLI/test_env4.py b/examples/simplest_test_PLI/test_env4.py index 5e85ea07..3c72a091 100644 --- a/examples/simplest_test_PLI/test_env4.py +++ b/examples/simplest_test_PLI/test_env4.py @@ -17,6 +17,7 @@ def transition_model(self): from deer.base_classes import Environment import matplotlib +#matplotlib.use('agg') matplotlib.use('qt5agg') from mpl_toolkits.axes_grid1 import host_subplot import mpl_toolkits.axisartist as AA @@ -26,45 +27,46 @@ def transition_model(self): class MyEnv(Environment): VALIDATION_MODE = 0 - def __init__(self): + def __init__(self, rng, **kwargs): self._mode = -1 self._mode_score = 0.0 self._mode_episode_count = 0 self._actions = [0,1,2,3] - self._height=7 - self._width=9 + self._size_maze=8 + self._higher_dim_obs=kwargs["higher_dim_obs"] self.create_map() - self.intern_dim=3 + self.intern_dim=2 def create_map(self): - self._map=np.zeros((self._height,self._width)) + self._map=np.zeros((self._size_maze,self._size_maze)) self._map[-1,:]=1 self._map[0,:]=1 self._map[:,0]=1 self._map[:,-1]=1 - self._map[:,self._width//2]=1 - #self._map[:,self._width//3]=1 - #self._map[-2,self._width//3]=0 - #self._map[:,2*self._width//3]=1 - #self._map[2,2*self._width//3]=0 + self._map[:,self._size_maze//2]=1 + self._map[self._size_maze//2,self._size_maze//2]=0 + #self._map[:,self._size_maze//3]=1 + #self._map[-2,self._size_maze//3]=0 + #self._map[:,2*self._size_maze//3]=1 + #self._map[2,2*self._size_maze//3]=0 self._pos_agent=[2,2] - self._pos_goal=[2,6] + self._pos_goal=[self._size_maze-2,self._size_maze-2] #self._map[3,6]=0.66 def reset(self, mode): self.create_map() - self._map[self._height//2,self._width//2]=0 + self._map[self._size_maze//2,self._size_maze//2]=0 #if mode == -1: # i=np.random.randint(2) # if(i==0): - # self._map[self._height//2-1,self._width//2]=0 + # self._map[self._size_maze//2-1,self._size_maze//2]=0 # if(i==1): - # self._map[self._height//2+1,self._width//2]=0 + # self._map[self._size_maze//2+1,self._size_maze//2]=0 #else: - # self._map[self._height//2+1,self._width//2]=0 + # self._map[self._size_maze//2+1,self._size_maze//2]=0 if mode == MyEnv.VALIDATION_MODE: if self._mode != MyEnv.VALIDATION_MODE: @@ -77,22 +79,22 @@ def reset(self, mode): elif self._mode != -1: self._mode = -1 - if self._mode == -1: - self._pos_agent=[self._height//2,self._width//2] - else: - self._pos_agent=[1,1] + #if self._mode == -1: + self._pos_agent=[self._size_maze//2,self._size_maze//2] + #else: + # self._pos_agent=[1,1] print "reset mode" print mode print "self._map" print self._map - return [1 * [self._height * [self._width * [0]]]] + return [1 * [self._size_maze * [self._size_maze * [0]]]] def act(self, action): action = self._actions[action] - + self._cur_action=action if(action==0): if(self._map[self._pos_agent[0]-1,self._pos_agent[1]]==0): self._pos_agent[0]=self._pos_agent[0]-1 @@ -107,8 +109,10 @@ def act(self, action): self._pos_agent[1]=self._pos_agent[1]+1 self.reward = 0 - if (self._pos_agent==self._pos_goal): - self.reward = 1 + #if (self._pos_agent==self._pos_goal): + # self.reward = 1 + #if (self._pos_agent[1]>=self._size_maze-2 and action==3): + # 
self.reward = 1 # used to delete those transitions self._mode_score += self.reward return self.reward @@ -120,40 +124,57 @@ def summarizePerformance(self, test_data_set, learning_algo): for i in range(1): all_possib_inp=[] self.create_map() - for x_a in range(self._width): - for y_a in range(self._height): + for y_a in range(self._size_maze): + for x_a in range(self._size_maze): state=copy.deepcopy(self._map) - state[self._height//2,self._width//2]=0 + state[self._size_maze//2,self._size_maze//2]=0 + if(state[x_a,y_a]==0): + if(self._higher_dim_obs==True): + all_possib_inp.append(self.get_higher_dim_obs([[x_a,y_a]],[self._pos_goal])) + else: + state[x_a,y_a]=0.5 + all_possib_inp.append(state) - if(state[y_a,x_a]==0): - state[y_a,x_a]=0.5 - all_possib_inp.append(state) - all_possib_inp=np.expand_dims(all_possib_inp,axis=1) - print "all_possib_inp[0:10]" - print all_possib_inp[0:10] + all_possib_inp=np.expand_dims(np.array(all_possib_inp,dtype='float'),axis=1) + #print "all_possib_inp[0:2]" + #print all_possib_inp[0:2] print "all_possib_inp.shape" print all_possib_inp.shape + print all_possib_inp.dtype + print all_possib_inp[0,0,:] + print "learning_algo.encoder.predict(all_possib_inp[0:1,0:1,:])" + print learning_algo.encoder.predict(all_possib_inp[0:1,0:1,:]) all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) if(all_possib_abs_states.ndim==4): all_possib_abs_states=np.transpose(all_possib_abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' - print "learning_algo.encoder.predict(all_possib_inp)[0:10]" - print all_possib_abs_states[0:10] + print "learning_algo.encoder.predict(all_possib_inp)[0:2]" + print all_possib_abs_states[0:2] - print "print test_data_set.observations()[0:10]" - print test_data_set.observations()[0:10] + #print "print test_data_set.observations()[0:2]" + #print test_data_set.observations()[0][0:2] n=500 historics=[] for i,observ in enumerate(test_data_set.observations()[0][0:n]): historics.append(np.expand_dims(observ,axis=0)) historics=np.array(historics) - #print "historics[0:10]" - #print historics[0:10] + #print "historics[0:2]" + #print historics[0:2] + print "historics.shape" + print historics.shape + print historics.dtype + print historics[0,0,:] + print "learning_algo.encoder.predict(historics[0:1,0:1,:])" + print learning_algo.encoder.predict(historics[0:1,0:1,:]) + print learning_algo.encoder.predict(all_possib_inp[0:1,0:1,:]) + print "all_possib_inp[0:1,0:1,:]==historics[0:1,0:1,:]" + print all_possib_inp[0:1,0:1,:]==historics[0:1,0:1,:] abs_states=learning_algo.encoder.predict(historics) if(abs_states.ndim==4): abs_states=np.transpose(abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' - print "abs_states[0:10]" - print abs_states[0:10] + print "abs_states[0:2]" + print abs_states[0:2] + print abs_states.shape actions=test_data_set.actions()[0:n] print "actions[0:10]" print actions[0:10] @@ -210,9 +231,9 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.65", alpha=0.75) ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.9", alpha=0.75) -# for xx in np.arange(self._width)-self._width//2: -# for yy in np.arange(self._width)-self._width//2: -# for zz in np.arange(self._width)-self._width//2: +# for xx in 
np.arange(self._size_maze)-self._size_maze//2: +# for yy in np.arange(self._size_maze)-self._size_maze//2: +# for zz in np.arange(self._size_maze)-self._size_maze//2: # predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) # predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) # ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) @@ -236,7 +257,7 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken - length_block=[[0,15],[15,16],[16,31]] + length_block=[[0,18],[18,19],[19,31]] for i in range(3): if(self.intern_dim==2): line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) @@ -312,8 +333,8 @@ def summarizePerformance(self, test_data_set, learning_algo): #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - plt.show() - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + #plt.show() + plt.savefig('fig_base_explo'+str(learning_algo.update_counter)+'.pdf') # # Plot the Q_vals @@ -415,20 +436,70 @@ def summarizePerformance(self, test_data_set, learning_algo): matplotlib.pyplot.close("all") # avoids memory leaks def inputDimensions(self): - return [(1,self._height,self._width)] + if(self._higher_dim_obs==True): + return [(1,self._size_maze*6,self._size_maze*6)] + else: + return [(1,self._size_maze,self._size_maze)] def observationType(self, subject): - return np.float32 + return np.float def nActions(self): return len(self._actions) def observe(self): obs=copy.deepcopy(self._map) - obs[self._pos_agent[0],self._pos_agent[1]]=0.5 + + obs[self._pos_agent[0],self._pos_agent[1]]=0.5 + if(self._higher_dim_obs==True): + "self._pos_agent" + self._pos_agent + obs=self.get_higher_dim_obs([self._pos_agent],[self._pos_goal]) + return [obs] + + def get_higher_dim_obs(self,indices_agent,indices_reward): + obs=copy.deepcopy(self._map) + obs=obs/1. 
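+        # Upscale the maze so that every cell becomes a 6x6 block of pixels;
+        # the 6x6 agent and reward sprites defined below are then stamped into their cells.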
+ obs=np.repeat(np.repeat(obs, 6, axis=0),6, axis=1) + # agent repr + agent_obs=np.zeros((6,6)) + agent_obs[0,2]=0.7 + agent_obs[1,0:5]=0.8 + agent_obs[2,1:4]=0.8 + agent_obs[3,1:4]=0.8 + agent_obs[4,1]=0.8 + agent_obs[4,3]=0.8 + agent_obs[5,0:2]=0.8 + agent_obs[5,3:5]=0.8 + + # reward repr + reward_obs=np.zeros((6,6)) + #reward_obs[:,1]=0.8 + #reward_obs[0,1:4]=0.7 + #reward_obs[1,3]=0.8 + #reward_obs[2,1:4]=0.7 + #reward_obs[4,2]=0.8 + #reward_obs[5,2:4]=0.8 + + for i in indices_reward: + #print self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6] + obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs + + print indices_agent + for i in indices_agent: + print i + obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs + + #plt.imshow(obs, cmap='gray_r') + #plt.show() + return obs + def inTerminalState(self): +# if((self._pos_agent[0]<=1 and self._cur_action==0) ):#((self._pos_agent==[4,1] and self._cur_action==1) or (self._pos_agent==[5,2] and (self._cur_action==1 or self._cur_action==2)) or (self._pos_agent==[6,3] and self._cur_action==2))): +# #(self._pos_agent[1]>=self._size_maze-2 and self._cur_action==1) ): +# return True return False #if (self._pos_agent==self._pos_goal): # return True From b65a537ba2b98e362bdffd54001eb641f5a0e99b Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 5 Apr 2018 16:05:17 -0400 Subject: [PATCH 42/96] working explo simple laby --- deer/q_networks/NN_keras_lp_high_int_dim.py | 2 +- deer/q_networks/q_net_keras_lp_nstep.py | 727 ++++++++++++++++++++ examples/simplest_test_PLI/test_env4.py | 6 +- 3 files changed, 731 insertions(+), 4 deletions(-) create mode 100644 deer/q_networks/q_net_keras_lp_nstep.py diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index 8cc61248..c05b1735 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -37,7 +37,7 @@ def __init__(self, batch_size, input_dimensions, n_actions, random_state, action # self.internal_dim=input_dimensions[0][-2]*input_dimensions[0][-1] # In the case where the observation is a frame (or an history of frames) self.n_channels_internal_dim=2#dim[0] else: - self.internal_dim=3 #2 for laby + self.internal_dim=2 #2 for laby #3 for catcher def encoder_model(self): diff --git a/deer/q_networks/q_net_keras_lp_nstep.py b/deer/q_networks/q_net_keras_lp_nstep.py new file mode 100644 index 00000000..79828847 --- /dev/null +++ b/deer/q_networks/q_net_keras_lp_nstep.py @@ -0,0 +1,727 @@ +""" +Code for general deep Q-learning using Keras that can take as inputs scalars, vectors and matrices + +.. Author: Vincent Francois-Lavet +""" + +import numpy as np +np.set_printoptions(threshold=np.nan) +from keras.optimizers import SGD,RMSprop +from keras import backend as K +from ..base_classes import QNetwork +from .NN_keras_lp_high_int_dim import NN # Default Neural network used +import tensorflow as tf +config = tf.ConfigProto() +config.gpu_options.allow_growth=True +sess = tf.Session(config=config) + +def mean_squared_error(y_true, y_pred): + return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error + #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) 
) , axis=-1 ) # = mse error + +def exp_dec_error(y_true, y_pred): + return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred + +#def rms_from_squared_components(y_true, y_pred): +# return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 +# +#def squared_error_from_squared_components(y_true, y_pred): +# return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 + +def loss_diff_s_s_(y_true, y_pred): + return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. ) ) ) # tend to increase y_pred --> loss -1 + +class MyQNetwork(QNetwork): + """ + Deep Q-learning network using Keras (with any backend) + + Parameters + ----------- + environment : object from class Environment + rho : float + Parameter for rmsprop. Default : 0.9 + rms_epsilon : float + Parameter for rmsprop. Default : 0.0001 + momentum : float + Default : 0 + clip_delta : float + Not implemented. + freeze_interval : int + Period during which the target network is freezed and after which the target network is updated. Default : 1000 + batch_size : int + Number of tuples taken into account for each iteration of gradient descent. Default : 32 + update_rule: str + {sgd,rmsprop}. Default : rmsprop + random_state : numpy random number generator + double_Q : bool, optional + Activate or not the double_Q learning. + More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. + neural_network : object, optional + default is deer.qnetworks.NN_keras + """ + + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): + """ Initialize environment + + """ + QNetwork.__init__(self,environment, batch_size) + + + self._rho = rho + self._rms_epsilon = rms_epsilon + self._momentum = momentum + self._update_rule = update_rule + self._freeze_interval = freeze_interval + self._double_Q = double_Q + self._random_state = random_state + self.update_counter = 0 + self._high_int_dim = kwargs["high_int_dim"] + self.loss_T2=0 + self.loss_disentangle_t=0 + self.loss_disentangle_a=0 + self.loss_Q=0 + self.loss_disambiguate1=0 + self.loss_disambiguate2=0 + self.nstep=1 # ! If n>1, training Q, also modifies T, R and gamma which leads to higher loss for them. 
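+        # Running sums of the losses for the transition, reward and discount heads
+        # (one entry per n-step horizon); train() accumulates them and prints/resets them every 500 updates.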
+ self.loss_T=np.zeros((self.nstep)) + self.loss_gamma=np.zeros((self.nstep)) + self.lossR=np.zeros((self.nstep)) + + + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim) + + self.encoder = self.learn_and_plan.encoder_model() + self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) + + self.Q = self.learn_and_plan.Q_model() + self.R = self.learn_and_plan.R_model() + self.gamma = self.learn_and_plan.R_model() + self.transition = self.learn_and_plan.transition_model() +# self.transition2 = self.learn_and_plan.transition_model2() + + self.full_Qs=[] + for i in range(self.nstep): + self.full_Qs.append(self.learn_and_plan.full_Q_model(self.encoder,self.Q,i,self.transition,self.R,self.gamma)) + + # used to fit rewards + self.full_Rs=[] + for i in range(self.nstep): + self.full_Rs.append(self.learn_and_plan.full_R_model(self.encoder,self.R,i,self.transition)) + + # used to fit gammas + self.full_gammas=[] + for i in range(self.nstep): + self.full_gammas.append(self.learn_and_plan.full_R_model(self.encoder,self.gamma,i,self.transition)) + + # used to fit transitions + self.diff_Tx_x_s=[] + for i in range(self.nstep): + self.diff_Tx_x_s.append(self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition,i))#full_transition_model(self.encoder,self.transition) + + # constraint on consecutive t + self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) +# self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) + + # used to disentangle actions + self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) + + layers=self.encoder.layers+self.Q.layers+self.R.layers+self.gamma.layers+self.transition.layers + # Grab all the parameters together. + self.params = [ param + for layer in layers + for param in layer.trainable_weights ] + + self._compile() + + self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim) + self.encoder_target = self.learn_and_plan_target.encoder_model() + self.Q_target = self.learn_and_plan_target.Q_model() + self.R_target = self.learn_and_plan_target.R_model() + self.gamma_target = self.learn_and_plan_target.R_model() + self.transition_target = self.learn_and_plan_target.transition_model() + + self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder_target,self.Q_target) # FIXME + self.full_Q_target.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q + + layers=self.encoder_target.layers+self.Q_target.layers+self.R_target.layers+self.gamma_target.layers+self.transition_target.layers + # Grab all the parameters together. + self.params_target = [ param + for layer in layers + for param in layer.trainable_weights ] + + self._resetQHat() + + def getAllParams(self): + params_value=[] + for i,p in enumerate(self.params): + params_value.append(K.get_value(p)) + return params_value + + def setAllParams(self, list_of_values): + for i,p in enumerate(self.params): + K.set_value(p,list_of_values[i]) + + def train(self, observations_val, actions_val, rewards_val, terminals_val): + """ + Train one batch. + + 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared + 2. 
perform batch training + + Parameters + ----------- + observations_val : batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) + actions_val : b x 1 numpy array of integers + rewards_val : b x 1 numpy array + terminals_val : b x 1 numpy boolean array + + Returns + ------- + Average loss of the batch training (RMSE) + Individual (square) losses for each tuple + """ + + onehot_actions = [np.zeros((self._batch_size, self._n_actions)) for n in range(self.nstep)] + for n in range(self.nstep): + onehot_actions[n][np.arange(self._batch_size), actions_val[:,n]] = 1 + onehot_actions_rand = [np.zeros((self._batch_size, self._n_actions)) for n in range(self.nstep)] + for n in range(self.nstep): + onehot_actions_rand[n][np.arange(self._batch_size), np.random.randint(0,self._n_actions,(32))] = 1 + + observations_val=list(observations_val) + states_val_1=[] + states_val=[] + next_states_val=[] + for obs in observations_val: + states_val_1.append(obs[:,0:1]) # t-n+1 + states_val.append(obs[:,self.nstep-1:-1]) # t + next_states_val.append(obs[:,self.nstep:]) # t+1 + Es_=self.encoder.predict(next_states_val) + Es=self.encoder.predict(states_val) + + if(self.update_counter%500==0): + if(self.nstep==2): + Es_1=self.encoder.predict(states_val_1) + ETs_1=self.transition.predict([Es_1,onehot_actions[0]]) # t+1 + ETTs_1=self.transition.predict([ETs_1,onehot_actions[1]]) # t+1 + print "ETTs_1[0]" + print ETTs_1[0] + print "onehot_actions" + print onehot_actions[0][0] + print onehot_actions[1][0] + + + ETs=self.transition.predict([Es,onehot_actions[-1]]) # t+1 + R=self.R.predict([Es[0:1],np.array([[1,0,0,0]])]) # t + R1=self.R.predict([Es[0:1],np.array([[0,1,0,0]])]) # t + R2=self.R.predict([Es[0:1],np.array([[0,0,1,0]])]) # t + gamma=self.gamma.predict([Es,onehot_actions[-1]]) # t + + print "states_val[0][0]" + print states_val[0][0] + #print "len(states_val)" + #print len(states_val) + #print states_val[0].shape + print "next_states_val[0][0]" + print next_states_val[0][0] + print actions_val[0], rewards_val[0], terminals_val[0] + print "Es[0],ETs[0],Es_[0]" + if(Es.ndim==4): + print np.transpose(Es, (0, 3, 1, 2))[0],np.transpose(ETs, (0, 3, 1, 2))[0],np.transpose(Es_, (0, 3, 1, 2))[0] # data_format='channels_last' --> 'channels_first' + else: + print Es[0],ETs[0],Es_[0] + print "R[0]" + print R[0] + print R1[0] + print R2[0] + print "gamma[0]" + print gamma[0] + print "self.full_Qs[0].predict(states_val)[0]" + print self.full_Qs[0].predict(states_val)[0] + print "self.full_Rs[0].predict(states_val)[0]" + print self.full_Rs[0].predict(states_val+[np.repeat(np.array([[1,0,0,0]]),32,axis=0)])[0] + print self.full_Rs[0].predict(states_val+[np.repeat(np.array([[0,1,0,0]]),32,axis=0)])[0] + print self.full_Rs[0].predict(states_val+[np.repeat(np.array([[0,0,1,0]]),32,axis=0)])[0] + + # Fit transition + for n in range(self.nstep): + states_val=[] + for obs in observations_val: + states_val.append(obs[:,-n-2:-n-1]) # t-n + #print self.loss_T + #print self.loss_T[0] + #print self.loss_T[n] + self.loss_T[n]=self.loss_T[n]+self.diff_Tx_x_s[n].train_on_batch(states_val+next_states_val+onehot_actions[-1-n:]+[(1-terminals_val[:,-1])], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) + + # Fit rewards + for n in range(self.nstep): + states_val=[] + for obs in observations_val: + states_val.append(obs[:,-n-2:-n-1]) # t-n + self.lossR[n]+=self.full_Rs[n].train_on_batch(states_val+onehot_actions[-1-n:], rewards_val[:,-1]) + + # Fit gammas + for n in range(self.nstep): + 
states_val=[] + for obs in observations_val: + states_val.append(obs[:,-n-2:-n-1]) # t-n + self.loss_gamma[n]+=self.full_gammas[n].train_on_batch(states_val+onehot_actions[-1-n:], (1-terminals_val[:,-1])*self._df) + + # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 + # reduce the squared value of the abstract features + self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + + # Increase the entropy in the abstract features of two states + # This is done only when states_val is made up of only one observation --> FIXME + rolled=np.roll(states_val[0],1,axis=0) +# for i in range(self._batch_size): +# j=0 +# l=0 +# while((states_val[0][i]==rolled[i+j-l]).all()): +# if(i+j==31): +# l=self._batch_size +# if(j==31): +# break +# j=j+1 +# rolled[i]=rolled[i+j-l] + self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + # + # + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) + # + ## Disentangle actions + self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch(states_val+onehot_actions[-1:]+onehot_actions_rand[-1:], np.ones(self._batch_size)) + +# +# # Loss to have all s' following s,a with a to a distance 1 of s,a) +# tiled_x=np.tile(Es,(self._n_actions,1)) +# tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) +# tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) +# #self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(self._batch_size*self._n_actions)) + + + + if(self.update_counter%500==0): + print "self.loss_Q" + print self.loss_Q + if(self.nstep>1): + print "self.loss_T[0]/100.,self.loss_T[1]/100.,self.lossR[0]/100.,self.lossR[1]/100.,self.loss_gamma[0]/100.,self.loss_gamma[1]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." + print self.loss_T[0]/100.,self.loss_T[1]/100.,self.lossR[0]/100.,self.lossR[1]/100.,self.loss_gamma[0]/100.,self.loss_gamma[1]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. + else: + print "self.loss_T[0]/100.,self.lossR[0]/100.,self.loss_gamma[0]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." + print self.loss_T[0]/100.,self.lossR[0]/100.,self.loss_gamma[0]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. 
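+            # The two learning rates printed below belong to the encoder and encoder_diff optimizers (each head gets its own optimizer in _compile()).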
+ + print K.get_value(self.encoder.optimizer.lr) + print K.get_value(self.encoder_diff.optimizer.lr) + self.loss_T=np.zeros((self.nstep)) + self.loss_T2=0 + self.lossR=np.zeros((self.nstep)) + self.loss_gamma=np.zeros((self.nstep)) + self.loss_Q=0 + + self.loss_disentangle_t=0 + self.loss_disentangle_a=0 + + self.loss_disambiguate1=0 + self.loss_disambiguate2=0 + + print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" + print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) + print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) + + print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) + + print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" + print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) + + + if self.update_counter % self._freeze_interval == 0: + self._resetQHat() + + next_q_vals = self.full_Q_target.predict([next_states_val[0]]) + #next_q_vals = self.qValues_planning(next_states_val, self.R_target, self.transition_target, self.Q_target, d=self.nstep)#self.full_Q_target.predict([next_states_val[0]]) + + if(self._double_Q==True): + next_q_vals_current_qnet=self.full_Qs[0].predict(next_states_val) + argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) + max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) + else: + max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) + + not_terminals=np.ones_like(terminals_val) - terminals_val + + target = rewards_val[:,-1] + not_terminals[:,-1] * self._df * max_next_q_vals.reshape((-1)) + + + q_vals=[] + for n in range(self.nstep): + states_val=[] + for obs in observations_val: + states_val.append(obs[:,-n-2:-n-1]) # t + q_vals.append(self.full_Qs[n].predict(states_val+onehot_actions[-1-n:-1])) + + # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff + # FIXME for all n + q_val=q_vals[0][np.arange(self._batch_size), actions_val[:,0]] + diff = - q_val + target + loss_ind=pow(diff,2) + + for n in range(self.nstep): + q_vals[n][ np.arange(self._batch_size), actions_val[:,-1] ] = target + + # Is it possible to use something more flexible than this? + # Only some elements of next_q_vals are actual value that I target. + # My loss should only take these into account. + # Workaround here is that many values are already "exact" in this update + + #print "q_vals" + #print q_vals[0][0],q_vals[1][0] + loss=0 + for n in range(self.nstep): + states_val=[] + for obs in observations_val: + states_val.append(obs[:,-n-2:-n-1]) # t-n + loss+=self.full_Qs[n].train_on_batch(states_val+onehot_actions[-1-n:-1] , q_vals[n] ) + self.loss_Q+=loss + + if(self.update_counter%100==0): + print self.update_counter + + self.update_counter += 1 + + # loss*self._n_actions = np.average(loss_ind) + return np.sqrt(loss),loss_ind + + +# def train_model(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): +# """ +# Train the model based part +# +# 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared +# 2. 
perform batch training +# +# Parameters +# ----------- +# states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) +# actions_val : b x 1 numpy array of integers +# rewards_val : b x 1 numpy array +# next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) +# terminals_val : b x 1 numpy boolean array +# +# Returns +# ------- +# Average loss of the batch training (RMSE) +# Individual (square) losses for each tuple +# """ +# +# onehot_actions = np.zeros((self._batch_size, self._n_actions)) +# onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 +# Es_=self.encoder.predict([next_states_val[0]]) +# Es=self.encoder.predict([states_val[0]]) +# ETs=self.transition.predict([Es,onehot_actions]) +# +## if(self.update_counter>3000): +# self.loss_T2=self.transition2.train_on_batch([Es,onehot_actions], Es_) +## if(self.update_counter%100==0): +## loss=0. +## for i in range (100): +## loss+=self.transition2.train_on_batch([Es,onehot_actions], Es_) +## if(i%10==0): +## print "loss/(i+1)" +## print loss/(i+1) +## print "loss/100." +## print loss/100. +# #print K.get_value(self.transition2.optimizer.lr) +# #print [ K.get_value(param) +# # for layer in self.encoder.layers +# # for param in layer.trainable_weights ][0][0] +# return self.loss_T2 + + + + def qValues(self, state_val): + """ Get the q values for one belief state (without planning) + + Arguments + --------- + state_val : one belief state + + Returns + ------- + The q values for the provided belief state + """ + #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] + return self.full_Qs[0].predict([np.expand_dims(state,axis=0) for state in state_val])[0] + + def qValues_planning(self, state_val, R, gamma, T, Q, d=5): + """ Get the q values for one belief state with a planning depth d + + Arguments + --------- + state_val : one belief state + d : planning depth + + Returns + ------- + The q values with planning depth d for the provided belief state + """ + #print "state_val[0]" + #print state_val[0] + #print len(state_val) + print "state_val[0][0]" + print state_val[0][0] + print state_val[0].shape + print "self.full_Qs[0].predict(state_val)[0]" + print self.full_Qs[0].predict(state_val)[0] + encoded_x = self.encoder.predict(state_val) + ## DEBUG PURPOSES + print "encoded_x[0]" + print encoded_x[0] + + identity_matrix = np.diag(np.ones(self._n_actions)) + if(encoded_x.ndim==2): + tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) + elif(encoded_x.ndim==4): + tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) + else: + print ("error") + + repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) + ##print tile3_encoded_x + ##print repeat_identity + r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) + #print "r_vals_d0" + #print r_vals_d0 + r_vals_d0=r_vals_d0.flatten() + print "r_vals_d0" + print r_vals_d0 + next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) + print "next_x_predicted" + print next_x_predicted + next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + #print "next_x_predicted action 0 t2" + #print next_x_predicted + next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + #print "next_x_predicted action 0 t3" + #print next_x_predicted + next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + print "next_x_predicted action 0 t4" + 
print next_x_predicted + ## END DEBUG PURPOSES + + QD_plan=0 + for i in range(d+1): #TO DO: improve planning algorithm + #print encoded_x + Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=2).reshape(len(encoded_x),-1) + print "Qd,i" + print Qd,i + QD_plan+=Qd + QD_plan=QD_plan/(d+1) + + print "QD_plan" + print QD_plan + + return QD_plan + +# def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): +# """ +# """ +# branching_factor=self._n_actions #TO IMPROVE, use MCTS, etc... +# n=len(state_abstr_val) +# identity_matrix = np.diag(np.ones(self._n_actions)) +# +# this_branching_factor=branching_factor +# +# if (d==0): +# return Q.predict([state_abstr_val]) # no change in the order of the actions +# else: +# # All actions are considered in the tree +# repeat_identity=np.repeat(identity_matrix,len(state_abstr_val),axis=0) # no change in the order of the actions +# if(state_abstr_val.ndim==2): +# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1)) +# elif(state_abstr_val.ndim==4): +# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1,1,1)) +# else: +# print ("error") +# +# #print tile3_encoded_x +# #print repeat_identity +# r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) +# #print "r_vals_d0" +# #print r_vals_d0 +# r_vals_d0=r_vals_d0.flatten() +# +# gamma_vals_d0=np.array(gamma.predict([tile3_encoded_x,repeat_identity])) +# #print "r_vals_d0" +# #print r_vals_d0 +# gamma_vals_vals_d0=gamma_vals_d0.flatten() +# +# next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) +# return r_vals_d0+gamma_vals_vals_d0*np.amax(self.qValues_planning_abstr(next_x_predicted,R,gamma,T,Q,d=d-1,branching_factor=branching_factor).reshape(len(state_abstr_val)*this_branching_factor,branching_factor),axis=1).flatten() + + + def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): + """ + """ + if(branching_factor==None or branching_factor>self._n_actions): + branching_factor=self._n_actions + #print "qValues_planning_abstr d" + #print d + n=len(state_abstr_val) + identity_matrix = np.diag(np.ones(self._n_actions)) + + if (n==1): + this_branching_factor=self._n_actions + else: + this_branching_factor=branching_factor + + if (d==0): + if(this_branching_factor0): + # We use the mode to define the planning depth + q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=mode*3)#self.qValues(state)# + else: + q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=0) + return np.argmax(q_vals),np.max(q_vals) + + def _compile(self): + """ compile self.q_vals + """ + if (self._update_rule=="sgd"): + optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False) + elif (self._update_rule=="rmsprop"): + optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon) + else: + raise Exception('The update_rule '+self._update_rule+' is not implemented.') + + optimizer1=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # Different optimizers for each network; otherwise not possible to modify each + optimizer2=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately (e.g. 
lr) + optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer7=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + + for i in range(self.nstep): + #for l in self.R.layers+self.gamma.layers+self.transition.layers: + # l.trainable=False + self.full_Qs[i].compile(optimizer=optimizer, loss='mse') + #for l in self.R.layers+self.gamma.layers: + # l.trainable=True + self.full_Rs[i].compile(optimizer=optimizer3, loss='mse') # Fit rewards + self.full_gammas[i].compile(optimizer=optimizer3, loss='mse') # Fit gammas + #for l in self.transition.layers: + # l.trainable=True + self.diff_Tx_x_s[i].compile(optimizer=optimizer1, loss='mse') # Fit transitions + +# self.transition2.compile(optimizer=optimizer2, loss='mse') # Fit accurate transitions without encoders + + self.encoder.compile(optimizer=optimizer4, + loss=mean_squared_error) + self.encoder_diff.compile(optimizer=optimizer5, + loss=exp_dec_error) + #metrics=['accuracy']) + + self.diff_s_s_.compile(optimizer=optimizer6, + loss=exp_dec_error)#'mse')#loss_diff_s_s_) + #metrics=['accuracy']) + + self.diff_sa_sa.compile(optimizer=optimizer7, + loss=exp_dec_error)#loss_diff_s_s_) + +# self.diff_Tx.compile(optimizer=optimizer, +# loss=mean_squared_error) +# #metrics=['accuracy']) + + def _resetQHat(self): + for i,(param,param_target) in enumerate(zip(self.params, self.params_target)): + K.set_value(param_target,K.get_value(param)) + + def setLearningRate(self, lr): + """ Setting the learning rate + + Parameters + ----------- + lr : float + The learning rate that has to be set + """ + self._lr = lr + print "modif lr" + # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) + for i in range(self.nstep): + K.set_value(self.full_Qs[i].optimizer.lr, self._lr) + K.set_value(self.full_Rs[i].optimizer.lr, self._lr) + K.set_value(self.full_gammas[i].optimizer.lr, self._lr) + K.set_value(self.diff_Tx_x_s[i].optimizer.lr, self._lr) + +# K.set_value(self.transition2.optimizer.lr, self._lr/2.) + + K.set_value(self.encoder.optimizer.lr, self._lr) + K.set_value(self.encoder_diff.optimizer.lr, self._lr) + + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) # /10. for simple laby; /2 for distrib of laby + K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) # /10. for simple laby; /2 for distrib of laby +# K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
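An illustrative restatement of the planning step shown above, added as an editor's sketch rather than as part of the patch: qValues_planning averages the estimates returned by qValues_planning_abstr for every depth from 0 to d, and each recursion level combines the learned reward, discount and transition models with the model-free Q head. The callables r_model, gamma_model, t_model and q_model below are assumptions that stand in for the .predict calls of the Keras networks R, gamma, T and Q, and the per-depth branching_factor pruning of the real code is omitted here, so every action is expanded at every level.

import numpy as np

def q_values_planning_abstr(x, r_model, gamma_model, t_model, q_model, d):
    # Depth-d Q estimate for a batch of abstract states x (shape: batch x dim).
    # r_model(x, a_onehot) and gamma_model(x, a_onehot) return (batch,) arrays,
    # t_model(x, a_onehot) returns next abstract states, q_model(x) returns (batch, n_actions).
    q0 = q_model(x)
    if d == 0:
        return q0                                   # bootstrap with the model-free Q head
    n_actions = q0.shape[1]
    q_vals = np.empty_like(q0)
    for a in range(n_actions):
        a_onehot = np.zeros((len(x), n_actions))
        a_onehot[:, a] = 1.
        r = r_model(x, a_onehot)                    # predicted reward for (x, a)
        g = gamma_model(x, a_onehot)                # predicted discount for (x, a)
        x_next = t_model(x, a_onehot)               # predicted next abstract state
        q_next = q_values_planning_abstr(x_next, r_model, gamma_model, t_model, q_model, d - 1)
        q_vals[:, a] = r + g * np.max(q_next, axis=1)
    return q_vals

def q_values_planning(x, r_model, gamma_model, t_model, q_model, d=5):
    # Average the estimates over depths 0..d, as QD_plan does above.
    q_sum = sum(q_values_planning_abstr(x, r_model, gamma_model, t_model, q_model, i)
                for i in range(d + 1))
    return q_sum / (d + 1)

Averaging over all depths up to d, as QD_plan does, keeps the short-rollout estimates in the mix for the cases where the learned model is less reliable far ahead.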
diff --git a/examples/simplest_test_PLI/test_env4.py b/examples/simplest_test_PLI/test_env4.py index 3c72a091..e0f10d50 100644 --- a/examples/simplest_test_PLI/test_env4.py +++ b/examples/simplest_test_PLI/test_env4.py @@ -497,9 +497,9 @@ def get_higher_dim_obs(self,indices_agent,indices_reward): def inTerminalState(self): -# if((self._pos_agent[0]<=1 and self._cur_action==0) ):#((self._pos_agent==[4,1] and self._cur_action==1) or (self._pos_agent==[5,2] and (self._cur_action==1 or self._cur_action==2)) or (self._pos_agent==[6,3] and self._cur_action==2))): -# #(self._pos_agent[1]>=self._size_maze-2 and self._cur_action==1) ): -# return True + if((self._pos_agent[0]<=1 and self._cur_action==0) ):#((self._pos_agent==[4,1] and self._cur_action==1) or (self._pos_agent==[5,2] and (self._cur_action==1 or self._cur_action==2)) or (self._pos_agent==[6,3] and self._cur_action==2))): + #(self._pos_agent[1]>=self._size_maze-2 and self._cur_action==1) ): + return True return False #if (self._pos_agent==self._pos_goal): # return True From 4cbc7f0d935282e44812a6e635c53470f253f2b8 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 16 Apr 2018 16:54:20 -0400 Subject: [PATCH 43/96] introduce force_features and beginning transfer --- deer/q_networks/NN_keras_lp_high_int_dim.py | 150 +++++++++++++++----- deer/q_networks/q_net_keras_lp_nstep.py | 81 ++++++++--- examples/simplest_test_PLI/run_test3.py | 89 +++++++++++- examples/simplest_test_PLI/run_test4.py | 5 +- examples/simplest_test_PLI/test_env3.py | 21 ++- 5 files changed, 279 insertions(+), 67 deletions(-) diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index c05b1735..7580f1c6 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -26,19 +26,18 @@ class NN(): high_int_dim : Boolean Whether the abstract state should be high dimensional in the form of frames/vectors or whether it should be low-dimensional """ - def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False, high_int_dim=False): + def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False, **kwargs): self._input_dimensions=input_dimensions self._batch_size=batch_size self._random_state=random_state self._n_actions=n_actions self._action_as_input=action_as_input - self._high_int_dim=high_int_dim - if(high_int_dim==True): -# self.internal_dim=input_dimensions[0][-2]*input_dimensions[0][-1] # In the case where the observation is a frame (or an history of frames) - self.n_channels_internal_dim=2#dim[0] + self._high_int_dim=kwargs["high_int_dim"] + if(self._high_int_dim==True): + self.n_channels_internal_dim=kwargs["internal_dim"] #dim[0] else: - self.internal_dim=2 #2 for laby - #3 for catcher + self.internal_dim=kwargs["internal_dim"] #2 for laby + #3 for catcher def encoder_model(self): """ @@ -325,15 +324,15 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): return model - def diff_s_s_(self,encoder_model): + def force_features(self,encoder_model,transition_model,plan_depth=0): """ - Used to force some state representation to be sufficiently different + Used to force some transitions'directions Parameters ----------- s a - random z + s' Returns ------- @@ -341,35 +340,118 @@ def diff_s_s_(self,encoder_model): """ inputs=[] + for i, dim in enumerate(self._input_dimensions): + if len(dim) == 3: + input = Input(shape=(dim[0],dim[1],dim[2])) + inputs.append(input) - for j in range(2): - 
for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) + elif len(dim) == 2: + input = Input(shape=(dim[0],dim[1])) + inputs.append(input) + + else: + input = Input(shape=(dim[0],)) + inputs.append(input) + + enc_x = encoder_model(inputs[:]) #s --> x + + Tx= enc_x + for d in range(plan_depth+1): + inputs.append(Input(shape=(self._n_actions,))) + Tx= transition_model([Tx,inputs[-1]]) + + print "Tx._keras_shape" + print Tx._keras_shape + +# input = Input(shape=(self.internal_dim,self._n_actions)) +# inputs.append(input) +# +# #if(high_int_dim==True): +# # Tx_tiled=K.tile(Tx,(self._n_actions,1,1,1)) +# #else: +# # Tx_tiled=K.tile(Tx,(self._n_actions,1)) +# +# for i in range self._n_actions: +# #constants = np.zeros((self._n_actions)) +# #k_constants = K.variable(constants) +# #fixed_input = Input(tensor=k_constants) +# Tx= transition_model([Tx,constants]) +# Tx_tiled=Dot(axes=(-1))([Tx,fixed_input]) +# +# print "Tx_tiled._keras_shape" +# print Tx_tiled._keras_shape - else: - input = Input(shape=(dim[0],)) - inputs.append(input) + diff_features = Subtract()([Tx,enc_x]) # Modification of the features after (sequence of) action(s) + + #print "K.eval(diff_features)" + #print diff_features.output + #inputs.append(Input(shape=(self.internal_dim,))) + #cos_proxi=Dot(axes=(-1),normalize=True)([diff_features,inputs[-1]]) # Cosine proximity between diff_features and target_modif_features - half = len(inputs)/2 - enc_x = encoder_model(inputs[:half]) #s --> x #FIXME - enc_x_ = encoder_model(inputs[half:]) #s --> x + #constants = np.ones((self.internal_dim,))#((self._batch_size*self._n_actions,self.internal_dim,)) + #k_constants = K.variable(constants) + #fixed_input = Input(tensor=k_constants) + #inputs.append(fixed_input) + #print "fixed_input._keras_shape" + #print fixed_input._keras_shape + #cos_proxi_add1=Subtract()([fixed_input,cos_proxi]) - if (self._high_int_dim==True): - x = Subtract()([Flatten()(enc_x),Flatten()(enc_x_)]) - else: - x = Subtract()([enc_x,enc_x_]) - x = Dot(axes=-1, normalize=False)([x,x]) + #print "cos_proxi.output" + #print cos_proxi.output + #print "cos_proxi._keras_shape" + #print cos_proxi._keras_shape - model = Model(inputs=inputs, outputs=x ) + model = Model(inputs=inputs, outputs=diff_features ) return model + +# def diff_s_s_(self,encoder_model): +# """ +# Used to force some state representation to be sufficiently different +# +# Parameters +# ----------- +# s +# a +# random z +# +# Returns +# ------- +# model with output Tx (= model estimate of x') +# +# """ +# inputs=[] +# +# for j in range(2): +# for i, dim in enumerate(self._input_dimensions): +# if len(dim) == 3: +# input = Input(shape=(dim[0],dim[1],dim[2])) +# inputs.append(input) +# +# elif len(dim) == 2: +# input = Input(shape=(dim[0],dim[1])) +# inputs.append(input) +# +# else: +# input = Input(shape=(dim[0],)) +# inputs.append(input) +# +# half = len(inputs)/2 +# enc_x = encoder_model(inputs[:half]) #s --> x #FIXME +# enc_x_ = encoder_model(inputs[half:]) #s --> x +# +# if (self._high_int_dim==True): +# enc_x=Flatten()(enc_x) +# enc_x_=Flatten()(enc_x_) +# x = Subtract()([enc_x,enc_x_]) +# +# #x = Dot(axes=-1, normalize=False)([x,x]) +# +# model = Model(inputs=inputs, outputs=x ) +# +# return model + def diff_sa_sa(self,encoder_model,transition_model): """ @@ -416,9 +498,9 @@ def diff_sa_sa(self,encoder_model,transition_model): x = 
Subtract()([Tx,rand_Tx]) print "x._keras_shape" print x._keras_shape - x = Dot(axes=-1, normalize=False)([x,x]) - print "x._keras_shape" - print x._keras_shape + #x = Dot(axes=-1, normalize=False)([x,x]) + #print "x._keras_shape" + #print x._keras_shape model = Model(inputs=inputs, outputs=x ) diff --git a/deer/q_networks/q_net_keras_lp_nstep.py b/deer/q_networks/q_net_keras_lp_nstep.py index 79828847..b58c9012 100644 --- a/deer/q_networks/q_net_keras_lp_nstep.py +++ b/deer/q_networks/q_net_keras_lp_nstep.py @@ -22,6 +22,11 @@ def mean_squared_error(y_true, y_pred): def exp_dec_error(y_true, y_pred): return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred +def cosine_proximity2(y_true, y_pred): + y_true = K.l2_normalize(y_true[:,0:2], axis=-1) + y_pred = K.l2_normalize(y_pred[:,0:2], axis=-1) + return -K.sum(y_true * y_pred, axis=-1) + #def rms_from_squared_components(y_true, y_pred): # return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 # @@ -75,7 +80,9 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._double_Q = double_Q self._random_state = random_state self.update_counter = 0 - self._high_int_dim = kwargs["high_int_dim"] + self._high_int_dim = kwargs.get('high_int_dim',False) + self._internal_dim = kwargs.get('internal_dim',2) + self.loss_interpret=0 self.loss_T2=0 self.loss_disentangle_t=0 self.loss_disentangle_a=0 @@ -88,7 +95,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.lossR=np.zeros((self.nstep)) - self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim) + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) self.encoder = self.learn_and_plan.encoder_model() self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) @@ -118,8 +125,12 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de for i in range(self.nstep): self.diff_Tx_x_s.append(self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition,i))#full_transition_model(self.encoder,self.transition) + # used to force features variations + if(self._high_int_dim==False): + self.force_features=self.learn_and_plan.force_features(self.encoder,self.transition) + # constraint on consecutive t - self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) + self.diff_s_s_ = self.learn_and_plan.encoder_diff_model(self.encoder)#diff_s_s_(self.encoder) # self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) # used to disentangle actions @@ -133,7 +144,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._compile() - self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim) + self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) self.encoder_target = self.learn_and_plan_target.encoder_model() self.Q_target = self.learn_and_plan_target.Q_model() self.R_target = self.learn_and_plan_target.R_model() @@ -253,6 +264,21 @@ def train(self, observations_val, actions_val, rewards_val, terminals_val): 
#print self.loss_T[n] self.loss_T[n]=self.loss_T[n]+self.diff_Tx_x_s[n].train_on_batch(states_val+next_states_val+onehot_actions[-1-n:]+[(1-terminals_val[:,-1])], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) + # Interpretable AI + if(self._high_int_dim==False): + target_modif_features=np.zeros((self._n_actions,self._internal_dim)) + target_modif_features[0,0]=1 # dir + target_modif_features[1,0]=-1 # opposite dir + target_modif_features[0:2,1]=1 # temps + target_modif_features=np.repeat(target_modif_features,self._batch_size,axis=0) + states_val_tiled=[] + for obs in observations_val: + states_val_tiled.append(np.tile(obs[:,-2:-1],(self._n_actions,1,1,1))) + onehot_actions_tiled = np.diag(np.ones(self._n_actions))#np.zeros((self._batch_size*self._n_actions, self._n_actions)) + onehot_actions_tiled = np.repeat(onehot_actions_tiled,self._batch_size,axis=0) + + self.loss_interpret+=self.force_features.train_on_batch(states_val_tiled+[onehot_actions_tiled], target_modif_features) + # Fit rewards for n in range(self.nstep): states_val=[] @@ -267,8 +293,7 @@ def train(self, observations_val, actions_val, rewards_val, terminals_val): states_val.append(obs[:,-n-2:-n-1]) # t-n self.loss_gamma[n]+=self.full_gammas[n].train_on_batch(states_val+onehot_actions[-1-n:], (1-terminals_val[:,-1])*self._df) - # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 - # reduce the squared value of the abstract features + # Loss to ensure limited volume in abstract state space self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) # Increase the entropy in the abstract features of two states @@ -284,13 +309,14 @@ def train(self, observations_val, actions_val, rewards_val, terminals_val): # break # j=j+1 # rolled[i]=rolled[i+j-l] + # Loss to ensure entropy in abstract state space self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) # # - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1)))#np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) # ## Disentangle actions - self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch(states_val+onehot_actions[-1:]+onehot_actions_rand[-1:], np.ones(self._batch_size)) + self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch(states_val+onehot_actions[-1:]+onehot_actions_rand[-1:], np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.ones(self._batch_size)) # # # Loss to have all s' following s,a with a to a distance 1 of s,a) @@ -310,10 +336,14 @@ def train(self, observations_val, actions_val, rewards_val, terminals_val): else: print "self.loss_T[0]/100.,self.lossR[0]/100.,self.loss_gamma[0]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." print self.loss_T[0]/100.,self.lossR[0]/100.,self.loss_gamma[0]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. + if(self._high_int_dim==False): + print "self.loss_interpret/100." + print self.loss_interpret/100. 
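# --- Editor's sketch (illustration only, not part of the patch) ---------------
# The loss_interpret term trained above pushes the change of the abstract
# features, T(E(s), a) - E(s), to align with a hand-chosen direction per action
# (target_modif_features, e.g. +1/-1 on the first feature for the two catcher
# actions).  cosine_proximity2 only looks at the first two feature dimensions.
# A NumPy restatement of that loss with hypothetical example values:
import numpy as np

def cosine_proximity2_np(y_true, y_pred, eps=1e-8):
    # negative cosine similarity on the first two components, per batch element
    t = y_true[:, 0:2]
    p = y_pred[:, 0:2]
    t = t / (np.linalg.norm(t, axis=-1, keepdims=True) + eps)
    p = p / (np.linalg.norm(p, axis=-1, keepdims=True) + eps)
    return -np.sum(t * p, axis=-1)

target = np.array([[ 1., 0., 0.],                 # action 0 should increase feature 0
                   [-1., 0., 0.]])                # action 1 should decrease feature 0
predicted_change = np.array([[0.8, 0.1, 0.3],     # hypothetical T(E(s),a) - E(s)
                             [0.5, 0.0, 0.2]])
print(cosine_proximity2_np(target, predicted_change))
# -> approximately [-0.99, 1.0]: the first transition is well aligned (low loss),
#    the second moves feature 0 the wrong way and receives the maximal loss.
# -------------------------------------------------------------------------------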
print K.get_value(self.encoder.optimizer.lr) print K.get_value(self.encoder_diff.optimizer.lr) self.loss_T=np.zeros((self.nstep)) + self.loss_interpret=0 self.loss_T2=0 self.lossR=np.zeros((self.nstep)) self.loss_gamma=np.zeros((self.nstep)) @@ -498,17 +528,13 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): print "r_vals_d0" print r_vals_d0 next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) - print "next_x_predicted" - print next_x_predicted - next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - #print "next_x_predicted action 0 t2" + #print "next_x_predicted" #print next_x_predicted next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - #print "next_x_predicted action 0 t3" - #print next_x_predicted next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - print "next_x_predicted action 0 t4" - print next_x_predicted + next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + #print "next_x_predicted action 0 t4" + #print next_x_predicted ## END DEBUG PURPOSES QD_plan=0 @@ -664,6 +690,7 @@ def _compile(self): optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer7=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer8=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) for i in range(self.nstep): #for l in self.R.layers+self.gamma.layers+self.transition.layers: @@ -677,6 +704,9 @@ def _compile(self): # l.trainable=True self.diff_Tx_x_s[i].compile(optimizer=optimizer1, loss='mse') # Fit transitions + if(self._high_int_dim==False): + self.force_features.compile(optimizer=optimizer8, + loss=cosine_proximity2) # self.transition2.compile(optimizer=optimizer2, loss='mse') # Fit accurate transitions without encoders self.encoder.compile(optimizer=optimizer4, @@ -719,9 +749,24 @@ def setLearningRate(self, lr): # K.set_value(self.transition2.optimizer.lr, self._lr/2.) + if(self._high_int_dim==False): + K.set_value(self.force_features.optimizer.lr, self._lr) + K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/10.) # /10. for simple laby; /2 for distrib of laby - K.set_value(self.diff_sa_sa.optimizer.lr, self._lr/10.) # /10. for simple laby; /2 for distrib of laby + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby + K.set_value(self.diff_sa_sa.optimizer.lr, 0) # 0 ! # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) + + def transfer(self, original, transfer, epochs=1): + # First, make sure that the target network and the current network are the same + self._resetQHat() + + # + x_original=self.encoder_target.predict(original)#[0] + print x_original + for i in range(epochs): + print self.encoder.train_on_batch(transfer , x_original ) + print self.encoder.train_on_batch(original , x_original ) + diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_test3.py index 9cd9bf45..c64cf632 100644 --- a/examples/simplest_test_PLI/run_test3.py +++ b/examples/simplest_test_PLI/run_test3.py @@ -1,6 +1,5 @@ -"""ALE launcher. See Wiki for more details about this experiment. 
+"""Catcher -Authors: Vincent Francois-Lavet, David Taralla """ import sys @@ -11,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork +from deer.q_networks.q_net_keras_lp_nstep import MyQNetwork from test_env3 import MyEnv as test_env import deer.experiment.base_controllers as bc @@ -23,7 +22,7 @@ class Defaults: # Experiment Parameters # ---------------------- STEPS_PER_EPOCH = 5000 - EPOCHS = 50 + EPOCHS = 10#50 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 @@ -36,7 +35,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0005 + LEARNING_RATE = 0.0002 LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 @@ -68,7 +67,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env(rng, higher_dim_obs=True) + env = test_env(rng, higher_dim_obs=True, reverse=False) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -80,7 +79,9 @@ class Defaults: parameters.freeze_interval, parameters.batch_size, parameters.update_rule, - rng) + rng, + high_int_dim=False, + internal_dim=3) test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) @@ -170,6 +171,80 @@ class Defaults: agent.gathering_data=False agent.run(parameters.epochs, parameters.steps_per_epoch) + + + rand_ind=np.random.random_integers(0,20000,10) + original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(10)]) for o in range(1)] + transfer=[np.array([[1-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(10)]) for o in range(1)] + + print "original, transfer" + print original, transfer + + # Transfer between the two repr + qnetwork.transfer(original, transfer, 100) + + + # --- Instantiate environment --- + env = test_env(rng, higher_dim_obs=False, reverse=True) + + # --- Instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a + # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want + # these validation epoch to interfere with the training of the agent, which is well established by the + # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole + # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the + # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards + # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every + # [parameters.period_btw_summary_perfs] *validation* epochs. 
+ agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[],#[0, 1, 2, 3, 4], + periodicity=1, + show_score=True, + summarize_every=1)) + + + agent.gathering_data=False + agent.run(parameters.epochs, parameters.steps_per_epoch) + + + #print "agent.DataSet.self._terminals" + #print "agent._dataset.terminals()" + #print agent._dataset.terminals() + #print agent._dataset._terminals._data[0:2000] + #print agent._dataset._actions._data[0:2000] +# r=agent._dataset._rewards._data[0:2000] +# print "r before" +# print r + print agent._dataset._observations[0]._data[0:10] +# ind=np.argwhere(r>0) +# print "agent._dataset._observations[0]._data[ind[0]]" +# print agent._dataset._observations[0]._data[ind[0]] +# print ind +# agent._dataset._rewards._data=np.delete(agent._dataset._rewards._data,ind) +# agent._dataset._terminals._data=np.delete(agent._dataset._terminals._data,ind) +# agent._dataset._actions._data=np.delete(agent._dataset._actions._data,ind) +# agent._dataset._observations[0]._data=np.delete(agent._dataset._observations[0]._data,ind,axis=0) +# r=agent._dataset._rewards._data[0:2000] +# print "r after" +# print r +# print "agent._dataset._observations[0]._data[ind[0]] after" +# print agent._dataset._observations[0]._data[ind[0]] +# + + + + # --- Show results --- basename = "scores/" + fname scores = joblib.load(basename + "_scores.jldump") diff --git a/examples/simplest_test_PLI/run_test4.py b/examples/simplest_test_PLI/run_test4.py index 65556605..0c711b01 100644 --- a/examples/simplest_test_PLI/run_test4.py +++ b/examples/simplest_test_PLI/run_test4.py @@ -36,7 +36,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 #0.0001 for high_int_dim, 0.00002 for low_int_dim + LEARNING_RATE = 0.0002 LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 @@ -81,7 +81,8 @@ class Defaults: parameters.batch_size, parameters.update_rule, rng, - high_int_dim=False) + high_int_dim=False, + internal_dim=2) test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) 
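As a reading aid for the transfer experiment above (an editor's sketch, not part of the patch): run_test3.py builds an original batch from the replay memory and a transfer batch with inverted pixel values, then calls qnetwork.transfer(original, transfer, 100) before switching to the reversed environment. The transfer method introduced in this commit freezes a reference encoding by predicting with the target encoder and then fits the trainable encoder so that both versions of the observations map onto those same abstract states. The toy dense encoders below are assumptions standing in for the convolutional encoder / encoder_target pair.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

encoder = Sequential([Dense(16, activation='tanh', input_shape=(64,)),
                      Dense(3)])                      # trainable encoder E
encoder.compile(optimizer='rmsprop', loss='mse')
encoder_target = Sequential([Dense(16, activation='tanh', input_shape=(64,)),
                             Dense(3)])               # frozen reference copy
encoder_target.set_weights(encoder.get_weights())

original = np.random.rand(32, 64)                     # observations from the replay memory
transferred = 1. - original                           # same scenes with inverted pixels

x_original = encoder_target.predict(original)         # reference abstract states
for _ in range(100):                                  # as in qnetwork.transfer(..., 100)
    encoder.train_on_batch(transferred, x_original)   # map inverted obs to the same x
    encoder.train_on_batch(original, x_original)      # keep the original mapping intact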
diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/test_env3.py index fcce67e4..1dba7cc7 100644 --- a/examples/simplest_test_PLI/test_env3.py +++ b/examples/simplest_test_PLI/test_env3.py @@ -29,6 +29,7 @@ def __init__(self, rng, **kwargs): self._width_paddle=1 self._nx_block=2 #number of different x positions of the falling blocks self._higher_dim_obs=kwargs["higher_dim_obs"] + self._reverse=kwargs["reverse"] if(self._nx_block==1): self._x_block=self._width//2 @@ -109,8 +110,8 @@ def summarizePerformance(self, test_data_set, learning_algo): print "learning_algo.encoder.predict(all_possib_inp)" print all_possib_abs_states - print "print test_data_set.observations()" - print test_data_set.observations() + #print "print test_data_set.observations()" + #print test_data_set.observations() n=self._height-1 historics=[] for i,observ in enumerate(test_data_set.observations()[0][0:n]): @@ -150,6 +151,10 @@ def summarizePerformance(self, test_data_set, learning_algo): fig = plt.figure() ax = fig.add_subplot(111,projection='3d') + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') + ax.set_zlabel(r'$X_3$') + for j in range(3): # Plot the trajectory for i in xrange(n-1): @@ -274,8 +279,10 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') cb1.set_label('Estimated expected return') - plt.show() - plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') + #plt.show() + for ii in xrange(-15,345,30): + ax.view_init(elev=20., azim=ii) + plt.savefig('fig_w_V_div5'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') # fig_visuV @@ -388,8 +395,10 @@ def get_observation(self,y,x_block,x): obs[y_t-2:y_t+3,x_block_t-3:x_block_t+4]=ball obs[3:6,x_t-3:x_t+4]=paddle - plt.imshow(np.flip(obs,axis=0), cmap='gray_r') - plt.show() + if(self._reverse==True): + obs=1-obs + #plt.imshow(np.flip(obs,axis=0), cmap='gray_r') + #plt.show() return obs From a5100edf20972c0487fe2afd9acf378102e3a082 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 18 Apr 2018 22:20:25 -0400 Subject: [PATCH 44/96] modifs --- deer/q_networks/q_net_keras_lp_nstep.py | 5 +++-- examples/simplest_test_PLI/run_test3.py | 3 ++- examples/simplest_test_PLI/test_env3.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/deer/q_networks/q_net_keras_lp_nstep.py b/deer/q_networks/q_net_keras_lp_nstep.py index b58c9012..76a755d1 100644 --- a/deer/q_networks/q_net_keras_lp_nstep.py +++ b/deer/q_networks/q_net_keras_lp_nstep.py @@ -16,7 +16,8 @@ sess = tf.Session(config=config) def mean_squared_error(y_true, y_pred): - return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error + return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error + #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) ) , axis=-1 ) # = mse error def exp_dec_error(y_true, y_pred): @@ -750,7 +751,7 @@ def setLearningRate(self, lr): # K.set_value(self.transition2.optimizer.lr, self._lr/2.) 
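# --- Editor's sketch (illustration only, not part of the patch) ---------------
# Pattern used by setLearningRate above: every sub-model keeps its own optimizer
# instance so the learning rates can later be rescaled in place with K.set_value
# instead of recompiling (recompiling repeatedly is noted in setLearningRate's
# comments elsewhere in this file to leak memory).  Toy example with two
# independently scheduled models:
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop

model_a = Sequential([Dense(1, input_shape=(4,))])
model_b = Sequential([Dense(1, input_shape=(4,))])
model_a.compile(optimizer=RMSprop(lr=1e-3), loss='mse')   # separate optimizers so each
model_b.compile(optimizer=RMSprop(lr=1e-3), loss='mse')   # learning rate can be set alone

new_lr = 2e-4
K.set_value(model_a.optimizer.lr, new_lr)         # full rate for the main loss
K.set_value(model_b.optimizer.lr, new_lr / 5.)    # reduced rate for an auxiliary loss
print(K.get_value(model_a.optimizer.lr))
# -------------------------------------------------------------------------------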
if(self._high_int_dim==False): - K.set_value(self.force_features.optimizer.lr, self._lr) + K.set_value(self.force_features.optimizer.lr, self._lr*0.75) K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_test3.py index c64cf632..65272950 100644 --- a/examples/simplest_test_PLI/run_test3.py +++ b/examples/simplest_test_PLI/run_test3.py @@ -67,7 +67,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env(rng, higher_dim_obs=True, reverse=False) + env = test_env(rng, higher_dim_obs=False, reverse=False) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -80,6 +80,7 @@ class Defaults: parameters.batch_size, parameters.update_rule, rng, + double_Q=True, high_int_dim=False, internal_dim=3) diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/test_env3.py index 1dba7cc7..ea8cc7ac 100644 --- a/examples/simplest_test_PLI/test_env3.py +++ b/examples/simplest_test_PLI/test_env3.py @@ -279,7 +279,7 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') cb1.set_label('Estimated expected return') - #plt.show() + plt.show() for ii in xrange(-15,345,30): ax.view_init(elev=20., azim=ii) plt.savefig('fig_w_V_div5'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') From 1a56de031c4372b700bd8d000241ae45446cc085 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 24 Apr 2018 10:17:59 -0400 Subject: [PATCH 45/96] modifs --- deer/q_networks/NN_keras_lp_high_int_dim.py | 5 +- deer/q_networks/q_net_keras_lp_nstep.py | 50 +++++++++----- examples/simplest_test_PLI/run_test3.py | 75 ++++++++++++++++----- examples/simplest_test_PLI/test_env3.py | 11 +-- 4 files changed, 103 insertions(+), 38 deletions(-) diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index 7580f1c6..5ad8b1fb 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -69,9 +69,10 @@ def encoder_model(self): #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) - x = AveragePooling2D(pool_size=(2, 2), strides=None, padding='same')(x) + x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) + x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) - x = AveragePooling2D(pool_size=(3, 3), strides=None, padding='same')(x) + x = MaxPooling2D(pool_size=(3, 3), strides=None, padding='same')(x) #x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) diff --git a/deer/q_networks/q_net_keras_lp_nstep.py b/deer/q_networks/q_net_keras_lp_nstep.py index 76a755d1..fa2a20a0 100644 --- a/deer/q_networks/q_net_keras_lp_nstep.py +++ b/deer/q_networks/q_net_keras_lp_nstep.py @@ -15,7 +15,7 @@ config.gpu_options.allow_growth=True sess = tf.Session(config=config) -def mean_squared_error(y_true, y_pred): +def mean_squared_error_p(y_true, y_pred): return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) 
# = mse error #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) ) , axis=-1 ) # = mse error @@ -531,9 +531,11 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) #print "next_x_predicted" #print next_x_predicted - next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - next_x_predicted=T.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) + one_hot_first_action=np.zeros((1,self._n_actions)) + one_hot_first_action[0]=1 + next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) + next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) + next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) #print "next_x_predicted action 0 t4" #print next_x_predicted ## END DEBUG PURPOSES @@ -541,7 +543,7 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): QD_plan=0 for i in range(d+1): #TO DO: improve planning algorithm #print encoded_x - Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=2).reshape(len(encoded_x),-1) + Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=[self._n_actions,2,2,2,2,2,2,2]).reshape(len(encoded_x),-1) print "Qd,i" print Qd,i QD_plan+=Qd @@ -592,17 +594,20 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): """ """ - if(branching_factor==None or branching_factor>self._n_actions): - branching_factor=self._n_actions + #if(branching_factor==None or branching_factor>self._n_actions): + # branching_factor=self._n_actions + #print "qValues_planning_abstr d" #print d n=len(state_abstr_val) identity_matrix = np.diag(np.ones(self._n_actions)) + this_branching_factor=branching_factor.pop(0) if (n==1): + # We require that the first branching factor is self._n_actions so that QD_plan has the right dimension this_branching_factor=self._n_actions - else: - this_branching_factor=branching_factor + #else: + # this_branching_factor=branching_factor if (d==0): if(this_branching_factor Date: Fri, 4 May 2018 10:58:34 -0400 Subject: [PATCH 46/96] modifs --- deer/agent.py | 15 +- deer/base_classes/Environment.py | 1 - deer/base_classes/QNetwork.py | 1 - deer/q_networks/NN_keras_lp_high_int_dim.py | 108 +++--- deer/q_networks/q_net_keras_lp.py | 359 +++++++++++++++----- deer/q_networks/q_net_keras_lp_nstep.py | 42 ++- examples/ALE/ALE_env.py | 4 +- examples/ALE/run_ALE.py | 64 +++- examples/simplest_test_PLI/run_test3.py | 48 ++- examples/simplest_test_PLI/run_test4.py | 8 +- examples/simplest_test_PLI/test_env3.py | 15 +- examples/simplest_test_PLI/test_env4.py | 37 +- 12 files changed, 473 insertions(+), 229 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index cec332af..c5b4ee5b 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -5,7 +5,6 @@ .. 
Authors: Vincent Francois-Lavet, David Taralla """ -from theano import config import os import numpy as np import copy @@ -77,7 +76,7 @@ def __init__(self, environment, q_network, replay_memory_size=1000000, replay_st self._selected_action = -1 self._state = [] for i in range(len(inputDims)): - self._state.append(np.zeros(inputDims[i], dtype=config.floatX)) + self._state.append(np.zeros(inputDims[i])) if (train_policy==None): self._train_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.1) else: @@ -142,12 +141,12 @@ def totalRewardOverLastTest(self): """ return self._total_mode_reward/self._totalModeNbrEpisode, self._totalModeNbrEpisode - def bestAction(self): - """ Returns the best Action - """ - action = self._network.chooseBestAction(self._state) - V = max(self._network.qValues(self._state)) - return action, V +# def bestAction(self): +# """ Returns the best Action +# """ +# action = self._network.chooseBestAction(self._state) +# V = max(self._network.qValues(self._state)) +# return action, V def attach(self, controller): if (isinstance(controller, controllers.Controller)): diff --git a/deer/base_classes/Environment.py b/deer/base_classes/Environment.py index 5a9bc9e6..e7f7af89 100644 --- a/deer/base_classes/Environment.py +++ b/deer/base_classes/Environment.py @@ -2,7 +2,6 @@ .. Authors: Vincent Francois-Lavet, David Taralla """ -from theano import config import numpy as np class Environment(object): diff --git a/deer/base_classes/QNetwork.py b/deer/base_classes/QNetwork.py index 19a147c6..bb24145b 100644 --- a/deer/base_classes/QNetwork.py +++ b/deer/base_classes/QNetwork.py @@ -2,7 +2,6 @@ .. Authors: Vincent Francois-Lavet, David Taralla """ -from theano import config import numpy as np class QNetwork(object): diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/q_networks/NN_keras_lp_high_int_dim.py index 5ad8b1fb..176a1d33 100644 --- a/deer/q_networks/NN_keras_lp_high_int_dim.py +++ b/deer/q_networks/NN_keras_lp_high_int_dim.py @@ -34,7 +34,7 @@ def __init__(self, batch_size, input_dimensions, n_actions, random_state, action self._action_as_input=action_as_input self._high_int_dim=kwargs["high_int_dim"] if(self._high_int_dim==True): - self.n_channels_internal_dim=kwargs["internal_dim"] #dim[0] + self.n_channels_internal_dim=kwargs["internal_dim"] #dim[-3] else: self.internal_dim=kwargs["internal_dim"] #2 for laby #3 for catcher @@ -59,16 +59,16 @@ def encoder_model(self): # - observation[i] is a FRAME print "dim enc" print dim - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - x=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' - if(dim[1]>8 and dim[2]>8): + x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' + if(dim[-2]>8 and dim[-1]>8): self._pooling_encoder=6 #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) - x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) + #x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) + x = Conv2D(8, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) @@ -90,10 +90,10 
@@ def encoder_model(self): # - observation[i] is a VECTOR elif len(dim) == 2: - if dim[0] > 3: - input = Input(shape=(dim[0],dim[1])) + if dim[-3] > 3: + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) - reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) #data_format='channels_last' + reshaped=Reshape((dim[-3],dim[-2],1), input_shape=(dim[-3],dim[-2]))(input) #data_format='channels_last' x = Conv2D(16, (2, 1), activation='relu', border_mode='valid')(reshaped) #Conv on the history x = Conv2D(16, (2, 2), activation='relu', border_mode='valid')(x) #Conv on the history & features @@ -102,17 +102,17 @@ def encoder_model(self): else: out = Flatten()(x) else: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) out = Flatten()(input) # - observation[i] is a SCALAR - else: - if dim[0] > 3: + if dim[-3] > 3: # this returns a tensor - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) - reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) #data_format='channels_last' + reshaped=Reshape((1,dim[-3],1), input_shape=(dim[-3],))(input) #data_format='channels_last' x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(reshaped) #Conv on the history x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(x) #Conv on the history @@ -122,7 +122,7 @@ def encoder_model(self): out = Flatten()(x) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) out=input @@ -170,16 +170,16 @@ def encoder_diff_model(self,encoder_model): for j in range(2): for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) half = len(inputs)/2 @@ -209,14 +209,14 @@ def transition_model(self): """ if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME - inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' + inputs = [ Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' print inputs[0]._keras_shape print inputs[1]._keras_shape layers_action=inputs[1] - layers_action=RepeatVector(-(-dim[1] // self._pooling_encoder)*-(-dim[2] // self._pooling_encoder))(layers_action)#K.repeat_elements(layers_action,rep=dim[1]*dim[2],axis=1) - layers_action=Reshape((self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) - layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' + layers_action=RepeatVector(-(-dim[-2] // self._pooling_encoder)*-(-dim[-1] // self._pooling_encoder))(layers_action)#K.repeat_elements(layers_action,rep=dim[-2]*dim[-1],axis=1) + layers_action=Reshape((self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) + layers_action=Permute((2,3,1), 
input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) #data_format='channels_last' x = Concatenate(axis=-1)([layers_action,inputs[0]]) @@ -290,16 +290,16 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): inputs=[] for j in range(2): for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) half = len(inputs)/2 @@ -342,16 +342,16 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): """ inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) enc_x = encoder_model(inputs[:]) #s --> x @@ -427,15 +427,15 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): # for j in range(2): # for i, dim in enumerate(self._input_dimensions): # if len(dim) == 3: -# input = Input(shape=(dim[0],dim[1],dim[2])) +# input = Input(shape=(dim[-3],dim[-2],dim[-1])) # inputs.append(input) # # elif len(dim) == 2: -# input = Input(shape=(dim[0],dim[1])) +# input = Input(shape=(dim[-3],dim[-2])) # inputs.append(input) # # else: -# input = Input(shape=(dim[0],)) +# input = Input(shape=(dim[-3],)) # inputs.append(input) # # half = len(inputs)/2 @@ -470,16 +470,16 @@ def diff_sa_sa(self,encoder_model,transition_model): inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) input = Input(shape=(self._n_actions,)) @@ -554,13 +554,13 @@ def R_model(self): if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME - inputs = [ Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' + inputs = [ Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] #data_format='channels_last' layers_action=inputs[1] - layers_action=RepeatVector(-(-dim[1] // self._pooling_encoder)*-(-dim[2] // self._pooling_encoder))(layers_action) + layers_action=RepeatVector(-(-dim[-2] // self._pooling_encoder)*-(-dim[-1] // self._pooling_encoder))(layers_action) print layers_action._keras_shape - layers_action=Reshape((self._n_actions,-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) - layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[1] // 
self._pooling_encoder),-(-dim[2] // self._pooling_encoder)))(layers_action) #data_format='channels_last' + layers_action=Reshape((self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) + layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) #data_format='channels_last' print layers_action._keras_shape @@ -606,16 +606,16 @@ def full_R_model(self,encoder_model,R_model,plan_depth=0,transition_model=None): inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) enc_x = encoder_model(inputs[:]) #s --> x @@ -642,10 +642,10 @@ def Q_model(self): # - observation[i] is a FRAME print "dim Q mod" print dim - if len(dim) == 3: - input = Input(shape=(-(-dim[1] // self._pooling_encoder),-(-dim[2] // self._pooling_encoder),self.n_channels_internal_dim)) #data_format is already 'channels_last' + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)) #data_format is already 'channels_last' inputs.append(input) - #reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) + #reshaped=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) x = input #data_format is already 'channels_last' print x._keras_shape @@ -723,16 +723,16 @@ def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=No inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) + if len(dim) == 3 or len(dim) == 4: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) + input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) else: - input = Input(shape=(dim[0],)) + input = Input(shape=(dim[-3],)) inputs.append(input) out = encoder_model(inputs) @@ -756,7 +756,7 @@ def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=No out=transition_model([out]+inputs[-1:]) #if(self._high_int_dim==True): - # input = Input(shape=(dim[1],dim[2],dim[0])) + # input = Input(shape=(dim[-2],dim[-1],dim[-3])) # inputs.append(input) #else: # input = Input(shape=(self.internal_dim,)) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 72675445..c08e8a7f 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -10,19 +10,30 @@ from keras import backend as K from ..base_classes import QNetwork from .NN_keras_lp_high_int_dim import NN # Default Neural network used - -def mean_squared_error(y_true, y_pred): - return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error +import tensorflow as tf +config = tf.ConfigProto() +config.gpu_options.allow_growth=True +sess = tf.Session(config=config) +import copy + +def mean_squared_error_p(y_true, y_pred): + return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) 
# = mse error + #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) ) , axis=-1 ) # = mse error def exp_dec_error(y_true, y_pred): return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred -def rms_from_squared_components(y_true, y_pred): - return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 +def cosine_proximity2(y_true, y_pred): + y_true = K.l2_normalize(y_true[:,0:2], axis=-1) + y_pred = K.l2_normalize(y_pred[:,0:2], axis=-1) + return -K.sum(y_true * y_pred, axis=-1) -def squared_error_from_squared_components(y_true, y_pred): - return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 +#def rms_from_squared_components(y_true, y_pred): +# return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1# +# +#def squared_error_from_squared_components(y_true, y_pred): +# return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 def loss_diff_s_s_(y_true, y_pred): return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. ) ) ) # tend to increase y_pred --> loss -1 @@ -56,13 +67,12 @@ class MyQNetwork(QNetwork): default is deer.qnetworks.NN_keras """ - def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): """ Initialize environment """ QNetwork.__init__(self,environment, batch_size) - self._rho = rho self._rms_epsilon = rms_epsilon self._momentum = momentum @@ -71,6 +81,9 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._double_Q = double_Q self._random_state = random_state self.update_counter = 0 + self._high_int_dim = kwargs.get('high_int_dim',False) + self._internal_dim = kwargs.get('internal_dim',2) + self.loss_interpret=0 self.loss_T=0 self.loss_T2=0 self.loss_disentangle_t=0 @@ -79,37 +92,43 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.loss_Q=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 + self.loss_gamma=0 - self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=False) + self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) self.encoder = self.learn_and_plan.encoder_model() self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) - self.Q = self.learn_and_plan.Q_model() self.R = self.learn_and_plan.R_model() + self.Q = self.learn_and_plan.Q_model() + self.gamma = self.learn_and_plan.R_model() self.transition = self.learn_and_plan.transition_model() - self.transition2 = self.learn_and_plan.transition_model2() +# self.transition2 = self.learn_and_plan.transition_model2() - self.full_Qs=[] - for i in range(1): - self.full_Qs.append(self.learn_and_plan.full_Q_model(self.encoder,self.Q,i,self._df)) 
+ self.full_Q=self.learn_and_plan.full_Q_model(self.encoder,self.Q,0,self._df) # used to fit rewards self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) + # used to fit gamma + self.full_gamma = self.learn_and_plan.full_R_model(self.encoder,self.gamma) + # used to fit transitions self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition)#full_transition_model(self.encoder,self.transition) - + # used to force features variations + if(self._high_int_dim==False): + self.force_features=self.learn_and_plan.force_features(self.encoder,self.transition) + # constraint on consecutive t - self.diff_s_s_ = self.learn_and_plan.diff_s_s_(self.encoder) + self.diff_s_s_ = self.learn_and_plan.encoder_diff_model(self.encoder) # self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) # used to disentangle actions self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) - layers=self.full_Qs[0].layers + layers=self.encoder.layers+self.Q.layers+self.R.layers+self.gamma.layers+self.transition.layers # Grab all the parameters together. self.params = [ param for layer in layers @@ -117,12 +136,19 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._compile() - self.next_full_Q = self.learn_and_plan.full_Q_model(self.encoder,self.Q) # FIXME - self.next_full_Q.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q + self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) + self.encoder_target = self.learn_and_plan_target.encoder_model() + self.Q_target = self.learn_and_plan_target.Q_model() + self.R_target = self.learn_and_plan_target.R_model() + self.gamma_target = self.learn_and_plan_target.R_model() + self.transition_target = self.learn_and_plan_target.transition_model() + + self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder,self.Q) # FIXME + self.full_Q_target.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q - layers=self.next_full_Q.layers + layers=self.encoder_target.layers+self.Q_target.layers+self.R_target.layers+self.gamma_target.layers+self.transition_target.layers # Grab all the parameters together. 
- self.next_params = [ param + self.params_target = [ param for layer in layers for param in layer.trainable_weights ] @@ -165,6 +191,13 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 states_val=list(states_val) next_states_val=list(next_states_val) + for i,o in enumerate(states_val): + if(o.ndim==5): #FIXME + states_val[i]=states_val[i][:,0,:,:,:]/128.-1 + for i,o in enumerate(next_states_val): + if(o.ndim==5): #FIXME + next_states_val[i]=next_states_val[i][:,0,:,:,:]/128.-1 + Es_=self.encoder.predict(next_states_val) Es=self.encoder.predict(states_val) ETs=self.transition.predict([Es,onehot_actions]) @@ -185,22 +218,50 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print R[0] # Fit transition -# for i in range(10): -# l=self.transition2.train_on_batch([Es,onehot_actions], Es_) -# print l -# self.loss_T2+=self.transition2.train_on_batch([Es,onehot_actions], Es_) - + #print "states_val+next_states_val+[onehot_actions]+[(1-terminals_val)]" + #print states_val+next_states_val+[onehot_actions]+[(1-terminals_val)] l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) + #print "l" + #print l self.loss_T+=l + # Interpretable AI + if(self._high_int_dim==False): + target_modif_features=np.zeros((self._n_actions,self._internal_dim)) + ## Catcher + #target_modif_features[0,0]=1 # dir + #target_modif_features[1,0]=-1 # opposite dir + #target_modif_features[0:2,1]=1 # temps + ## Laby + target_modif_features[0,0]=1 + target_modif_features[1,0]=0 + #target_modif_features[2,1]=0 + #target_modif_features[3,1]=0 + target_modif_features=np.repeat(target_modif_features,self._batch_size,axis=0) + states_val_tiled=[] + for obs in states_val: + states_val_tiled.append(np.tile(obs,(self._n_actions,1,1,1))) + onehot_actions_tiled = np.diag(np.ones(self._n_actions))#np.zeros((self._batch_size*self._n_actions, self._n_actions)) + onehot_actions_tiled = np.repeat(onehot_actions_tiled,self._batch_size,axis=0) + + self.loss_interpret+=self.force_features.train_on_batch(states_val_tiled+[onehot_actions_tiled], target_modif_features) # Fit rewards self.lossR+=self.full_R.train_on_batch(states_val+[onehot_actions], rewards_val) - + + # Fit gammas + self.loss_gamma+=self.full_gamma.train_on_batch(states_val+[onehot_actions], (1-terminals_val[:])*self._df) + # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 # reduce the squared value of the abstract features - self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + #print "states_val[0][0:2]" + #print states_val[0][0:2] + #print "self.encoder.predict(states_val)" + #print self.encoder.predict(states_val) + l=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + #print l + self.loss_disambiguate1+=l # Increase the entropy in the abstract features of two states # This is done only when states_val is made up of only one observation --> FIXME @@ -218,10 +279,10 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) 
#np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch([states_val[0],next_states_val[0]], np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1)))#np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) # Disentangle actions - self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch([states_val[0],onehot_actions,onehot_actions_rand], np.ones(self._batch_size)) + self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch(states_val+[onehot_actions,onehot_actions_rand], np.reshape(np.zeros_like(Es),(self._batch_size,-1)))#np.ones(self._batch_size)) # # # Loss to have all s' following s,a with a to a distance 1 of s,a) @@ -232,17 +293,21 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals - if(self.update_counter%100==0): + if(self.update_counter%500==0): print "self.loss_Q" print self.loss_Q - print "self.loss_T/100.,self.loss_T2/100.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." - print self.loss_T/100.,self.loss_T2/100.,self.lossR/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. - print K.get_value(self.encoder.optimizer.lr) - print K.get_value(self.encoder_diff.optimizer.lr) - self.loss_T=0 - self.loss_T2=0 + print "self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." + print self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. + + if(self._high_int_dim==False): + print "self.loss_interpret/100." + print self.loss_interpret/100. 
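The "Interpretable AI" block above trains force_features on every (state, action) pair of the batch: states are tiled once per action and the one-hot action matrix is built with np.diag and np.repeat so that the two arrays line up row by row. A shape-only illustration with made-up sizes (in the code above the observations are image stacks, hence the (n_actions,1,1,1) tile pattern):

    import numpy as np

    batch_size, n_actions, obs_dim = 3, 4, 5
    states = np.arange(batch_size * obs_dim, dtype=float).reshape(batch_size, obs_dim)

    states_tiled = np.tile(states, (n_actions, 1))                        # (12, 5)
    onehot = np.repeat(np.diag(np.ones(n_actions)), batch_size, axis=0)   # (12, 4)

    # Row i of onehot is action i // batch_size, which matches np.tile's block
    # order: sample k taken with action j sits at row j*batch_size + k in both.
    print(states_tiled.shape, onehot.shape)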
+ self.lossR=0 + self.loss_gamma=0 self.loss_Q=0 + self.loss_T=0 + self.loss_interpret=0 self.loss_disentangle_t=0 self.loss_disentangle_a=0 @@ -266,11 +331,11 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self._resetQHat() #next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros_like(Es)]) #np.zeros((32,self.learn_and_plan.internal_dim))]) - next_q_vals = self.next_full_Q.predict([next_states_val[0]]) + next_q_vals = self.full_Q_target.predict(next_states_val) if(self._double_Q==True): - #next_q_vals_current_qnet=self.full_Qs[0].predict(next_states_val+[np.zeros_like(Es)]) - next_q_vals_current_qnet=self.full_Qs[0].predict(next_states_val) + #next_q_vals_current_qnet=self.full_Q.predict(next_states_val+[np.zeros_like(Es)]) + next_q_vals_current_qnet=self.full_Q.predict(next_states_val) argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) else: @@ -281,7 +346,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) #q_vals=self.full_Q.predict([states_val[0],np.zeros_like(Es)]) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) - q_vals=self.full_Qs[0].predict([states_val[0]]) + q_vals=self.full_Q.predict([states_val[0]]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) @@ -299,7 +364,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals loss=0 #loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) - loss=self.full_Qs[0].train_on_batch([states_val[0]] , q_vals ) + loss=self.full_Q.train_on_batch(states_val , q_vals ) #print "self.q_vals.optimizer.lr" #print K.eval(self.q_vals.optimizer.lr) self.loss_Q+=loss @@ -370,10 +435,15 @@ def qValues(self, state_val): ------- The q values for the provided belief state """ + copy_state=copy.deepcopy(state_val) #Required because of the "hack" below + for i,o in enumerate(state): + if(o.ndim==4): #FIXME + copy_state[i]=copy_state[i][0,:,:,:]/128.-1 + #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] - return self.full_Qs[0].predict([np.expand_dims(state,axis=0) for state in state_val])[0] + return self.full_Q.predict([np.expand_dims(state,axis=0) for state in copy_state])[0] - def qValues_planning(self, state_val, d=5): + def qValues_planning(self, state_val, R, gamma, T, Q, d=5): """ Get the q values for one belief state with a planning depth d Arguments @@ -384,11 +454,20 @@ def qValues_planning(self, state_val, d=5): Returns ------- The q values with planning depth d for the provided belief state - """ - encoded_x = self.encoder.predict([np.expand_dims(state,axis=0) for state in state_val]) - print encoded_x[0] - + """ + #print "state_val[0]" + #print state_val[0] + #print len(state_val) +# print "state_val[0][0]" +# print state_val[0][0] +# print state_val[0].shape + print "self.full_Q.predict(state_val)[0]" + print self.full_Q.predict(state_val)[0] + encoded_x = self.encoder.predict(state_val) ## DEBUG PURPOSES +# print "encoded_x[0]" +# print encoded_x[0] + identity_matrix = np.diag(np.ones(self._n_actions)) if(encoded_x.ndim==2): tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) @@ 
-396,37 +475,35 @@ def qValues_planning(self, state_val, d=5): tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) else: print ("error") - + repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) - #print tile3_encoded_x - #print repeat_identity - r_vals_d0=np.array(self.R.predict([tile3_encoded_x,repeat_identity])) + ##print tile3_encoded_x + ##print repeat_identity + r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) #print "r_vals_d0" #print r_vals_d0 r_vals_d0=r_vals_d0.flatten() print "r_vals_d0" print r_vals_d0 - next_x_predicted=self.transition.predict([tile3_encoded_x,repeat_identity]) - print "next_x_predicted" - print next_x_predicted - next_x_predicted=self.transition.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - print "next_x_predicted action 0 t2" - print next_x_predicted - next_x_predicted=self.transition.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - print "next_x_predicted action 0 t3" - print next_x_predicted - next_x_predicted=self.transition.predict([next_x_predicted[0:1],np.array([[1,0,0,0]])]) - print "next_x_predicted action 0 t4" - print next_x_predicted + next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) + #print "next_x_predicted" + #print next_x_predicted + one_hot_first_action=np.zeros((1,self._n_actions)) + one_hot_first_action[0]=1 + next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) + next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) + next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) + #print "next_x_predicted action 0 t4" + #print next_x_predicted ## END DEBUG PURPOSES QD_plan=0 for i in range(d+1): #TO DO: improve planning algorithm - Qd=self.qValues_planning_abstr(encoded_x, d=i, branching_factor=2) - print Qd - QD_plan+=Qd + #print encoded_x + Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=[self._n_actions,2,2,2,2,2,2,2]).reshape(len(encoded_x),-1) print "Qd,i" print Qd,i + QD_plan+=Qd QD_plan=QD_plan/(d+1) print "QD_plan" @@ -434,20 +511,60 @@ def qValues_planning(self, state_val, d=5): return QD_plan - def qValues_planning_abstr(self, state_abstr_val, d, branching_factor=None): +# def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): +# """ +# """ +# branching_factor=self._n_actions #TO IMPROVE, use MCTS, etc... 
+# n=len(state_abstr_val) +# identity_matrix = np.diag(np.ones(self._n_actions)) +# +# this_branching_factor=branching_factor +# +# if (d==0): +# return Q.predict([state_abstr_val]) # no change in the order of the actions +# else: +# # All actions are considered in the tree +# repeat_identity=np.repeat(identity_matrix,len(state_abstr_val),axis=0) # no change in the order of the actions +# if(state_abstr_val.ndim==2): +# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1)) +# elif(state_abstr_val.ndim==4): +# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1,1,1)) +# else: +# print ("error") +# +# #print tile3_encoded_x +# #print repeat_identity +# r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) +# #print "r_vals_d0" +# #print r_vals_d0 +# r_vals_d0=r_vals_d0.flatten() +# +# gamma_vals_d0=np.array(gamma.predict([tile3_encoded_x,repeat_identity])) +# #print "r_vals_d0" +# #print r_vals_d0 +# gamma_vals_vals_d0=gamma_vals_d0.flatten() +# +# next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) +# return r_vals_d0+gamma_vals_vals_d0*np.amax(self.qValues_planning_abstr(next_x_predicted,R,gamma,T,Q,d=d-1,branching_factor=branching_factor).reshape(len(state_abstr_val)*this_branching_factor,branching_factor),axis=1).flatten() + + + def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): """ """ - if(branching_factor==None or branching_factor>self._n_actions): - branching_factor=self._n_actions + #if(branching_factor==None or branching_factor>self._n_actions): + # branching_factor=self._n_actions + #print "qValues_planning_abstr d" #print d n=len(state_abstr_val) identity_matrix = np.diag(np.ones(self._n_actions)) + this_branching_factor=branching_factor.pop(0) if (n==1): + # We require that the first branching factor is self._n_actions so that QD_plan has the right dimension this_branching_factor=self._n_actions - else: - this_branching_factor=branching_factor + #else: + # this_branching_factor=branching_factor if (d==0): if(this_branching_factor0): # We use the mode to define the planning depth - q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=mode*3)#self.qValues(state)# + q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=mode*2)#self.qValues(state)# else: q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=0) return np.argmax(q_vals),np.max(q_vals) @@ -761,7 +767,7 @@ def setLearningRate(self, lr): K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/1.) # /5. for simple laby or simple catcher; /1 for distrib of laby + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby K.set_value(self.diff_sa_sa.optimizer.lr, 0) # 0 ! # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
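For readers following the planning code above: qValues_planning_abstr expands the learned models d steps ahead and backs values up as r(x,a) + gamma(x,a) * max_a' Q_{d-1}(T(x,a), a'). A self-contained sketch of that recursion, with toy callables standing in for the trained R, gamma, T and Q models (the branching-factor truncation and the averaging over depths 0..d done in qValues_planning are left out):

    import numpy as np

    def plan_q_values(x, R, G, T, Q, n_actions, d):
        """One estimated Q-value per action, expanding the model d steps."""
        q = np.zeros(n_actions)
        for a in range(n_actions):
            if d == 0:
                q[a] = Q(x, a)                      # bootstrap directly on Q
            else:
                x_next = T(x, a)                    # predicted next abstract state
                backup = max(plan_q_values(x_next, R, G, T, Q, n_actions, d - 1))
                q[a] = R(x, a) + G(x, a) * backup   # r + gamma * max_a' Q_{d-1}
        return q

    # Dummy models on a 2-D abstract state, 4 actions (all hypothetical):
    R = lambda x, a: float(x[0])
    G = lambda x, a: 0.9
    T = lambda x, a: x + 0.1 * (a - 1.5)
    Q = lambda x, a: float(x.sum())
    print(plan_q_values(np.zeros(2), R, G, T, Q, n_actions=4, d=2))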
@@ -770,13 +776,19 @@ def transfer(self, original, transfer, epochs=1): self._resetQHat() # modify the loss of the encoder #self.encoder=self.learn_and_plan.encoder_model() + #for l in self.encoder.layers[-5:]: + # l.trainable = False # Freeze dense layers # DOES NOT SEEM TO HELP (transfer on catcher) + #print "self.encoder.layers[-1].get_weights()" + #print self.encoder.layers[-1].get_weights() + optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) self.encoder.compile(optimizer=optimizer4, loss='mse') # Then, train the encoder such that the original and transfer states are mapped into the same abstract representation - x_original=self.encoder_target.predict(original)#[0] - print x_original + x_original=self.encoder.predict(original)#[0] + print "x_original[0:10]" + print x_original[0:10] for i in range(epochs): size = original[0].shape[0] #print size @@ -788,6 +800,10 @@ def transfer(self, original, transfer, epochs=1): print self.encoder.test_on_batch(transfer[0][int(size*0.8):] , x_original[int(size*0.8):]) #print self.encoder.test_on_batch(original[0][int(size*0.8):] , x_original[int(size*0.8):] ) + #print "self.encoder.layers[-1].get_weights()" + #print self.encoder.layers[-1].get_weights() + #for l in self.encoder.layers[-5:]: + # l.trainable = True # recompile with original loss self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_p) diff --git a/examples/ALE/ALE_env.py b/examples/ALE/ALE_env.py index 2c063473..78a23b3c 100644 --- a/examples/ALE/ALE_env.py +++ b/examples/ALE/ALE_env.py @@ -68,8 +68,8 @@ def act(self, action): action = self._actions[action] reward = self._ale.act(action) - if self.inTerminalState(): - break + #if self.inTerminalState(): + # break self._ale.getScreenGrayscale(self._screen) cv2.resize(self._screen, (84, 84), self._reduced_screen, interpolation=cv2.INTER_NEAREST) diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index 05a4fd02..47667d3c 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -11,8 +11,8 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork -from ALE_env import MyEnv as ALE_env +from deer.q_networks.q_net_keras_lp import MyQNetwork +from ALE_env_gym import MyEnv as ALE_env import deer.experiment.base_controllers as bc from deer.policies import EpsilonGreedyPolicy @@ -21,9 +21,9 @@ class Defaults: # ---------------------- # Experiment Parameters # ---------------------- - STEPS_PER_EPOCH = 250000 - EPOCHS = 40 - STEPS_PER_TEST = 125000 + STEPS_PER_EPOCH = 10000#250000 + EPOCHS = 500#40 + STEPS_PER_TEST = 2000#125000 PERIOD_BTW_SUMMARY_PERFS = 1 # ---------------------- @@ -34,8 +34,8 @@ class Defaults: # ---------------------- # DQN Agent parameters: # ---------------------- - UPDATE_RULE = 'deepmind_rmsprop' - LEARNING_RATE = 0.01 + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.001 LEARNING_RATE_DECAY = 0.99 DISCOUNT = 0.95 DISCOUNT_INC = 0.99 @@ -53,7 +53,7 @@ class Defaults: FREEZE_INTERVAL = 10000 DETERMINISTIC = True - +HIGH_INT_DIM = True if __name__ == "__main__": @@ -67,11 +67,13 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = ALE_env(rng, frame_skip=parameters.frame_skip, - ale_options=[{"key": "random_seed", "value": rng.randint(9999)}, - {"key": "color_averaging", "value": True}, - {"key": "repeat_action_probability", "value": 0.}]) - + #env = ALE_env(rng, frame_skip=parameters.frame_skip, + # ale_options=[{"key": "random_seed", "value": rng.randint(9999)}, + # 
{"key": "color_averaging", "value": True}, + # {"key": "repeat_action_probability", "value": 0.}]) + + env = ALE_env(rng, frame_skip=parameters.frame_skip) + # --- Instantiate qnetwork --- qnetwork = MyQNetwork( env, @@ -82,9 +84,13 @@ class Defaults: parameters.freeze_interval, parameters.batch_size, parameters.update_rule, - rng) + rng, + double_Q=True, + high_int_dim=HIGH_INT_DIM, + internal_dim=3) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) + train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) # --- Instantiate agent --- agent = NeuralAgent( @@ -94,6 +100,7 @@ class Defaults: max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, rng, + train_policy=train_policy, test_policy=test_policy) # --- Create unique filename for FindBestController --- @@ -169,11 +176,36 @@ class Defaults: agent.attach(bc.InterleavedTestEpochController( id=ALE_env.VALIDATION_MODE, epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], + controllers_to_disable=[0, 1, 2, 3, 4, 6,7,8], + periodicity=2, + show_score=True, + summarize_every=1)) + + agent.attach(bc.InterleavedTestEpochController( + id=ALE_env.VALIDATION_MODE+1, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], + periodicity=2, + show_score=True, + summarize_every=1)) + + agent.attach(bc.InterleavedTestEpochController( + id=ALE_env.VALIDATION_MODE+2, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], periodicity=2, show_score=True, summarize_every=1)) + agent.attach(bc.InterleavedTestEpochController( + id=ALE_env.VALIDATION_MODE+3, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], + periodicity=2, + show_score=True, + summarize_every=1)) + + # --- Run the experiment --- try: os.mkdir("params") diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_test3.py index 0ab34ce5..ed92a4ea 100644 --- a/examples/simplest_test_PLI/run_test3.py +++ b/examples/simplest_test_PLI/run_test3.py @@ -22,7 +22,7 @@ class Defaults: # Experiment Parameters # ---------------------- STEPS_PER_EPOCH = 1000 - EPOCHS = 50 + EPOCHS = 30 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 @@ -54,7 +54,8 @@ class Defaults: DETERMINISTIC = False - +HIGHER_DIM_OBS = False#True +HIGH_INT_DIM = False if __name__ == "__main__": logging.basicConfig(level=logging.INFO) @@ -67,7 +68,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env(rng, higher_dim_obs=False, reverse=False) + env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=False) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -81,10 +82,10 @@ class Defaults: parameters.update_rule, rng, double_Q=True, - high_int_dim=False, + high_int_dim=HIGH_INT_DIM, internal_dim=3) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.2)#1.) + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1)#1.) 
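The last argument of EpsilonGreedyPolicy above is the exploration rate epsilon (0.1 here; run_ALE.py uses 1.0 for the training policy and 0.05 at test time). A minimal sketch of the rule, with a dummy Q-value function:

    import numpy as np

    def epsilon_greedy_action(q_values, state, n_actions, rng, epsilon):
        # With probability epsilon explore uniformly, otherwise act greedily.
        if rng.rand() < epsilon:
            return rng.randint(n_actions)
        return int(np.argmax(q_values(state)))

    rng = np.random.RandomState(0)
    q_values = lambda s: np.array([0.1, 0.7, 0.2])   # dummy Q estimates
    print(epsilon_greedy_action(q_values, None, 3, rng, epsilon=0.1))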
# --- Instantiate agent --- agent = NeuralAgent( @@ -174,20 +175,43 @@ class Defaults: agent.run(parameters.epochs, parameters.steps_per_epoch) - samples_transfer=200 + ### + # TRANSFER + ### + optimized_params=qnetwork.getAllParams() + print "optimized_params" + print optimized_params + + # --- Instantiate qnetwork --- + qnetwork = MyQNetwork( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_delta, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng, + double_Q=True, + high_int_dim=HIGH_INT_DIM, + internal_dim=3) + qnetwork.setAllParams(optimized_params) + + samples_transfer=500 rand_ind=np.random.random_integers(0,20000,samples_transfer) original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - transfer=[np.array([[1-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] + transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - print "original, transfer" - print original, transfer + print "original[0][0:10], transfer[0][0:10]" + print original[0][0:10], transfer[0][0:10] # Transfer between the two repr qnetwork.transfer(original, transfer, 5000) - + # --- Instantiate environment with reverse=True --- - env = test_env(rng, higher_dim_obs=False, reverse=True) + env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) # --- Re instantiate agent --- agent = NeuralAgent( @@ -258,7 +282,7 @@ class Defaults: summarize_every=1)) - agent.gathering_data=False + #agent.gathering_data=False agent.run(parameters.epochs, parameters.steps_per_epoch) diff --git a/examples/simplest_test_PLI/run_test4.py b/examples/simplest_test_PLI/run_test4.py index 0c711b01..65a29c8e 100644 --- a/examples/simplest_test_PLI/run_test4.py +++ b/examples/simplest_test_PLI/run_test4.py @@ -1,6 +1,6 @@ -"""ALE launcher. See Wiki for more details about this experiment. 
+"""Simple maze launcher -Authors: Vincent Francois-Lavet, David Taralla +Authors: Vincent Francois-Lavet """ import sys @@ -11,7 +11,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp_nstep import MyQNetwork +from deer.q_networks.q_net_keras_lp import MyQNetwork from test_env4 import MyEnv as test_env import deer.experiment.base_controllers as bc @@ -36,7 +36,7 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 + LEARNING_RATE = 0.0005 LEARNING_RATE_DECAY = 0.98 DISCOUNT = 0.9 DISCOUNT_INC = 1 diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/test_env3.py index b2461568..d9636817 100644 --- a/examples/simplest_test_PLI/test_env3.py +++ b/examples/simplest_test_PLI/test_env3.py @@ -12,6 +12,7 @@ from mpl_toolkits.axes_grid1 import host_subplot import mpl_toolkits.axisartist as AA import matplotlib.pyplot as plt +plt.switch_backend('agg') # For remote servers import copy class MyEnv(Environment): @@ -254,7 +255,7 @@ def summarizePerformance(self, test_data_set, learning_algo): ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + #plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') # Plot the Q_vals @@ -280,9 +281,9 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1.set_label('Estimated expected return') #plt.show() - for ii in xrange(-15,345,30): - ax.view_init(elev=20., azim=ii) - plt.savefig('fig_w_V_div5'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') + #for ii in xrange(-15,345,30): + # ax.view_init(elev=20., azim=ii) + # plt.savefig('fig_w_V_div5'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') # fig_visuV @@ -314,7 +315,7 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1.set_label('Estimated expected return') #plt.show() - plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') + #plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') # fig_visuR @@ -353,7 +354,7 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1.set_label('Estimated expected return') #plt.show() - plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') + #plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') matplotlib.pyplot.close("all") # avoids memory leaks @@ -397,7 +398,7 @@ def get_observation(self,y,x_block,x): obs[3:6,x_t-3:x_t+4]=paddle if(self._reverse==True): - obs=1-obs + obs=-obs #plt.imshow(np.flip(obs,axis=0), cmap='gray_r') #plt.show() diff --git a/examples/simplest_test_PLI/test_env4.py b/examples/simplest_test_PLI/test_env4.py index e0f10d50..340e36c4 100644 --- a/examples/simplest_test_PLI/test_env4.py +++ b/examples/simplest_test_PLI/test_env4.py @@ -1,15 +1,7 @@ -""" Interface with the test environment +""" Simple maze environment Authors: Vincent Francois-Lavet -def encoder_model(self): - -def transition_model(self): - x = Dense(10, activation='tanh')(x) #5,15 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(10, activation='tanh')(x) # ,30 - """ import numpy as np import cv2 @@ -205,8 +197,13 @@ def summarizePerformance(self, test_data_set, learning_algo): fig = plt.figure() if(self.intern_dim==2): ax = fig.add_subplot(111) + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') else: ax = 
fig.add_subplot(111,projection='3d') + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') + ax.set_zlabel(r'$X_3$') #for j in range(3): # # Plot the trajectory @@ -221,15 +218,15 @@ def summarizePerformance(self, test_data_set, learning_algo): predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) if(self.intern_dim==2): - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.15", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.4", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.65", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.15", alpha=0.75) else: - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.15", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.4", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.65", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.15", alpha=0.75) # for xx in np.arange(self._size_maze)-self._size_maze//2: # for yy in np.arange(self._size_maze)-self._size_maze//2: @@ -497,9 +494,9 @@ def get_higher_dim_obs(self,indices_agent,indices_reward): def inTerminalState(self): - if((self._pos_agent[0]<=1 and self._cur_action==0) ):#((self._pos_agent==[4,1] and self._cur_action==1) or (self._pos_agent==[5,2] and (self._cur_action==1 or self._cur_action==2)) or (self._pos_agent==[6,3] and self._cur_action==2))): - #(self._pos_agent[1]>=self._size_maze-2 and self._cur_action==1) ): - return True +# if((self._pos_agent[0]<=1 and self._cur_action==0) ):#((self._pos_agent==[4,1] and self._cur_action==1) or (self._pos_agent==[5,2] and (self._cur_action==1 or 
self._cur_action==2)) or (self._pos_agent==[6,3] and self._cur_action==2))): +# #(self._pos_agent[1]>=self._size_maze-2 and self._cur_action==1) ): +# return True return False #if (self._pos_agent==self._pos_goal): # return True From 970b23ff8ab1541a5bb7b785cfc58306dcaa0bd5 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 4 May 2018 13:52:21 -0400 Subject: [PATCH 47/96] fix lr and ALE --- deer/q_networks/q_net_keras_lp.py | 17 ++++++----------- examples/ALE/run_ALE.py | 8 ++++---- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index c08e8a7f..b18bf6b9 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -702,19 +702,19 @@ def setLearningRate(self, lr): self._lr = lr print "modif lr" # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) - K.set_value(self.full_Q.optimizer.lr, 0)#self._lr) + K.set_value(self.full_Q.optimizer.lr, self._lr) - K.set_value(self.full_R.optimizer.lr, 0)#self._lr) - K.set_value(self.full_gamma.optimizer.lr, 0)#self._lr) + K.set_value(self.full_R.optimizer.lr, self._lr) + K.set_value(self.full_gamma.optimizer.lr, self._lr) K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr) if(self._high_int_dim==False): - K.set_value(self.force_features.optimizer.lr, 0)#self._lr/2.) + K.set_value(self.force_features.optimizer.lr, self._lr) K.set_value(self.encoder.optimizer.lr, self._lr) - K.set_value(self.encoder_diff.optimizer.lr, self._lr/2.) + K.set_value(self.encoder_diff.optimizer.lr, self._lr) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/1.) # /5. for simple laby or simple catcher; /1 for distrib of laby K.set_value(self.diff_sa_sa.optimizer.lr, 0) # 0 ! # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) @@ -747,11 +747,6 @@ def transfer(self, original, transfer, epochs=1): print self.encoder.test_on_batch(transfer[0][int(size*0.8):] , x_original[int(size*0.8):]) #print self.encoder.test_on_batch(original[0][int(size*0.8):] , x_original[int(size*0.8):] ) - #print "self.encoder.layers[-1].get_weights()" - #print self.encoder.layers[-1].get_weights() - #for l in self.encoder.layers[-5:]: - # l.trainable = True - # recompile with original loss self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_p) diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index 47667d3c..0e9f2737 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -160,10 +160,10 @@ class Defaults: # structure of the neural network having the best validation score. These dumps can then used to plot the evolution # of the validation and test scores (see below) or simply recover the resulting neural network for your # application. - agent.attach(bc.FindBestController( - validationID=ALE_env.VALIDATION_MODE, - testID=None, - unique_fname=fname)) +# agent.attach(bc.FindBestController( +# validationID=ALE_env.VALIDATION_MODE, +# testID=None, +# unique_fname=fname)) # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). 
We do not want From 9d246ea280acb9a95e8a6c27ccaefdd60a74cf81 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 10 May 2018 16:05:12 -0400 Subject: [PATCH 48/96] fix parser --- deer/default_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deer/default_parser.py b/deer/default_parser.py index 036ccf47..5c8f5b8c 100644 --- a/deer/default_parser.py +++ b/deer/default_parser.py @@ -91,7 +91,7 @@ def process_args(args, defaults): help=('If fixed seed (default: %(default)s)')) parser.add_argument('--no-deterministic', dest='deterministic', action='store_false', help=('If no fixed seed')) - parser.set_defaults(deterministic=True) + parser.set_defaults(deterministic=defaults.DETERMINISTIC) parameters = parser.parse_args(args) From 1ace48955557b50d0e4bf779a886f2e398a56f51 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 19 Jun 2018 10:46:11 -0400 Subject: [PATCH 49/96] some clean up --- deer/agent.py | 2 +- deer/q_networks/q_net_keras_lp.py | 30 +++++++++++++------------ examples/ALE/run_ALE.py | 2 +- examples/simplest_test_PLI/run_test3.py | 8 +++---- examples/simplest_test_PLI/test_env3.py | 6 ++--- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index c5b4ee5b..0dd1fcdd 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -76,7 +76,7 @@ def __init__(self, environment, q_network, replay_memory_size=1000000, replay_st self._selected_action = -1 self._state = [] for i in range(len(inputDims)): - self._state.append(np.zeros(inputDims[i])) + self._state.append(np.zeros(inputDims[i], dtype=float)) if (train_policy==None): self._train_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.1) else: diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index b18bf6b9..1fa32bc4 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -191,23 +191,25 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 states_val=list(states_val) next_states_val=list(next_states_val) - for i,o in enumerate(states_val): - if(o.ndim==5): #FIXME - states_val[i]=states_val[i][:,0,:,:,:]/128.-1 - for i,o in enumerate(next_states_val): - if(o.ndim==5): #FIXME - next_states_val[i]=next_states_val[i][:,0,:,:,:]/128.-1 + #for i,o in enumerate(states_val): + # if(o.ndim==5): #FIXME + # states_val[i]=states_val[i][:,0,:,:,:]/128.-1 + #for i,o in enumerate(next_states_val): + # if(o.ndim==5): #FIXME + # next_states_val[i]=next_states_val[i][:,0,:,:,:]/128.-1 Es_=self.encoder.predict(next_states_val) Es=self.encoder.predict(states_val) ETs=self.transition.predict([Es,onehot_actions]) R=self.R.predict([Es,onehot_actions]) - if(self.update_counter%100==0): + if(self.update_counter%500==0): print states_val[0][0] print "len(states_val)" print len(states_val) + print "next_states_val[0][0]" print next_states_val[0][0] + print "actions_val[0], rewards_val[0], terminals_val[0]" print actions_val[0], rewards_val[0], terminals_val[0] print "Es[0],ETs[0],Es_[0]" if(Es.ndim==4): @@ -341,7 +343,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals else: max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) - not_terminals=np.ones_like(terminals_val) - terminals_val + not_terminals=np.ones_like(terminals_val,dtype=float) - terminals_val target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) @@ -436,9 
+438,9 @@ def qValues(self, state_val): The q values for the provided belief state """ copy_state=copy.deepcopy(state_val) #Required because of the "hack" below - for i,o in enumerate(state): - if(o.ndim==4): #FIXME - copy_state[i]=copy_state[i][0,:,:,:]/128.-1 + #for i,o in enumerate(state): + # if(o.ndim==4): #FIXME + # copy_state[i]=copy_state[i][0,:,:,:]/128.-1 #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] return self.full_Q.predict([np.expand_dims(state,axis=0) for state in copy_state])[0] @@ -628,9 +630,9 @@ def chooseBestAction(self, state, mode): The best action : int """ copy_state=copy.deepcopy(state) #Required because of the "hack" below - for i,o in enumerate(state): - if(o.ndim==4): #FIXME - copy_state[i]=copy_state[i][0,:,:,:]/128.-1 + #for i,o in enumerate(state): + # if(o.ndim==4): #FIXME + # copy_state[i]=copy_state[i][0,:,:,:]/128.-1 if(mode==None): mode=0 diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index 0e9f2737..c210df58 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -90,7 +90,7 @@ class Defaults: internal_dim=3) train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) # --- Instantiate agent --- agent = NeuralAgent( diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_test3.py index ed92a4ea..6bc306c1 100644 --- a/examples/simplest_test_PLI/run_test3.py +++ b/examples/simplest_test_PLI/run_test3.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp_nstep import MyQNetwork +from deer.q_networks.q_net_keras_lp import MyQNetwork from test_env3 import MyEnv as test_env import deer.experiment.base_controllers as bc @@ -21,8 +21,8 @@ class Defaults: # ---------------------- # Experiment Parameters # ---------------------- - STEPS_PER_EPOCH = 1000 - EPOCHS = 30 + STEPS_PER_EPOCH = 2000 + EPOCHS = 20 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 @@ -54,7 +54,7 @@ class Defaults: DETERMINISTIC = False -HIGHER_DIM_OBS = False#True +HIGHER_DIM_OBS = True HIGH_INT_DIM = False if __name__ == "__main__": diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/test_env3.py index d9636817..649b29e4 100644 --- a/examples/simplest_test_PLI/test_env3.py +++ b/examples/simplest_test_PLI/test_env3.py @@ -28,7 +28,7 @@ def __init__(self, rng, **kwargs): self._height=10#15 self._width=10 #preferably an odd number so that it's symmetrical self._width_paddle=1 - self._nx_block=3#self._width#2 #number of different x positions of the falling blocks + self._nx_block=2#self._width#2 #number of different x positions of the falling blocks self._higher_dim_obs=kwargs["higher_dim_obs"] self._reverse=kwargs["reverse"] @@ -196,7 +196,7 @@ def summarizePerformance(self, test_data_set, learning_algo): # Plot the dots at each time step depending on the action taken length_block=self._height*(self._width-self._width_paddle+1) for i in range(self._nx_block): - line3 = ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.2) + line3 = 
ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.3) line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] zrange=axes_lims[2][1]-axes_lims[2][0] @@ -283,7 +283,7 @@ def summarizePerformance(self, test_data_set, learning_algo): #plt.show() #for ii in xrange(-15,345,30): # ax.view_init(elev=20., azim=ii) - # plt.savefig('fig_w_V_div5'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') + # plt.savefig('fig_w_V_div5_'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') # fig_visuV From 889e51b12ebf9456e9f446a31deff7ebeb823a29 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 20 Jun 2018 16:14:02 -0400 Subject: [PATCH 50/96] changing names simple envs --- deer/q_networks/q_net_keras_lp.py | 4 ++-- .../{test_env3.py => catcher_env.py} | 10 +++++----- .../{run_test3.py => run_catcher.py} | 14 +++++++------- .../{run_test4.py => run_simple_maze.py} | 8 ++++---- .../{test_env4.py => simple_maze_env.py} | 2 +- 5 files changed, 19 insertions(+), 19 deletions(-) rename examples/simplest_test_PLI/{test_env3.py => catcher_env.py} (98%) rename examples/simplest_test_PLI/{run_test3.py => run_catcher.py} (97%) rename examples/simplest_test_PLI/{run_test4.py => run_simple_maze.py} (97%) rename examples/simplest_test_PLI/{test_env4.py => simple_maze_env.py} (99%) diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/q_networks/q_net_keras_lp.py index 1fa32bc4..0628508c 100644 --- a/deer/q_networks/q_net_keras_lp.py +++ b/deer/q_networks/q_net_keras_lp.py @@ -711,12 +711,12 @@ def setLearningRate(self, lr): K.set_value(self.diff_Tx_x_.optimizer.lr, self._lr) if(self._high_int_dim==False): - K.set_value(self.force_features.optimizer.lr, self._lr) + K.set_value(self.force_features.optimizer.lr, 0)#self._lr) K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/1.) # /5. for simple laby or simple catcher; /1 for distrib of laby + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby K.set_value(self.diff_sa_sa.optimizer.lr, 0) # 0 ! # K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
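Regarding the Q-learning target handled in the train() hunks above: the terminal mask zeroes the bootstrap term, and with double_Q enabled the greedy action is chosen by the online network while its value is read from the target network. A small NumPy sketch of that target (batch size, action count and numbers are arbitrary):

    import numpy as np

    def q_targets(rewards, terminals, next_q_online, next_q_target, discount):
        """rewards, terminals: (batch,); next_q_*: (batch, n_actions)."""
        argmax_online = np.argmax(next_q_online, axis=1)                    # selection
        max_next = next_q_target[np.arange(len(rewards)), argmax_online]    # evaluation
        not_terminals = 1.0 - terminals.astype(float)                       # 0 where episode ended
        return rewards + not_terminals * discount * max_next

    r = np.array([0.0, 1.0]); t = np.array([0, 1])
    q_online = np.array([[1.0, 2.0, 0.0], [0.0, 0.0, 3.0]])
    q_target = np.array([[0.5, 1.5, 0.0], [0.0, 0.0, 2.0]])
    print(q_targets(r, t, q_online, q_target, 0.95))   # -> [1.425, 1.0]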
diff --git a/examples/simplest_test_PLI/test_env3.py b/examples/simplest_test_PLI/catcher_env.py similarity index 98% rename from examples/simplest_test_PLI/test_env3.py rename to examples/simplest_test_PLI/catcher_env.py index 649b29e4..933e397a 100644 --- a/examples/simplest_test_PLI/test_env3.py +++ b/examples/simplest_test_PLI/catcher_env.py @@ -1,4 +1,4 @@ -""" Interface with the test environment +""" Interface with the catcher environment Authors: Vincent Francois-Lavet """ @@ -280,10 +280,10 @@ def summarizePerformance(self, test_data_set, learning_algo): cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') cb1.set_label('Estimated expected return') - #plt.show() - #for ii in xrange(-15,345,30): - # ax.view_init(elev=20., azim=ii) - # plt.savefig('fig_w_V_div5_'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') + plt.show() + for ii in xrange(-15,345,30): + ax.view_init(elev=20., azim=ii) + plt.savefig('fig_w_V_div5_'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') # fig_visuV diff --git a/examples/simplest_test_PLI/run_test3.py b/examples/simplest_test_PLI/run_catcher.py similarity index 97% rename from examples/simplest_test_PLI/run_test3.py rename to examples/simplest_test_PLI/run_catcher.py index 6bc306c1..bc75386f 100644 --- a/examples/simplest_test_PLI/run_test3.py +++ b/examples/simplest_test_PLI/run_catcher.py @@ -1,4 +1,4 @@ -"""Catcher +""" Catcher launcher """ @@ -11,7 +11,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent from deer.q_networks.q_net_keras_lp import MyQNetwork -from test_env3 import MyEnv as test_env +from catcher_env import MyEnv as catcher_env import deer.experiment.base_controllers as bc from deer.policies import EpsilonGreedyPolicy @@ -54,7 +54,7 @@ class Defaults: DETERMINISTIC = False -HIGHER_DIM_OBS = True +HIGHER_DIM_OBS = False HIGH_INT_DIM = False if __name__ == "__main__": @@ -68,7 +68,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=False) + env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=False) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -164,7 +164,7 @@ class Defaults: # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every # [parameters.period_btw_summary_perfs] *validation* epochs. agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, + id=catcher_env.VALIDATION_MODE, epoch_length=parameters.steps_per_test, controllers_to_disable=[0, 1, 2, 3, 4], periodicity=2, @@ -211,7 +211,7 @@ class Defaults: # --- Instantiate environment with reverse=True --- - env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) + env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) # --- Re instantiate agent --- agent = NeuralAgent( @@ -274,7 +274,7 @@ class Defaults: # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every # [parameters.period_btw_summary_perfs] *validation* epochs. 
agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, + id=catcher_env.VALIDATION_MODE, epoch_length=parameters.steps_per_test, controllers_to_disable=[0, 1, 2, 3, 4], periodicity=2, diff --git a/examples/simplest_test_PLI/run_test4.py b/examples/simplest_test_PLI/run_simple_maze.py similarity index 97% rename from examples/simplest_test_PLI/run_test4.py rename to examples/simplest_test_PLI/run_simple_maze.py index 65a29c8e..ab72a1f7 100644 --- a/examples/simplest_test_PLI/run_test4.py +++ b/examples/simplest_test_PLI/run_simple_maze.py @@ -12,7 +12,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent from deer.q_networks.q_net_keras_lp import MyQNetwork -from test_env4 import MyEnv as test_env +from simple_maze_env import MyEnv as simple_maze_env import deer.experiment.base_controllers as bc from deer.policies import EpsilonGreedyPolicy @@ -68,7 +68,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env(rng, higher_dim_obs=False) + env = simple_maze_env(rng, higher_dim_obs=False) # --- Instantiate qnetwork --- qnetwork = MyQNetwork( @@ -179,7 +179,7 @@ class Defaults: # of the validation and test scores (see below) or simply recover the resulting neural network for your # application. agent.attach(bc.FindBestController( - validationID=test_env.VALIDATION_MODE, + validationID=simple_maze_env.VALIDATION_MODE, testID=None, unique_fname=fname)) @@ -192,7 +192,7 @@ class Defaults: # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every # [parameters.period_btw_summary_perfs] *validation* epochs. agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, + id=simple_maze_env.VALIDATION_MODE, epoch_length=parameters.steps_per_test, controllers_to_disable=[0, 1, 2, 3, 4], periodicity=2, diff --git a/examples/simplest_test_PLI/test_env4.py b/examples/simplest_test_PLI/simple_maze_env.py similarity index 99% rename from examples/simplest_test_PLI/test_env4.py rename to examples/simplest_test_PLI/simple_maze_env.py index 340e36c4..37673dac 100644 --- a/examples/simplest_test_PLI/test_env4.py +++ b/examples/simplest_test_PLI/simple_maze_env.py @@ -330,7 +330,7 @@ def summarizePerformance(self, test_data_set, learning_algo): #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - #plt.show() + plt.show() plt.savefig('fig_base_explo'+str(learning_algo.update_counter)+'.pdf') From 576633b95bf9e7294566c1ed64ed5b74cf38729f Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 13 Jul 2018 15:46:42 -0400 Subject: [PATCH 51/96] ALE_env_gym --- examples/ALE/ALE_env_gym.py | 123 ++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 examples/ALE/ALE_env_gym.py diff --git a/examples/ALE/ALE_env_gym.py b/examples/ALE/ALE_env_gym.py new file mode 100644 index 00000000..885494c8 --- /dev/null +++ b/examples/ALE/ALE_env_gym.py @@ -0,0 +1,123 @@ +""" Interface with the ALE environment + +Authors: Vincent Francois-Lavet +""" +import numpy as np +np.set_printoptions(threshold=np.nan) +import cv2 +#from ale_python_interface import ALEInterface +import gym +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt +from PIL import Image + +import gym + 
+class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self, rng, **kwargs): + """ Initialize environment. + + Arguments: + rng - the numpy random number generator + """ + self.env = gym.make('MontezumaRevenge-v4')#SpaceInvaders-v4')#Breakout-v4')#BeamRider-v4')#Qbert-v4')#Seaquest-v4')#Freeway-v4') + self._random_state=rng + self.env.reset() + frame_skip=kwargs.get('frame_skip',1) + self._frame_skip = frame_skip if frame_skip >= 1 else 1 + + self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) + + #plt.imshow(self._reduced_screen, cmap='gray') + #plt.show() + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + + + + def reset(self, mode): + if mode == self._mode: + # already in the right mode + self._mode_episode_count += 1 + else: + # switching mode + self._mode = mode + self._mode_score = 0.0 + self._mode_episode_count = 0 + + self.env.reset() + for _ in range(self._random_state.randint(15)): + action = self.env.action_space.sample() + + # this executes the environment with an action, + # and returns the observation of the environment, + # the reward, if the env is over, and other info. + observation, reward, self.terminal, info = self.env.step(action) + + self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) + self.state=np.zeros((84,84), dtype=np.uint8)#np.zeros((4,84,84), dtype=np.uint8) #FIXME + + return [1*[84 * [84 * [0]]]]#[1*[4 * [84 * [84 * [0]]]]] + + + def act(self, action): + #print "action" + #print action + + self.state=np.zeros((84,84), dtype=np.uint8)#np.zeros((4,84,84), dtype=np.uint8) + reward=0 + for t in range(4): + observation, r, self.terminal, info = self.env.step(action) + #print "observation, reward, self.terminal" + #print observation, reward, self.terminal + reward+=r + self._screen=np.average(observation,axis=-1) # Gray levels + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 + #plt.imshow(self._screen, cmap='gray') + #plt.show() + if self.inTerminalState(): + break + #self.state[t]=self._reduced_screen + self.state=self._reduced_screen + + self._mode_score += reward + return np.sign(reward) + + def summarizePerformance(self, test_data_set, learning_algo): + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) + + + def inputDimensions(self): + return [(1, 84, 84)]#[(1, 4, 84, 84)] + + def observationType(self, subject): + return np.uint8 + + def nActions(self): + print "self.env.action_space" + print self.env.action_space + return self.env.action_space.n + + def observe(self): + return [np.array(self.state, dtype=np.uint8)] + + def inTerminalState(self): + return self.terminal + + + +if __name__ == "__main__": + pass \ No newline at end of file From 378a7f52b6abd7c987c8b368c61b0bbcfbdd8ee5 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 10:24:19 -0400 Subject: [PATCH 52/96] Modif summarizePerformance function and mark terminal=True when end of episode because max number steps reached --- deer/agent.py | 10 +++++++--- examples/ALE/ALE_env.py | 2 +- examples/MG_two_storages/MG_two_storages_env.py | 2 +- examples/PLE/PLE_env.py | 2 +- examples/pendulum/pendulum_env.py | 2 +- 
examples/simplest_test_PLI/catcher_env.py | 2 +- examples/simplest_test_PLI/simple_maze_env.py | 6 +++--- examples/toy_env/Toy_env.py | 2 +- 8 files changed, 16 insertions(+), 12 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index 0dd1fcdd..8d774f10 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -180,7 +180,7 @@ def summarizeTestPerformance(self): if self._mode == -1: raise AgentError("Cannot summarize test performance outside test environment.") - self._environment.summarizePerformance(self._tmp_dataset, self._network) + self._environment.summarizePerformance(self._tmp_dataset, self._network, train_data_set=self._dataset) def train(self): """ @@ -334,9 +334,13 @@ def _runEpisode(self, maxSteps): if self._mode != -1: self._total_mode_reward += reward - is_terminal = self._environment.inTerminalState() + is_terminal = self._environment.inTerminalState() # If the transition ends up in a terminal state, mark transition as terminal + # Note that the new obs will not be stored, as it is unnecessary. - self._addSample(obs, action, reward, is_terminal) + if(maxSteps>0): + self._addSample(obs, action, reward, is_terminal) + else: + self._addSample(obs, action, reward, True) # If the episode ends because max number of steps is reached, mark the transition as terminal for c in self._controllers: c.onActionTaken(self) diff --git a/examples/ALE/ALE_env.py b/examples/ALE/ALE_env.py index 78a23b3c..7180a565 100644 --- a/examples/ALE/ALE_env.py +++ b/examples/ALE/ALE_env.py @@ -77,7 +77,7 @@ def act(self, action): self._mode_score += reward return np.sign(reward) - def summarizePerformance(self, test_data_set): + def summarizePerformance(self, test_data_set, *args, **kwargs): if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) diff --git a/examples/MG_two_storages/MG_two_storages_env.py b/examples/MG_two_storages/MG_two_storages_env.py index a74bb96a..5d3cfbbf 100644 --- a/examples/MG_two_storages/MG_two_storages_env.py +++ b/examples/MG_two_storages/MG_two_storages_env.py @@ -256,7 +256,7 @@ def nActions(self): def observe(self): return copy.deepcopy(self._last_ponctual_observation) - def summarizePerformance(self, test_data_set): + def summarizePerformance(self, test_data_set, *args, **kwargs): print("summary perf") print("self.hydrogen_storage: {}".format(self.hydrogen_storage)) observations = test_data_set.observations() diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 26fc9164..30b9cc78 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -89,7 +89,7 @@ def act(self, action): self._mode_score += self.reward return np.sign(self.reward) - def summarizePerformance(self, test_data_set, learning_algo): + def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): all_possib_inp=np.expand_dims(np.array(plot_all_frames.get_all_possib_inp(self.width,self.height)),axis=1)/256. #print "all_possib_inp[0]" print "all_possib_inp.shape" diff --git a/examples/pendulum/pendulum_env.py b/examples/pendulum/pendulum_env.py index 97da485e..eef3f302 100644 --- a/examples/pendulum/pendulum_env.py +++ b/examples/pendulum/pendulum_env.py @@ -113,7 +113,7 @@ def reset(self, mode=0): return self._last_observation - def summarizePerformance(self, test_data_set): + def summarizePerformance(self, test_data_set, *args, **kwargs): """ This function is called at every PERIOD_BTW_SUMMARY_PERFS. 
Arguments: diff --git a/examples/simplest_test_PLI/catcher_env.py b/examples/simplest_test_PLI/catcher_env.py index 933e397a..983d80fb 100644 --- a/examples/simplest_test_PLI/catcher_env.py +++ b/examples/simplest_test_PLI/catcher_env.py @@ -82,7 +82,7 @@ def act(self, action): self._mode_score += self.reward return self.reward - def summarizePerformance(self, test_data_set, learning_algo): + def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): #print "test_data_set.observations.shape" #print test_data_set.observations()[0][0:1] diff --git a/examples/simplest_test_PLI/simple_maze_env.py b/examples/simplest_test_PLI/simple_maze_env.py index 37673dac..82499be5 100644 --- a/examples/simplest_test_PLI/simple_maze_env.py +++ b/examples/simplest_test_PLI/simple_maze_env.py @@ -109,7 +109,7 @@ def act(self, action): self._mode_score += self.reward return self.reward - def summarizePerformance(self, test_data_set, learning_algo): + def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): #print "test_data_set.observations.shape" #print test_data_set.observations()[0][0:1] @@ -330,8 +330,8 @@ def summarizePerformance(self, test_data_set, learning_algo): #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - plt.show() - plt.savefig('fig_base_explo'+str(learning_algo.update_counter)+'.pdf') + #plt.show() + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') # # Plot the Q_vals diff --git a/examples/toy_env/Toy_env.py b/examples/toy_env/Toy_env.py index 7ee220d7..7b9ffa50 100644 --- a/examples/toy_env/Toy_env.py +++ b/examples/toy_env/Toy_env.py @@ -97,7 +97,7 @@ def act(self, action): return reward - def summarizePerformance(self, test_data_set): + def summarizePerformance(self, test_data_set, *args, **kwargs): """ This function is called at every PERIOD_BTW_SUMMARY_PERFS. Parameters From d46319a2349cfb212d7df0b46c958f1796df1a20 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 10:35:02 -0400 Subject: [PATCH 53/96] remove ALE_gym --- examples/ALE/ALE_env_gym.py | 123 ------------------------------------ 1 file changed, 123 deletions(-) delete mode 100644 examples/ALE/ALE_env_gym.py diff --git a/examples/ALE/ALE_env_gym.py b/examples/ALE/ALE_env_gym.py deleted file mode 100644 index 885494c8..00000000 --- a/examples/ALE/ALE_env_gym.py +++ /dev/null @@ -1,123 +0,0 @@ -""" Interface with the ALE environment - -Authors: Vincent Francois-Lavet -""" -import numpy as np -np.set_printoptions(threshold=np.nan) -import cv2 -#from ale_python_interface import ALEInterface -import gym -from deer.base_classes import Environment - -import matplotlib -matplotlib.use('qt5agg') -from mpl_toolkits.axes_grid1 import host_subplot -import mpl_toolkits.axisartist as AA -import matplotlib.pyplot as plt -from PIL import Image - -import gym - -class MyEnv(Environment): - VALIDATION_MODE = 0 - - def __init__(self, rng, **kwargs): - """ Initialize environment. 
- - Arguments: - rng - the numpy random number generator - """ - self.env = gym.make('MontezumaRevenge-v4')#SpaceInvaders-v4')#Breakout-v4')#BeamRider-v4')#Qbert-v4')#Seaquest-v4')#Freeway-v4') - self._random_state=rng - self.env.reset() - frame_skip=kwargs.get('frame_skip',1) - self._frame_skip = frame_skip if frame_skip >= 1 else 1 - - self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) - self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) - - #plt.imshow(self._reduced_screen, cmap='gray') - #plt.show() - - self._mode = -1 - self._mode_score = 0.0 - self._mode_episode_count = 0 - - - - def reset(self, mode): - if mode == self._mode: - # already in the right mode - self._mode_episode_count += 1 - else: - # switching mode - self._mode = mode - self._mode_score = 0.0 - self._mode_episode_count = 0 - - self.env.reset() - for _ in range(self._random_state.randint(15)): - action = self.env.action_space.sample() - - # this executes the environment with an action, - # and returns the observation of the environment, - # the reward, if the env is over, and other info. - observation, reward, self.terminal, info = self.env.step(action) - - self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) - self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) - self.state=np.zeros((84,84), dtype=np.uint8)#np.zeros((4,84,84), dtype=np.uint8) #FIXME - - return [1*[84 * [84 * [0]]]]#[1*[4 * [84 * [84 * [0]]]]] - - - def act(self, action): - #print "action" - #print action - - self.state=np.zeros((84,84), dtype=np.uint8)#np.zeros((4,84,84), dtype=np.uint8) - reward=0 - for t in range(4): - observation, r, self.terminal, info = self.env.step(action) - #print "observation, reward, self.terminal" - #print observation, reward, self.terminal - reward+=r - self._screen=np.average(observation,axis=-1) # Gray levels - self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 - #plt.imshow(self._screen, cmap='gray') - #plt.show() - if self.inTerminalState(): - break - #self.state[t]=self._reduced_screen - self.state=self._reduced_screen - - self._mode_score += reward - return np.sign(reward) - - def summarizePerformance(self, test_data_set, learning_algo): - if self.inTerminalState() == False: - self._mode_episode_count += 1 - print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) - - - def inputDimensions(self): - return [(1, 84, 84)]#[(1, 4, 84, 84)] - - def observationType(self, subject): - return np.uint8 - - def nActions(self): - print "self.env.action_space" - print self.env.action_space - return self.env.action_space.n - - def observe(self): - return [np.array(self.state, dtype=np.uint8)] - - def inTerminalState(self): - return self.terminal - - - -if __name__ == "__main__": - pass \ No newline at end of file From 0157ae6ff6b81e2b71f3dd6e7e60aec9a785a292 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 10:43:01 -0400 Subject: [PATCH 54/96] removing theano --- deer/q_networks/NN_lasagne.py | 202 -------------------- deer/q_networks/NN_theano.py | 251 ------------------------ deer/q_networks/q_net_theano.py | 314 ------------------------------- deer/q_networks/theano_layers.py | 137 -------------- deer/q_networks/updates.py | 86 --------- 5 files changed, 990 deletions(-) delete mode 100644 deer/q_networks/NN_lasagne.py delete mode 100644 deer/q_networks/NN_theano.py delete 
mode 100644 deer/q_networks/q_net_theano.py delete mode 100644 deer/q_networks/theano_layers.py delete mode 100644 deer/q_networks/updates.py diff --git a/deer/q_networks/NN_lasagne.py b/deer/q_networks/NN_lasagne.py deleted file mode 100644 index 8870ac12..00000000 --- a/deer/q_networks/NN_lasagne.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Neural network using Lasagne (called by q_net_theano) - -.. Authors: Vincent Francois-Lavet, David Taralla -""" - -import lasagne -import numpy as np -import theano -import theano.tensor as T - - -class NN(): - """ - Deep Q-learning network using Lasagne on top of Theano - - Parameters - ----------- - batch_size : int - Number of tuples taken into account for each iteration of gradient descent - input_dimensions : - n_actions : - random_state : numpy random number generator - """ - def __init__(self, batch_size, input_dimensions, n_actions, random_state): - self._input_dimensions=input_dimensions - self._batch_size=batch_size - self._random_state=random_state - self._n_actions=n_actions - - def _buildDQN(self, inputs): - """ - Build a network consistent with each type of inputs - """ - if ("gpu" in theano.config.device): - from lasagne.layers.cuda_convnet import Conv2DCCLayer - conv2DFunc = Conv2DCCLayer - else: - conv2DFunc = lasagne.layers.Conv2DLayer - - l_outs_conv=[] - for i, dim in enumerate(self._input_dimensions): - # - observation[i] is a FRAME - - if len(dim) == 3: - # Building here for 3D - l_in = lasagne.layers.InputLayer( - shape=(self._batch_size,) + dim, - input_var=inputs[i], - ) - - l_conv1 = conv2DFunc( - l_in, - num_filters=32, - filter_size=(8, 8), - stride=(4, 4), - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - l_conv2 = conv2DFunc( - l_conv1, - num_filters=64, - filter_size=(4, 4), - stride=(2, 2), - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - l_conv3 = conv2DFunc( - l_conv2, - num_filters=64, - filter_size=(3, 3), - stride=(1, 1), - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - l_outs_conv.append(l_conv3) - - # - observation[i] is a VECTOR - - elif len(dim) == 2 and dim[0] > 3: - # Building here for 2D - l_in = lasagne.layers.InputLayer( - shape=(self._batch_size, 1) + dim, - input_var=inputs[i].reshape((self._batch_size, 1) + dim), - ) - - l_conv1 = conv2DFunc( - l_in, - num_filters=16, - filter_size=(2, 1),#filter_size=(8, 8), - stride=(1, 1),#stride=(4, 4), - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - l_conv2 = conv2DFunc( - l_conv1, - num_filters=16, - filter_size=(2, 2), - stride=(1, 1), - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0), - ) - l_outs_conv.append(l_conv2) - - # - observation[i] is a SCALAR - - else: - if dim[0] > 3: - # Building here for 1D - l_in = lasagne.layers.InputLayer( - shape=(self._batch_size, 1) + dim, - input_var=inputs[i].reshape((self._batch_size, 1) + dim), - ) - - l_conv1 = lasagne.layers.Conv1DLayer( - l_in, - num_filters=8,#32, - filter_size=2,#filter_size=(8, 8), - stride=1,#stride=(4, 4), - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), # Defaults to Glorot - b=lasagne.init.Constant(.0) - ) - - l_conv2 = lasagne.layers.Conv1DLayer( - l_conv1, - num_filters=8,#64, - filter_size=2,#filter_size=(4, 4), - stride=1,#stride=(2, 2), - 
nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - l_outs_conv.append(l_conv2) - else: - # Building here for 1D simple - l_in = lasagne.layers.InputLayer( - shape=(self._batch_size, 1) + dim, - input_var=inputs[i].reshape((self._batch_size, 1) + dim), - ) - - l_outs_conv.append(l_in) - - ## Custom merge of layers - ## NB : l_output_conv=lasagne.layers.MergeLayer(l_outs_conv) gives NOT IMPLEMENTED ERROR - output_conv = lasagne.layers.get_output(l_outs_conv[0]).flatten().reshape((self._batch_size, np.prod(l_outs_conv[0].output_shape[1:]))) - shapes = [np.prod(l_outs_conv[0].output_shape[1:])] - - if (len(l_outs_conv)>1): - for l_out_conv in l_outs_conv[1:]: - output_conv=T.concatenate((output_conv, lasagne.layers.get_output(l_out_conv).flatten().reshape((self._batch_size, np.prod(l_out_conv.output_shape[1:])))) , axis=1) - shapes.append(np.prod(l_out_conv.output_shape[1:])) - - shape = sum(shapes) - - l_output_conv = lasagne.layers.InputLayer( - shape=([self._batch_size, shape]), - input_var=output_conv, - ) - - l_hidden1 = lasagne.layers.DenseLayer( - l_output_conv, - num_units=50,#512, - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - l_hidden2 = lasagne.layers.DenseLayer( - l_hidden1, - num_units=20,#50,#512, - nonlinearity=lasagne.nonlinearities.rectify, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - l_out = lasagne.layers.DenseLayer( - l_hidden2, - num_units=self._n_actions, - nonlinearity=None, - W=lasagne.init.HeUniform(), - b=lasagne.init.Constant(.0) - ) - - params = lasagne.layers.helper.get_all_params(l_out) - - for conv_param in l_outs_conv: - for p in lasagne.layers.helper.get_all_params(conv_param): - params.append(p) - - - return lasagne.layers.get_output(l_out), params, shapes - - -if __name__ == '__main__': - pass diff --git a/deer/q_networks/NN_theano.py b/deer/q_networks/NN_theano.py deleted file mode 100644 index dded66f3..00000000 --- a/deer/q_networks/NN_theano.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Neural network using Theano (called by q_net_theano) - -.. 
Authors: Vincent Francois-Lavet, David Taralla -""" - -import numpy as np -import theano -import theano.tensor as T - -from .theano_layers import ConvolutionalLayer,HiddenLayer - - -class NN(): - """ - Deep Q-learning network using Theano - - Parameters - ----------- - batch_size : int - Number of tuples taken into account for each iteration of gradient descent - input_dimensions : - n_Actions : - random_state : numpy random number generator - """ - def __init__(self, batch_size, input_dimensions, n_actions, random_state): - self._input_dimensions=input_dimensions - self._batch_size=batch_size - self._random_state=random_state - self._n_actions=n_actions - - def _buildDQN(self, inputs): - """ - Build a network consistent with each type of inputs - """ - layers=[] - outs_conv=[] - outs_conv_shapes=[] - - for i, dim in enumerate(self._input_dimensions): - nfilter=[] - - # - observation[i] is a FRAME - - if len(dim) == 3: - - ### First layer - newR = dim[1] - newC = dim[2] - fR=4 # filter Rows - fC=4 # filter Column - pR=2 # pool Rows - pC=2 # pool Column - nfilter.append(8) - stride_size=2 - l_conv1 = ConvolutionalLayer( - rng=self._random_state, - input=inputs[i].reshape((self._batch_size,dim[0],newR,newC)), - filter_shape=(nfilter[0],dim[0],fR,fC), - image_shape=(self._batch_size,dim[0],newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv1) - - newR = (newR - fR + 1 - pR) // stride_size + 1 - newC = (newC - fC + 1 - pC) // stride_size + 1 - - ### Second layer - fR=4 # filter Rows - fC=4 # filter Column - pR=2 # pool Rows - pC=2 # pool Column - nfilter.append(16) - stride_size=2 - l_conv2 = ConvolutionalLayer( - rng=self._random_state, - input=l_conv1.output.reshape((self._batch_size,nfilter[0],newR,newC)), - filter_shape=(nfilter[1],nfilter[0],fR,fC), - image_shape=(self._batch_size,nfilter[0],newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv2) - - newR = (newR - fR + 1 - pR) // stride_size + 1 - newC = (newC - fC + 1 - pC) // stride_size + 1 - - ### Third layer - fR=3 # filter Rows - fC=3 # filter Column - pR=1 # pool Rows - pC=1 # pool Column - nfilter.append(16) - stride_size=1 - l_conv3 = ConvolutionalLayer( - rng=self._random_state, - input=l_conv2.output.reshape((self._batch_size,nfilter[1],newR,newC)), - filter_shape=(nfilter[2],nfilter[1],fR,fC), - image_shape=(self._batch_size,nfilter[1],newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv3) - - newR = (newR - fR + 1 - pR) // stride_size + 1 - newC = (newC - fC + 1 - pC) // stride_size + 1 - - outs_conv.append(l_conv3.output) - outs_conv_shapes.append((nfilter[2],newR,newC)) - - - # - observation[i] is a VECTOR - - elif len(dim) == 2 and dim[0] > 3: - - newR = dim[0] - newC = dim[1] - - fR=2 # filter Rows - fC=1 # filter Column - pR=1 # pool Rows - pC=1 # pool Column - nfilter.append(16) - stride_size=1 - - l_conv1 = ConvolutionalLayer( - rng=self._random_state, - input=inputs[i].reshape((self._batch_size,1,newR,newC)), - filter_shape=(nfilter[0],1,fR,fC), - image_shape=(self._batch_size,1,newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv1) - - newR = (newR - fR + 1 - pR) // stride_size + 1 # stride 2 - newC = (newC - fC + 1 - pC) // stride_size + 1 # stride 2 - - fR=2 # filter Rows - fC=2 # filter Column - pR=1 # pool Rows - pC=1 # pool Column - nfilter.append(16) - stride_size=1 - - l_conv2 = ConvolutionalLayer( - rng=self._random_state, - 
input=l_conv1.output.reshape((self._batch_size,nfilter[0],newR,newC)), - filter_shape=(nfilter[1],nfilter[0],fR,fC), - image_shape=(self._batch_size,nfilter[0],newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv2) - - newR = (newR - fR + 1 - pR) // stride_size + 1 # stride 2 - newC = (newC - fC + 1 - pC) // stride_size + 1 # stride 2 - - outs_conv.append(l_conv2.output) - outs_conv_shapes.append((nfilter[1],newR,newC)) - - - # - observation[i] is a SCALAR - - else: - if dim[0] > 3: - newR = 1 - newC = dim[0] - - fR=1 # filter Rows - fC=2 # filter Column - pR=1 # pool Rows - pC=1 # pool Column - nfilter.append(8) - stride_size=1 - - l_conv1 = ConvolutionalLayer( - rng=self._random_state, - input=inputs[i].reshape((self._batch_size,1,newR,newC)), - filter_shape=(nfilter[0],1,fR,fC), - image_shape=(self._batch_size,1,newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv1) - - newC = (newC - fC + 1 - pC) // stride_size + 1 # stride 2 - - fR=1 # filter Rows - fC=2 # filter Column - pR=1 # pool Rows - pC=1 # pool Column - nfilter.append(8) - stride_size=1 - - l_conv2 = ConvolutionalLayer( - rng=self._random_state, - input=l_conv1.output.reshape((self._batch_size,nfilter[0],newR,newC)), - filter_shape=(nfilter[1],nfilter[0],fR,fC), - image_shape=(self._batch_size,nfilter[0],newR,newC), - poolsize=(pR,pC), - stride=(stride_size,stride_size) - ) - layers.append(l_conv2) - - newC = (newC - fC + 1 - pC) // stride_size + 1 # stride 2 - - outs_conv.append(l_conv2.output) - outs_conv_shapes.append((nfilter[1],newC)) - - else: - if(len(dim) == 2): - outs_conv_shapes.append((dim[0],dim[1])) - elif(len(dim) == 1): - outs_conv_shapes.append((1,dim[0])) - outs_conv.append(inputs[i]) - - - ## Custom merge of layers - output_conv = outs_conv[0].flatten().reshape((self._batch_size, np.prod(outs_conv_shapes[0]))) - shapes=np.prod(outs_conv_shapes[0]) - - if (len(outs_conv)>1): - for out_conv,out_conv_shape in zip(outs_conv[1:],outs_conv_shapes[1:]): - output_conv=T.concatenate((output_conv, out_conv.flatten().reshape((self._batch_size, np.prod(out_conv_shape)))) , axis=1) - shapes+=np.prod(out_conv_shape) - shapes - - - self.hiddenLayer1 = HiddenLayer(rng=self._random_state, input=output_conv, - n_in=shapes, n_out=50, - activation=T.tanh) - layers.append(self.hiddenLayer1) - - self.hiddenLayer2 = HiddenLayer(rng=self._random_state, input=self.hiddenLayer1.output, - n_in=50, n_out=20, - activation=T.tanh) - layers.append(self.hiddenLayer2) - - self.outLayer = HiddenLayer(rng=self._random_state, input=self.hiddenLayer2.output, - n_in=20, n_out=self._n_actions, - activation=None) - layers.append(self.outLayer) - - # Grab all the parameters together. - params = [param - for layer in layers - for param in layer.params] - - return self.outLayer.output, params, outs_conv_shapes - -if __name__ == '__main__': - pass diff --git a/deer/q_networks/q_net_theano.py b/deer/q_networks/q_net_theano.py deleted file mode 100644 index 058608f0..00000000 --- a/deer/q_networks/q_net_theano.py +++ /dev/null @@ -1,314 +0,0 @@ -""" -Code for general deep Q-learning using Theano that can take as inputs scalars, vectors and matrices - -.. Authors: Vincent Francois-Lavet, David Taralla - -.. Inspired from "Human-level control through deep reinforcement learning", -.. 
Nature, 518(7540):529-533, February 2015 -""" - -import numpy as np -import theano -import theano.tensor as T -from .updates import deepmind_rmsprop -from ..base_classes import QNetwork -from .NN_theano import NN # Default Neural network used - -class MyQNetwork(QNetwork): - """ - Deep Q-learning network using Theano - - Parameters - ----------- - environment : object from class Environment - rho : float - Parameter for rmsprop. Default : 0.9 - rms_epsilon : float - Parameter for rmsprop. Default : 0.0001 - momentum : float - Not implemented. - clip_delta : float - If > 0, the squared loss is linear past the clip point which keeps the gradient constant. Default : 0 - freeze_interval : int - Period during which the target network is freezed and after which the target network is updated. Default : 1000 - batch_size : int - Number of tuples taken into account for each iteration of gradient descent. Default : 32 - update_rule: str - {sgd,rmsprop}. Default : rmsprop - random_state : numpy random number generator - Default : random seed. - double_Q : bool - Activate or not the DoubleQ learning : not implemented yet. Default : False - More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. - neural_network : object - default is deer.qnetworks.NN_theano - """ - - def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): - """ Initialize environment - - """ - QNetwork.__init__(self,environment, batch_size) - - self._rho = rho - self._rms_epsilon = rms_epsilon - self._momentum = momentum - self._clip_delta = clip_delta - self._freeze_interval = freeze_interval - self._double_Q = double_Q - self._random_state = random_state - - self.update_counter = 0 - - states=[] # list of symbolic variables for each of the k element in the belief state - # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] - next_states=[] # idem than states at t+1 - self.states_shared=[] # list of shared variable for each of the k element in the belief state - self.next_states_shared=[] # idem that self.states_shared at t+1 - - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - states.append(T.tensor4("%s_%s" % ("state", i))) - next_states.append(T.tensor4("%s_%s" % ("next_state", i))) - - elif len(dim) == 2: - states.append(T.tensor3("%s_%s" % ("state", i))) - next_states.append(T.tensor3("%s_%s" % ("next_state", i))) - - elif len(dim) == 1: - states.append( T.matrix("%s_%s" % ("state", i)) ) - next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) - - self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) - self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) - - print("Number of observations per state: {}".format(len(self.states_shared))) - print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions)) - - rewards = T.col('rewards') - actions = T.icol('actions') - terminals = T.icol('terminals') - thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) - thelr = T.scalar(name='thelr', dtype=theano.config.floatX) - - Q_net=neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) - self.q_vals, self.params, shape_after_conv = 
Q_net._buildDQN(states) - - print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) - - self.next_q_vals, self.next_params, shape_after_conv = Q_net._buildDQN(next_states) - self._resetQHat() - - self.rewards_shared = theano.shared( - np.zeros((batch_size, 1), dtype=theano.config.floatX), - broadcastable=(False, True)) - - self.actions_shared = theano.shared( - np.zeros((batch_size, 1), dtype='int32'), - broadcastable=(False, True)) - - self.terminals_shared = theano.shared( - np.zeros((batch_size, 1), dtype='int32'), - broadcastable=(False, True)) - - - if(self._double_Q==True): - givens_next={} - for i, x in enumerate(self.next_states_shared): - givens_next[ states[i] ] = x - - self.next_q_vals_current_qnet=theano.function([], self.q_vals, - givens=givens_next) - - next_q_curr_qnet = theano.clone(self.next_q_vals) - - argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True) - - max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1)) - - else: - max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True) - - - not_terminals=T.ones_like(terminals) - terminals - - target = rewards + not_terminals * thediscount * max_next_q_vals - - q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) - # Note : Strangely (target - q_val) lead to problems with python 3.5, theano 0.8.0rc and floatX=float32... - diff = - q_val + target - - if self._clip_delta > 0: - # This loss function implementation is taken from - # https://github.com/spragunr/deep_q_rl - # If we simply take the squared clipped diff as our loss, - # then the gradient will be zero whenever the diff exceeds - # the clip bounds. To avoid this, we extend the loss - # linearly past the clip point to keep the gradient constant - # in that regime. - # - # This is equivalent to declaring d loss/d q_vals to be - # equal to the clipped diff, then backpropagating from - # there, which is what the DeepMind implementation does. - quadratic_part = T.minimum(abs(diff), self._clip_delta) - linear_part = abs(diff) - quadratic_part - loss_ind = 0.5 * quadratic_part ** 2 + self._clip_delta * linear_part - else: - loss_ind = 0.5 * diff ** 2 - - loss = T.mean(loss_ind) - - givens = { - rewards: self.rewards_shared, - actions: self.actions_shared, ## actions not needed! - terminals: self.terminals_shared - } - - for i, x in enumerate(self.states_shared): - givens[ states[i] ] = x - for i, x in enumerate(self.next_states_shared): - givens[ next_states[i] ] = x - - - gparams=[] - for p in self.params: - gparam = T.grad(loss, p) - gparams.append(gparam) - - updates = [] - - if update_rule == 'deepmind_rmsprop': - updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self._rho, - self._rms_epsilon) - elif update_rule == 'rmsprop': - for i,(p, g) in enumerate(zip(self.params, gparams)): - acc = theano.shared(p.get_value() * 0.) 
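                # Plain RMSProp: keep a per-parameter running average of squared gradients,
                # acc_new = rho * acc + (1 - rho) * g**2, then rescale each gradient by
                # sqrt(acc_new + rms_epsilon) before applying the update p <- p - lr * g.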
- acc_new = rho * acc + (1 - self._rho) * g ** 2 - gradient_scaling = T.sqrt(acc_new + self._rms_epsilon) - g = g / gradient_scaling - updates.append((acc, acc_new)) - updates.append((p, p - thelr * g)) - - elif update_rule == 'sgd': - for i, (param, gparam) in enumerate(zip(self.params, gparams)): - updates.append((param, param - thelr * gparam)) - else: - raise ValueError("Unrecognized update: {}".format(update_rule)) - - - if(self._double_Q==True): - self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates, - givens=givens, - on_unused_input='warn') - else: - self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates, - givens=givens, - on_unused_input='warn') - givens2={} - for i, x in enumerate(self.states_shared): - givens2[ states[i] ] = x - - self._q_vals = theano.function([], self.q_vals, - givens=givens2, - on_unused_input='warn') - - - def getAllParams(self): - params_value=[] - for i,p in enumerate(self.params): - params_value.append(p.get_value()) - return params_value - - def setAllParams(self, list_of_values): - for i,p in enumerate(self.params): - p.set_value(list_of_values[i]) - - def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): - """ - Train one batch. - - 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared - 2. perform batch training - - Parameters - ----------- - states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - actions_val : b x 1 numpy array of integers - rewards_val : b x 1 numpy array - next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - terminals_val : b x 1 numpy boolean array - - Returns - ------- - Average loss of the batch training (RMSE) - Individual (square) losses for each tuple - """ - - for i in range(len(self.states_shared)): - self.states_shared[i].set_value(states_val[i]) - - for i in range(len(self.states_shared)): - self.next_states_shared[i].set_value(next_states_val[i]) - - #print rewards_val - #print actions_val - self.actions_shared.set_value(actions_val.reshape(len(actions_val), 1)) - self.rewards_shared.set_value(rewards_val.reshape(len(rewards_val), 1)) - self.terminals_shared.set_value(terminals_val.reshape(len(terminals_val), 1)) - if self.update_counter % self._freeze_interval == 0: - self._resetQHat() - - #print self._q_vals() - - if(self._double_Q==True): - self._next_q_curr_qnet = self.next_q_vals_current_qnet() - loss, loss_ind, _ = self._train(self._df, self._lr,self._next_q_curr_qnet) - else: - loss, loss_ind, _ = self._train(self._df, self._lr) - - #print "after self._q_vals" - #print self._q_vals() - - self.update_counter += 1 - - # NB : loss=np.average(loss_ind) - return np.sqrt(loss),loss_ind - - def qValues(self, state_val): - """ Get the q values for one belief state - - Arguments - --------- - state_val : one belief state - - Returns - ------- - The q values for the provided belief state - """ - # Set the first element of the batch to values provided by state_val - for i in range(len(self.states_shared)): - aa = self.states_shared[i].get_value() - aa[0] = state_val[i] - self.states_shared[i].set_value(aa) - - return self._q_vals()[0] - - def chooseBestAction(self, state): - """ Get the best action for a belief state - - Arguments - --------- - state : one belief state - - Returns - ------- - The best action : int - """ - q_vals 
= self.qValues(state) - - return np.argmax(q_vals),np.max(q_vals) - - def _resetQHat(self): - for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): - next_param.set_value(param.get_value()) - diff --git a/deer/q_networks/theano_layers.py b/deer/q_networks/theano_layers.py deleted file mode 100644 index 9721e827..00000000 --- a/deer/q_networks/theano_layers.py +++ /dev/null @@ -1,137 +0,0 @@ -import numpy - -import theano -import theano.tensor as T -from theano.tensor.signal import pool -from theano.tensor.nnet import conv - - -class ConvolutionalLayer(object): - """Pool Layer of a convolutional network """ - - def __init__(self, rng, input, filter_shape, image_shape, poolsize=(1, 1), stride=(1,1)): - """ - Allocate a LeNetConvPoolLayer with shared variable internal parameters. - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dtensor4 - :param input: symbolic image tensor, of shape image_shape - - :type filter_shape: tuple or list of length 4 - :param filter_shape: (number of filters, num input feature maps, - filter height,filter width) - - :type image_shape: tuple or list of length 4 - :param image_shape: (batch size, num input feature maps, - image height, image width) - - :type poolsize: tuple or list of length 2 - :param poolsize: the downsampling (pooling) factor (#rows,#cols) - """ - - assert image_shape[1] == filter_shape[1] - self.input = input - - # there are "num input feature maps * filter height * filter width" - # inputs to each hidden unit - fan_in = numpy.prod(filter_shape[1:]) - # each unit in the lower layer receives a gradient from: - # "num output feature maps * filter height * filter width" / - # pooling size - fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / - numpy.prod(poolsize)) - # initialize weights with random weights - W_bound = numpy.sqrt(6. / (fan_in + fan_out)) - self.W = theano.shared(numpy.asarray( - rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), - dtype=theano.config.floatX), - borrow=True) - - # the bias is a 1D tensor -- one bias per output feature map - b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) - self.b = theano.shared(value=b_values, borrow=True) - - # convolve input feature maps with filters - conv_out = conv.conv2d(input=input, filters=self.W, - filter_shape=filter_shape, image_shape=image_shape) - - # downsample each feature map individually, using maxpooling - pooled_out = pool.pool_2d(input=conv_out, - ds=poolsize, ignore_border=True, st=stride) - - # add the bias term. Since the bias is a vector (1D array), we first - # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will - # thus be broadcasted across mini-batches and feature map - # width & height - self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) - - # store parameters of this layer - self.params = [self.W, self.b] - - -class HiddenLayer(object): - def __init__(self, rng, input, n_in, n_out, W=None, b=None, - activation=T.tanh): - """ - Typical hidden layer of a MLP: units are fully-connected and have - sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) - and the bias vector b is of shape (n_out,). 
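        A small usage sketch (hypothetical sizes, assuming Theano is available):

            x = T.matrix('x')
            layer = HiddenLayer(rng=numpy.random.RandomState(0), input=x, n_in=20, n_out=50)
            f = theano.function([x], layer.output)   # maps a (n_examples, 20) batch to (n_examples, 50)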
- - NOTE : The nonlinearity used here is tanh - - Hidden unit activation is given by: tanh(dot(input,W) + b) - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dmatrix - :param input: a symbolic tensor of shape (n_examples, n_in) - - :type n_in: int - :param n_in: dimensionality of input - - :type n_out: int - :param n_out: number of hidden units - - :type activation: theano.Op or function - :param activation: Non linearity to be applied in the hidden - layer - """ - self.input = input - - # `W` is initialized with `W_values` which is uniformely sampled - # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) - # for tanh activation function - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - # Note : optimal initialization of weights is dependent on the - # activation function used (among other things). - # For example, results presented in [Xavier10] suggest that you - # should use 4 times larger initial weights for sigmoid - # compared to tanh - # We have no info for other function, so we use the same as - # tanh. - if W is None: - W_values = numpy.asarray(rng.uniform( - low=-numpy.sqrt(6. / (n_in + n_out)), - high=numpy.sqrt(6. / (n_in + n_out)), - size=(n_in, n_out)), dtype=theano.config.floatX) - if activation == theano.tensor.nnet.sigmoid: - W_values *= 4 - - W = theano.shared(value=W_values, name='W', borrow=True) - - if b is None: - b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) - b = theano.shared(value=b_values, name='b', borrow=True) - - self.W = W - self.b = b - - lin_output = T.dot(input, self.W) + self.b - self.output = (lin_output if activation is None - else activation(lin_output)) - # parameters of the model - self.params = [self.W, self.b] diff --git a/deer/q_networks/updates.py b/deer/q_networks/updates.py deleted file mode 100644 index 2b169226..00000000 --- a/deer/q_networks/updates.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Gradient update rules for the deep_q_rl package. - -Some code here is modified from the Lasagne package: - -https://github.com/Lasagne/Lasagne/blob/master/LICENSE - -Copyright (c) 2014 Sander Dieleman -The MIT License (MIT) - -""" - -import theano -import theano.tensor as T -from collections import OrderedDict -import numpy as np - - - -def deepmind_rmsprop(loss_or_grads, params, grads, learning_rate, - rho, epsilon): - """RMSProp updates [1]_. - - Scale learning rates by dividing with the moving average of the root mean - squared (RMS) gradients. - - Parameters - ---------- - loss_or_grads : symbolic expression or list of expressions - A scalar loss expression, or a list of gradient expressions - params : list of shared variables - The variables to generate update expressions for - learning_rate : float or symbolic scalar - The learning rate controlling the size of update steps - rho : float or symbolic scalar - Gradient moving average decay factor - epsilon : float or symbolic scalar - Small value added for numerical stability - - Returns - ------- - OrderedDict - A dictionary mapping each parameter to its update expression - - Notes - ----- - `rho` should be between 0 and 1. A value of `rho` close to 1 will decay the - moving average slowly and a value close to 0 will decay the moving average - fast. - - Using the step size :math:`\\eta` and a decay factor :math:`\\rho` the - learning rate :math:`\\eta_t` is calculated as: - - .. 
math:: - r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\ - \\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}} - - References - ---------- - .. [1] Tieleman, T. and Hinton, G. (2012): - Neural Networks for Machine Learning, Lecture 6.5 - rmsprop. - Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20) - """ - - updates = OrderedDict() - - for param, grad in zip(params, grads): - value = param.get_value(borrow=True) - - acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype), - broadcastable=param.broadcastable) - acc_grad_new = rho * acc_grad + (1 - rho) * grad - - acc_rms = theano.shared(np.zeros(value.shape, dtype=value.dtype), - broadcastable=param.broadcastable) - acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2 - - - updates[acc_grad] = acc_grad_new - updates[acc_rms] = acc_rms_new - - updates[param] = (param - learning_rate * - (grad / - T.sqrt(acc_rms_new - acc_grad_new **2 + epsilon))) - - return updates From b2fd82c6677f4559b12a091a902a674322627029 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 10:57:35 -0400 Subject: [PATCH 55/96] rename qnetworks to learning_algo --- .../AC_net_keras.py | 0 .../CRAR_keras.py} | 0 .../NN_CRAR_keras.py} | 0 .../{q_networks => learning_algo}/NN_keras.py | 0 .../NN_keras_LSTM.py | 0 .../{q_networks => learning_algo}/__init__.py | 0 .../q_net_keras.py | 0 deer/q_networks/NN_keras_lp.py | 522 ----------- deer/q_networks/q_net_keras_lp_nstep.py | 809 ------------------ 9 files changed, 1331 deletions(-) rename deer/{q_networks => learning_algo}/AC_net_keras.py (100%) rename deer/{q_networks/q_net_keras_lp.py => learning_algo/CRAR_keras.py} (100%) rename deer/{q_networks/NN_keras_lp_high_int_dim.py => learning_algo/NN_CRAR_keras.py} (100%) rename deer/{q_networks => learning_algo}/NN_keras.py (100%) rename deer/{q_networks => learning_algo}/NN_keras_LSTM.py (100%) rename deer/{q_networks => learning_algo}/__init__.py (100%) rename deer/{q_networks => learning_algo}/q_net_keras.py (100%) delete mode 100644 deer/q_networks/NN_keras_lp.py delete mode 100644 deer/q_networks/q_net_keras_lp_nstep.py diff --git a/deer/q_networks/AC_net_keras.py b/deer/learning_algo/AC_net_keras.py similarity index 100% rename from deer/q_networks/AC_net_keras.py rename to deer/learning_algo/AC_net_keras.py diff --git a/deer/q_networks/q_net_keras_lp.py b/deer/learning_algo/CRAR_keras.py similarity index 100% rename from deer/q_networks/q_net_keras_lp.py rename to deer/learning_algo/CRAR_keras.py diff --git a/deer/q_networks/NN_keras_lp_high_int_dim.py b/deer/learning_algo/NN_CRAR_keras.py similarity index 100% rename from deer/q_networks/NN_keras_lp_high_int_dim.py rename to deer/learning_algo/NN_CRAR_keras.py diff --git a/deer/q_networks/NN_keras.py b/deer/learning_algo/NN_keras.py similarity index 100% rename from deer/q_networks/NN_keras.py rename to deer/learning_algo/NN_keras.py diff --git a/deer/q_networks/NN_keras_LSTM.py b/deer/learning_algo/NN_keras_LSTM.py similarity index 100% rename from deer/q_networks/NN_keras_LSTM.py rename to deer/learning_algo/NN_keras_LSTM.py diff --git a/deer/q_networks/__init__.py b/deer/learning_algo/__init__.py similarity index 100% rename from deer/q_networks/__init__.py rename to deer/learning_algo/__init__.py diff --git a/deer/q_networks/q_net_keras.py b/deer/learning_algo/q_net_keras.py similarity index 100% rename from deer/q_networks/q_net_keras.py rename to deer/learning_algo/q_net_keras.py diff --git a/deer/q_networks/NN_keras_lp.py b/deer/q_networks/NN_keras_lp.py deleted file mode 
100644 index abcc0312..00000000 --- a/deer/q_networks/NN_keras_lp.py +++ /dev/null @@ -1,522 +0,0 @@ -""" -Neural network using Keras (called by q_net_keras) -.. Author: Vincent Francois-Lavet -""" - -import numpy as np -from keras import backend as K -from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge -from keras import regularizers -np.random.seed(102912) - -class NN(): - """ - Deep Q-learning network using Keras - - Parameters - ----------- - batch_size : int - Number of tuples taken into account for each iteration of gradient descent - input_dimensions : - n_actions : - random_state : numpy random number generator - action_as_input : Boolean - Whether the action is given as input or as output - """ - def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False): - self._input_dimensions=input_dimensions - self._batch_size=batch_size - self._random_state=random_state - self._n_actions=n_actions - self._action_as_input=action_as_input - self.internal_dim=3 # size random vector - self._rand_vect_size=5 # size output distribution - - def encoder_model(self): - """ - - Parameters - ----------- - s - - Returns - ------- - model with output x (= encoding of s) - - """ - layers=[] - outs_conv=[] - inputs=[] - - for i, dim in enumerate(self._input_dimensions): - # - observation[i] is a FRAME - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - x=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' - x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - x = Conv2D(8, (3, 3), padding='same', activation='tanh')(x) - x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - - out = Flatten()(x) - - # - observation[i] is a VECTOR - elif len(dim) == 2: - if dim[0] > 3: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) - x = Conv2D(16, (2, 1), activation='relu', border_mode='valid')(reshaped)#Conv on the history - x = Conv2D(16, (2, 2), activation='relu', border_mode='valid')(x) #Conv on the history & features - - out = Flatten()(x) - else: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - out = Flatten()(input) - - # - observation[i] is a SCALAR - - else: - if dim[0] > 3: - # this returns a tensor - input = Input(shape=(dim[0],)) - inputs.append(input) - reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) - x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(reshaped) #Conv on the history - x = Conv2D(8, (1,2), activation='relu', border_mode='valid')(x) #Conv on the history - - out = Flatten()(x) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - out=input - - outs_conv.append(out) - - if (self._action_as_input==True): - if ( isinstance(self._n_actions,int)): - print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") - else: - input = Input(shape=(len(self._n_actions),)) - inputs.append(input) - outs_conv.append(input) - - if len(outs_conv)>1: - x = merge(outs_conv, mode='concat') - else: - x= outs_conv [0] - - # we stack a deep 
fully-connected network on top - x = Dense(200, activation='tanh')(x) - x = Dense(100, activation='tanh')(x) - x = Dense(50, activation='tanh')(x) - x = Dense(10, activation='tanh')(x) - - x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - - model = Model(inputs=inputs, outputs=x) - - return model - - def encoder_diff_model(self,encoder_model): - """ - - Parameters - ----------- - s - - Returns - ------- - model with output x (= encoding of s) - - """ - inputs=[] - - for j in range(2): - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - - half = len(inputs)/2 - x1 = encoder_model(inputs[:half]) - x2 = encoder_model(inputs[half:]) - - x = Subtract()([x1,x2]) - model = Model(inputs=inputs, outputs=x) - - return model - - def transition_model(self): - """ - - Parameters - ----------- - x - a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x - - x = Concatenate()(inputs)#,axis=-1) - x = Dense(10, activation='tanh')(x) #5,15 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(10, activation='tanh')(x) # ,30 - #x = Dense(5, activation='tanh')(x) #5,15 - x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - x = Add()([inputs[0],x]) - - model = Model(inputs=inputs, outputs=x) - - return model - - def transition_model2(self): - """ - - Parameters - ----------- - x - a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x - - x = Concatenate()(inputs)#,axis=-1) - x = Dense(10, activation='tanh')(x) - x = BatchNormalization()(x) - x = Dense(50, activation='tanh')(x) - x = BatchNormalization()(x) - x = Dense(10, activation='tanh')(x) - x = BatchNormalization()(x) - x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - x = Add()([inputs[0],x]) - - model = Model(inputs=inputs, outputs=x) - - return model - - def diff_Tx_x_(self,encoder_model,transition_model): - """ - - Parameters - ----------- - s - a - s' - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs=[] - for j in range(2): - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - - half = len(inputs)/2 - enc_x = encoder_model(inputs[:half]) #s --> x - enc_x_ = encoder_model(inputs[half:]) #s --> x - - input = Input(shape=(self._n_actions,)) - inputs.append(input) - - Tx= transition_model([enc_x,inputs[-1]]) - - x = Subtract()([Tx,enc_x_]) - - model = Model(inputs=inputs, outputs=x ) - - return model - - def diff_s_s_(self,encoder_model): - """ - - Parameters - ----------- - s - a - random z - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs=[] - - for j in range(2): - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = 
Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - - half = len(inputs)/2 - enc_x = encoder_model(inputs[:half]) #s --> x #FIXME - enc_x_ = encoder_model(inputs[half:]) #s --> x - - x = Subtract()([enc_x,enc_x_]) - x = Dot(axes=-1, normalize=False)([x,x]) - - model = Model(inputs=inputs, outputs=x ) - - return model - - def diff_sa_sa(self,encoder_model,transition_model): - """ - - Parameters - ----------- - s - a - rand_a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs=[] - - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - - input = Input(shape=(self._n_actions,)) - inputs.append(input) - input = Input(shape=(self._n_actions,)) - inputs.append(input) - - enc_x = encoder_model(inputs[:-2]) #s --> x - Tx= transition_model([enc_x,inputs[-2]]) - rand_Tx= transition_model([enc_x,inputs[-1]]) - - x = Subtract()([Tx,rand_Tx]) - x = Dot(axes=-1, normalize=False)([x,x]) - - model = Model(inputs=inputs, outputs=x ) - - return model - - def diff_Tx(self,transition_model): - """ - - Parameters - ----------- - x - a - x - a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ), Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) )] #x,a,x,a - - #identity_mat=inputs[2]#K.constant(np.diag(np.ones(self._n_actions)), name="identity_mat") - Tx = transition_model(inputs[:2]) - Tx2 = transition_model(inputs[2:]) - - #tile_x=K.tile(inputs[0],(self._n_actions,1)) - #Tx_ = transition_model([tile_x]+[identity_mat]) - - x = Subtract()([Tx,Tx2]) - x = Dot(axes=-1, normalize=False)([x,x]) - - model = Model(inputs=inputs, outputs=x ) - - return model - - def R_model(self): - """ - Build a network consistent with each type of inputs - - Parameters - ----------- - x - a - - Returns - ------- - r - """ - - inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x - - x = Concatenate()(inputs)#,axis=-1) - x = Dense(10, activation='tanh')(x) - x = Dense(20, activation='tanh')(x) - x = Dense(10, activation='tanh')(x) - - out = Dense(1)(x) - - model = Model(inputs=inputs, outputs=out) - - return model - - def full_R_model(self,encoder_model,R_model): - """ - Maps internal state to immediate rewards - - Parameters - ----------- - s - a - (noise in abstract state space) : FIXME - - Returns - ------- - r - """ - - inputs=[] - - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - - input = Input(shape=(self._n_actions,)) - inputs.append(input) - - enc_x = encoder_model(inputs[:-1]) #s --> x - - out = R_model([enc_x]+inputs[-1:]) - - model = Model(inputs=inputs, outputs=out) - - return model - - def Q_model(self): - - inputs = [ Input( shape=(self.internal_dim,) ) ] #x - - #if (self._action_as_input==True): - # if ( isinstance(self._n_actions,int)): - # print("Error, env.nActions() must be a continuous 
set when using actions as inputs in the NN") - # else: - # input = Input(shape=(len(self._n_actions),)) - # inputs.append(input) - - #x = Add()([x,inputs[-1]]) #???? - - # we stack a deep fully-connected network on top - x = Dense(20, activation='tanh')(inputs[0]) - x = Dense(50, activation='tanh')(x) - x = Dense(20, activation='tanh')(x) - - #if (self._action_as_input==False): - # if ( isinstance(self._n_actions,int)): - out = Dense(self._n_actions)(x) - # else: - # out = Dense(len(self._n_actions))(x) - #else: - # out = Dense(1)(x) - - model = Model(inputs=inputs, outputs=out) - - return model - - - def full_Q_model(self, encoder_model, Q_model): - """ - Build a network consistent with each type of inputs - - Parameters - ----------- - s - noise in abstract state space - - Returns - ------- - model with output Tx (= model estimate of x') - """ - inputs=[] - - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[0],dim[1])) - inputs.append(input) - - else: - input = Input(shape=(dim[0],)) - inputs.append(input) - - out = encoder_model(inputs) - input = Input(shape=(self.internal_dim,)) - inputs.append(input) - - x=Add()([out,inputs[-1]]) # adding noise in the abstract state space - - out = Q_model(out) - - model = Model(inputs=inputs, outputs=out) - - return model - -if __name__ == '__main__': - pass - \ No newline at end of file diff --git a/deer/q_networks/q_net_keras_lp_nstep.py b/deer/q_networks/q_net_keras_lp_nstep.py deleted file mode 100644 index 04d55114..00000000 --- a/deer/q_networks/q_net_keras_lp_nstep.py +++ /dev/null @@ -1,809 +0,0 @@ -""" -Code for general deep Q-learning using Keras that can take as inputs scalars, vectors and matrices - -.. Author: Vincent Francois-Lavet -""" - -import numpy as np -np.set_printoptions(threshold=np.nan) -from keras.optimizers import SGD,RMSprop -from keras import backend as K -from ..base_classes import QNetwork -from .NN_keras_lp_high_int_dim import NN # Default Neural network used -import tensorflow as tf -config = tf.ConfigProto() -config.gpu_options.allow_growth=True -sess = tf.Session(config=config) - -def mean_squared_error_p(y_true, y_pred): - return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error - #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error - #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) ) , axis=-1 ) # = mse error - -def exp_dec_error(y_true, y_pred): - return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred - -def cosine_proximity2(y_true, y_pred): - y_true = K.l2_normalize(y_true[:,0:2], axis=-1) - y_pred = K.l2_normalize(y_pred[:,0:2], axis=-1) - return -K.sum(y_true * y_pred, axis=-1) - -#def rms_from_squared_components(y_true, y_pred): -# return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 -# -#def squared_error_from_squared_components(y_true, y_pred): -# return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 - -def loss_diff_s_s_(y_true, y_pred): - return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. 
) ) ) # tend to increase y_pred --> loss -1 - -class MyQNetwork(QNetwork): - """ - Deep Q-learning network using Keras (with any backend) - - Parameters - ----------- - environment : object from class Environment - rho : float - Parameter for rmsprop. Default : 0.9 - rms_epsilon : float - Parameter for rmsprop. Default : 0.0001 - momentum : float - Default : 0 - clip_delta : float - Not implemented. - freeze_interval : int - Period during which the target network is freezed and after which the target network is updated. Default : 1000 - batch_size : int - Number of tuples taken into account for each iteration of gradient descent. Default : 32 - update_rule: str - {sgd,rmsprop}. Default : rmsprop - random_state : numpy random number generator - double_Q : bool, optional - Activate or not the double_Q learning. - More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. - neural_network : object, optional - default is deer.qnetworks.NN_keras - """ - - def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): - """ Initialize environment - - """ - QNetwork.__init__(self,environment, batch_size) - - - self._rho = rho - self._rms_epsilon = rms_epsilon - self._momentum = momentum - self._update_rule = update_rule - self._freeze_interval = freeze_interval - self._double_Q = double_Q - self._random_state = random_state - self.update_counter = 0 - self._high_int_dim = kwargs.get('high_int_dim',False) - self._internal_dim = kwargs.get('internal_dim',2) - self.loss_interpret=0 - self.loss_T2=0 - self.loss_disentangle_t=0 - self.loss_disentangle_a=0 - self.loss_Q=0 - self.loss_disambiguate1=0 - self.loss_disambiguate2=0 - self.nstep=1 # ! If n>1, training Q, also modifies T, R and gamma which leads to higher loss for them. 
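        # One accumulator per step of the n-step horizon: train() adds the batch loss of the
        # corresponding head (transition, discount, reward) to these arrays, e.g. loss_T[n]
        # for the n-th unrolled transition model.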
- self.loss_T=np.zeros((self.nstep)) - self.loss_gamma=np.zeros((self.nstep)) - self.lossR=np.zeros((self.nstep)) - - - self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) - - self.encoder = self.learn_and_plan.encoder_model() - self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) - - self.Q = self.learn_and_plan.Q_model() - self.R = self.learn_and_plan.R_model() - self.gamma = self.learn_and_plan.R_model() - self.transition = self.learn_and_plan.transition_model() -# self.transition2 = self.learn_and_plan.transition_model2() - - self.full_Qs=[] - for i in range(self.nstep): - self.full_Qs.append(self.learn_and_plan.full_Q_model(self.encoder,self.Q,i,self.transition,self.R,self.gamma)) - - # used to fit rewards - self.full_Rs=[] - for i in range(self.nstep): - self.full_Rs.append(self.learn_and_plan.full_R_model(self.encoder,self.R,i,self.transition)) - - # used to fit gammas - self.full_gammas=[] - for i in range(self.nstep): - self.full_gammas.append(self.learn_and_plan.full_R_model(self.encoder,self.gamma,i,self.transition)) - - # used to fit transitions - self.diff_Tx_x_s=[] - for i in range(self.nstep): - self.diff_Tx_x_s.append(self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition,i))#full_transition_model(self.encoder,self.transition) - - # used to force features variations - if(self._high_int_dim==False): - self.force_features=self.learn_and_plan.force_features(self.encoder,self.transition) - - # constraint on consecutive t - self.diff_s_s_ = self.learn_and_plan.encoder_diff_model(self.encoder)#diff_s_s_(self.encoder) -# self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) - - # used to disentangle actions - self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) - - layers=self.encoder.layers+self.Q.layers+self.R.layers+self.gamma.layers+self.transition.layers - # Grab all the parameters together. - self.params = [ param - for layer in layers - for param in layer.trainable_weights ] - - self._compile() - - self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) - self.encoder_target = self.learn_and_plan_target.encoder_model() - self.Q_target = self.learn_and_plan_target.Q_model() - self.R_target = self.learn_and_plan_target.R_model() - self.gamma_target = self.learn_and_plan_target.R_model() - self.transition_target = self.learn_and_plan_target.transition_model() - - self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder_target,self.Q_target) # FIXME - self.full_Q_target.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q - - layers=self.encoder_target.layers+self.Q_target.layers+self.R_target.layers+self.gamma_target.layers+self.transition_target.layers - # Grab all the parameters together. - self.params_target = [ param - for layer in layers - for param in layer.trainable_weights ] - - self._resetQHat() - - def getAllParams(self): - params_value=[] - for i,p in enumerate(self.params): - params_value.append(K.get_value(p)) - return params_value - - def setAllParams(self, list_of_values): - for i,p in enumerate(self.params): - K.set_value(p,list_of_values[i]) - - def train(self, observations_val, actions_val, rewards_val, terminals_val): - """ - Train one batch. - - 1. 
Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared - 2. perform batch training - - Parameters - ----------- - observations_val : batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - actions_val : b x 1 numpy array of integers - rewards_val : b x 1 numpy array - terminals_val : b x 1 numpy boolean array - - Returns - ------- - Average loss of the batch training (RMSE) - Individual (square) losses for each tuple - """ - - onehot_actions = [np.zeros((self._batch_size, self._n_actions)) for n in range(self.nstep)] - for n in range(self.nstep): - onehot_actions[n][np.arange(self._batch_size), actions_val[:,n]] = 1 - onehot_actions_rand = [np.zeros((self._batch_size, self._n_actions)) for n in range(self.nstep)] - for n in range(self.nstep): - onehot_actions_rand[n][np.arange(self._batch_size), np.random.randint(0,self._n_actions,(32))] = 1 - - observations_val=list(observations_val) - states_val_1=[] - states_val=[] - next_states_val=[] - for obs in observations_val: - states_val_1.append(obs[:,0:1]) # t-n+1 - states_val.append(obs[:,self.nstep-1:-1]) # t - next_states_val.append(obs[:,self.nstep:]) # t+1 - Es_=self.encoder.predict(next_states_val) - Es=self.encoder.predict(states_val) - - if(self.update_counter%500==0): - if(self.nstep==2): - Es_1=self.encoder.predict(states_val_1) - ETs_1=self.transition.predict([Es_1,onehot_actions[0]]) # t+1 - ETTs_1=self.transition.predict([ETs_1,onehot_actions[1]]) # t+1 - print "ETTs_1[0]" - print ETTs_1[0] - print "onehot_actions" - print onehot_actions[0][0] - print onehot_actions[1][0] - - - ETs=self.transition.predict([Es,onehot_actions[-1]]) # t+1 - R=self.R.predict([Es[0:1],np.array([[1,0,0,0]])]) # t - R1=self.R.predict([Es[0:1],np.array([[0,1,0,0]])]) # t - R2=self.R.predict([Es[0:1],np.array([[0,0,1,0]])]) # t - gamma=self.gamma.predict([Es,onehot_actions[-1]]) # t - - print "states_val[0][0]" - print states_val[0][0] - #print "len(states_val)" - #print len(states_val) - #print states_val[0].shape - print "next_states_val[0][0]" - print next_states_val[0][0] - print actions_val[0], rewards_val[0], terminals_val[0] - print "Es[0],ETs[0],Es_[0]" - if(Es.ndim==4): - print np.transpose(Es, (0, 3, 1, 2))[0],np.transpose(ETs, (0, 3, 1, 2))[0],np.transpose(Es_, (0, 3, 1, 2))[0] # data_format='channels_last' --> 'channels_first' - else: - print Es[0],ETs[0],Es_[0] - print "R[0]" - print R[0] - print R1[0] - print R2[0] - print "gamma[0]" - print gamma[0] - print "self.full_Qs[0].predict(states_val)[0]" - print self.full_Qs[0].predict(states_val)[0] - print "self.full_Rs[0].predict(states_val)[0]" - print self.full_Rs[0].predict(states_val+[np.repeat(np.array([[1,0,0,0]]),32,axis=0)])[0] - print self.full_Rs[0].predict(states_val+[np.repeat(np.array([[0,1,0,0]]),32,axis=0)])[0] - print self.full_Rs[0].predict(states_val+[np.repeat(np.array([[0,0,1,0]]),32,axis=0)])[0] - - # Fit transition - for n in range(self.nstep): - states_val=[] - for obs in observations_val: - states_val.append(obs[:,-n-2:-n-1]) # t-n - #print self.loss_T - #print self.loss_T[0] - #print self.loss_T[n] - self.loss_T[n]=self.loss_T[n]+self.diff_Tx_x_s[n].train_on_batch(states_val+next_states_val+onehot_actions[-1-n:]+[(1-terminals_val[:,-1])], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) - - # Interpretable AI - if(self._high_int_dim==False): - target_modif_features=np.zeros((self._n_actions,self._internal_dim)) - ## Catcher - #target_modif_features[0,0]=1 # dir 
- #target_modif_features[1,0]=-1 # opposite dir - #target_modif_features[0:2,1]=1 # temps - ## Laby - target_modif_features[0,0]=1 - target_modif_features[1,0]=0 - #target_modif_features[2,1]=0 - #target_modif_features[3,1]=0 - target_modif_features=np.repeat(target_modif_features,self._batch_size,axis=0) - states_val_tiled=[] - for obs in observations_val: - states_val_tiled.append(np.tile(obs[:,-2:-1],(self._n_actions,1,1,1))) - onehot_actions_tiled = np.diag(np.ones(self._n_actions))#np.zeros((self._batch_size*self._n_actions, self._n_actions)) - onehot_actions_tiled = np.repeat(onehot_actions_tiled,self._batch_size,axis=0) - - self.loss_interpret+=self.force_features.train_on_batch(states_val_tiled+[onehot_actions_tiled], target_modif_features) - - # Fit rewards - for n in range(self.nstep): - states_val=[] - for obs in observations_val: - states_val.append(obs[:,-n-2:-n-1]) # t-n - self.lossR[n]+=self.full_Rs[n].train_on_batch(states_val+onehot_actions[-1-n:], rewards_val[:,-1]) - - # Fit gammas - for n in range(self.nstep): - states_val=[] - for obs in observations_val: - states_val.append(obs[:,-n-2:-n-1]) # t-n - self.loss_gamma[n]+=self.full_gammas[n].train_on_batch(states_val+onehot_actions[-1-n:], (1-terminals_val[:,-1])*self._df) - - # Loss to ensure limited volume in abstract state space - self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - - # Increase the entropy in the abstract features of two states - # This is done only when states_val is made up of only one observation --> FIXME - rolled=np.roll(states_val[0],1,axis=0) -# for i in range(self._batch_size): -# j=0 -# l=0 -# while((states_val[0][i]==rolled[i+j-l]).all()): -# if(i+j==31): -# l=self._batch_size -# if(j==31): -# break -# j=j+1 -# rolled[i]=rolled[i+j-l] - # Loss to ensure entropy in abstract state space - self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - # - # - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1)))#np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) - # - ## Disentangle actions - self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch(states_val+onehot_actions[-1:]+onehot_actions_rand[-1:], np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.ones(self._batch_size)) - -# -# # Loss to have all s' following s,a with a to a distance 1 of s,a) -# tiled_x=np.tile(Es,(self._n_actions,1)) -# tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) -# tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) -# #self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(self._batch_size*self._n_actions)) - - - - if(self.update_counter%500==0): - print "self.loss_Q" - print self.loss_Q - if(self.nstep>1): - print "self.loss_T[0]/100.,self.loss_T[1]/100.,self.lossR[0]/100.,self.lossR[1]/100.,self.loss_gamma[0]/100.,self.loss_gamma[1]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." 
- print self.loss_T[0]/100.,self.loss_T[1]/100.,self.lossR[0]/100.,self.lossR[1]/100.,self.loss_gamma[0]/100.,self.loss_gamma[1]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. - else: - print "self.loss_T[0]/100.,self.lossR[0]/100.,self.loss_gamma[0]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." - print self.loss_T[0]/100.,self.lossR[0]/100.,self.loss_gamma[0]/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. - if(self._high_int_dim==False): - print "self.loss_interpret/100." - print self.loss_interpret/100. - - print K.get_value(self.encoder.optimizer.lr) - print K.get_value(self.encoder_diff.optimizer.lr) - self.loss_T=np.zeros((self.nstep)) - self.loss_interpret=0 - self.loss_T2=0 - self.lossR=np.zeros((self.nstep)) - self.loss_gamma=np.zeros((self.nstep)) - self.loss_Q=0 - - self.loss_disentangle_t=0 - self.loss_disentangle_a=0 - - self.loss_disambiguate1=0 - self.loss_disambiguate2=0 - - print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) - print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) - - print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - - print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) - - - if self.update_counter % self._freeze_interval == 0: - self._resetQHat() - - next_q_vals = self.full_Q_target.predict([next_states_val[0]]) - #next_q_vals = self.qValues_planning(next_states_val, self.R_target, self.transition_target, self.Q_target, d=self.nstep)#self.full_Q_target.predict([next_states_val[0]]) - - if(self._double_Q==True): - next_q_vals_current_qnet=self.full_Qs[0].predict(next_states_val) - argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) - max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) - else: - max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) - - not_terminals=np.ones_like(terminals_val) - terminals_val - - target = rewards_val[:,-1] + not_terminals[:,-1] * self._df * max_next_q_vals.reshape((-1)) - - - q_vals=[] - for n in range(self.nstep): - states_val=[] - for obs in observations_val: - states_val.append(obs[:,-n-2:-n-1]) # t - q_vals.append(self.full_Qs[n].predict(states_val+onehot_actions[-1-n:-1])) - - # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff - # FIXME for all n - q_val=q_vals[0][np.arange(self._batch_size), actions_val[:,0]] - diff = - q_val + target - loss_ind=pow(diff,2) - - for n in range(self.nstep): - q_vals[n][ np.arange(self._batch_size), actions_val[:,-1] ] = target - - # Is it possible to use something more flexible than this? - # Only some elements of next_q_vals are actual value that I target. - # My loss should only take these into account. 
- # Workaround here is that many values are already "exact" in this update - - #print "q_vals" - #print q_vals[0][0],q_vals[1][0] - loss=0 - for n in range(self.nstep): - states_val=[] - for obs in observations_val: - states_val.append(obs[:,-n-2:-n-1]) # t-n - loss+=self.full_Qs[n].train_on_batch(states_val+onehot_actions[-1-n:-1] , q_vals[n] ) - self.loss_Q+=loss - - if(self.update_counter%100==0): - print self.update_counter - - self.update_counter += 1 - - # loss*self._n_actions = np.average(loss_ind) - return np.sqrt(loss),loss_ind - - -# def train_model(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): -# """ -# Train the model based part -# -# 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared -# 2. perform batch training -# -# Parameters -# ----------- -# states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) -# actions_val : b x 1 numpy array of integers -# rewards_val : b x 1 numpy array -# next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) -# terminals_val : b x 1 numpy boolean array -# -# Returns -# ------- -# Average loss of the batch training (RMSE) -# Individual (square) losses for each tuple -# """ -# -# onehot_actions = np.zeros((self._batch_size, self._n_actions)) -# onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 -# Es_=self.encoder.predict([next_states_val[0]]) -# Es=self.encoder.predict([states_val[0]]) -# ETs=self.transition.predict([Es,onehot_actions]) -# -## if(self.update_counter>3000): -# self.loss_T2=self.transition2.train_on_batch([Es,onehot_actions], Es_) -## if(self.update_counter%100==0): -## loss=0. -## for i in range (100): -## loss+=self.transition2.train_on_batch([Es,onehot_actions], Es_) -## if(i%10==0): -## print "loss/(i+1)" -## print loss/(i+1) -## print "loss/100." -## print loss/100. 
-# #print K.get_value(self.transition2.optimizer.lr) -# #print [ K.get_value(param) -# # for layer in self.encoder.layers -# # for param in layer.trainable_weights ][0][0] -# return self.loss_T2 - - - - def qValues(self, state_val): - """ Get the q values for one belief state (without planning) - - Arguments - --------- - state_val : one belief state - - Returns - ------- - The q values for the provided belief state - """ - #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] - return self.full_Qs[0].predict([np.expand_dims(state,axis=0) for state in state_val])[0] - - def qValues_planning(self, state_val, R, gamma, T, Q, d=5): - """ Get the q values for one belief state with a planning depth d - - Arguments - --------- - state_val : one belief state - d : planning depth - - Returns - ------- - The q values with planning depth d for the provided belief state - """ - #print "state_val[0]" - #print state_val[0] - #print len(state_val) -# print "state_val[0][0]" -# print state_val[0][0] -# print state_val[0].shape - print "self.full_Qs[0].predict(state_val)[0]" - print self.full_Qs[0].predict(state_val)[0] - encoded_x = self.encoder.predict(state_val) - ## DEBUG PURPOSES -# print "encoded_x[0]" -# print encoded_x[0] - - identity_matrix = np.diag(np.ones(self._n_actions)) - if(encoded_x.ndim==2): - tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) - elif(encoded_x.ndim==4): - tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) - else: - print ("error") - - repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) - ##print tile3_encoded_x - ##print repeat_identity - r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) - #print "r_vals_d0" - #print r_vals_d0 - r_vals_d0=r_vals_d0.flatten() - print "r_vals_d0" - print r_vals_d0 - next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) - #print "next_x_predicted" - #print next_x_predicted - one_hot_first_action=np.zeros((1,self._n_actions)) - one_hot_first_action[0]=1 - next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) - next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) - next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) - #print "next_x_predicted action 0 t4" - #print next_x_predicted - ## END DEBUG PURPOSES - - QD_plan=0 - for i in range(d+1): #TO DO: improve planning algorithm - #print encoded_x - Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=[self._n_actions,2,2,2,2,2,2,2]).reshape(len(encoded_x),-1) - print "Qd,i" - print Qd,i - QD_plan+=Qd - QD_plan=QD_plan/(d+1) - - print "QD_plan" - print QD_plan - - return QD_plan - -# def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): -# """ -# """ -# branching_factor=self._n_actions #TO IMPROVE, use MCTS, etc... 
-# n=len(state_abstr_val) -# identity_matrix = np.diag(np.ones(self._n_actions)) -# -# this_branching_factor=branching_factor -# -# if (d==0): -# return Q.predict([state_abstr_val]) # no change in the order of the actions -# else: -# # All actions are considered in the tree -# repeat_identity=np.repeat(identity_matrix,len(state_abstr_val),axis=0) # no change in the order of the actions -# if(state_abstr_val.ndim==2): -# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1)) -# elif(state_abstr_val.ndim==4): -# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1,1,1)) -# else: -# print ("error") -# -# #print tile3_encoded_x -# #print repeat_identity -# r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) -# #print "r_vals_d0" -# #print r_vals_d0 -# r_vals_d0=r_vals_d0.flatten() -# -# gamma_vals_d0=np.array(gamma.predict([tile3_encoded_x,repeat_identity])) -# #print "r_vals_d0" -# #print r_vals_d0 -# gamma_vals_vals_d0=gamma_vals_d0.flatten() -# -# next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) -# return r_vals_d0+gamma_vals_vals_d0*np.amax(self.qValues_planning_abstr(next_x_predicted,R,gamma,T,Q,d=d-1,branching_factor=branching_factor).reshape(len(state_abstr_val)*this_branching_factor,branching_factor),axis=1).flatten() - - - def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): - """ - """ - #if(branching_factor==None or branching_factor>self._n_actions): - # branching_factor=self._n_actions - - #print "qValues_planning_abstr d" - #print d - n=len(state_abstr_val) - identity_matrix = np.diag(np.ones(self._n_actions)) - - this_branching_factor=branching_factor.pop(0) - if (n==1): - # We require that the first branching factor is self._n_actions so that QD_plan has the right dimension - this_branching_factor=self._n_actions - #else: - # this_branching_factor=branching_factor - - if (d==0): - if(this_branching_factor0): - # We use the mode to define the planning depth - q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=mode*2)#self.qValues(state)# - else: - q_vals = self.qValues_planning([np.expand_dims(s,axis=0) for s in state],self.R,self.gamma, self.transition, self.Q, d=0) - return np.argmax(q_vals),np.max(q_vals) - - def _compile(self): - """ compile self.q_vals - """ - if (self._update_rule=="sgd"): - optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False) - elif (self._update_rule=="rmsprop"): - optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon) - else: - raise Exception('The update_rule '+self._update_rule+' is not implemented.') - - optimizer1=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # Different optimizers for each network; otherwise not possible to modify each - optimizer2=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately (e.g. 
lr) - optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer7=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer8=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - - for i in range(self.nstep): - #for l in self.R.layers+self.gamma.layers+self.transition.layers: - # l.trainable=False - self.full_Qs[i].compile(optimizer=optimizer, loss='mse') - #for l in self.R.layers+self.gamma.layers: - # l.trainable=True - self.full_Rs[i].compile(optimizer=optimizer3, loss='mse') # Fit rewards - self.full_gammas[i].compile(optimizer=optimizer3, loss='mse') # Fit gammas - #for l in self.transition.layers: - # l.trainable=True - self.diff_Tx_x_s[i].compile(optimizer=optimizer1, loss='mse') # Fit transitions - - if(self._high_int_dim==False): - self.force_features.compile(optimizer=optimizer8, - loss=cosine_proximity2) -# self.transition2.compile(optimizer=optimizer2, loss='mse') # Fit accurate transitions without encoders - - self.encoder.compile(optimizer=optimizer4, - loss=mean_squared_error_p) - self.encoder_diff.compile(optimizer=optimizer5, - loss=exp_dec_error) - #metrics=['accuracy']) - - self.diff_s_s_.compile(optimizer=optimizer6, - loss=exp_dec_error)#'mse')#loss_diff_s_s_) - #metrics=['accuracy']) - - self.diff_sa_sa.compile(optimizer=optimizer7, - loss=exp_dec_error)#loss_diff_s_s_) - -# self.diff_Tx.compile(optimizer=optimizer, -# loss=mean_squared_error) -# #metrics=['accuracy']) - - def _resetQHat(self): - for i,(param,param_target) in enumerate(zip(self.params, self.params_target)): - K.set_value(param_target,K.get_value(param)) - - def setLearningRate(self, lr): - """ Setting the learning rate - - Parameters - ----------- - lr : float - The learning rate that has to be set - """ - self._lr = lr - print "modif lr" - # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) - for i in range(self.nstep): - K.set_value(self.full_Qs[i].optimizer.lr, self._lr) - K.set_value(self.full_Rs[i].optimizer.lr, self._lr) - K.set_value(self.full_gammas[i].optimizer.lr, self._lr) - K.set_value(self.diff_Tx_x_s[i].optimizer.lr, self._lr) - -# K.set_value(self.transition2.optimizer.lr, self._lr/2.) - - if(self._high_int_dim==False): - K.set_value(self.force_features.optimizer.lr, self._lr*0.75) - - K.set_value(self.encoder.optimizer.lr, self._lr) - K.set_value(self.encoder_diff.optimizer.lr, self._lr) - - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby - K.set_value(self.diff_sa_sa.optimizer.lr, 0) # 0 ! -# K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
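The learning rates above are changed in place with K.set_value rather than by recompiling each model (the comment notes that recompiling appeared to leak memory). A minimal, self-contained sketch of that pattern, assuming Keras 2; the toy model below is arbitrary and not part of this codebase:

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras import backend as K

# Toy model compiled once with an initial learning rate.
model = Sequential([Dense(8, activation='relu', input_shape=(4,)), Dense(2)])
model.compile(optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-06), loss='mse')

print(K.get_value(model.optimizer.lr))    # 0.001
K.set_value(model.optimizer.lr, 0.0005)   # update without recompiling
print(K.get_value(model.optimizer.lr))    # 0.0005

The same call works on each compiled model held by the class above, which is why every network gets its own optimizer instance: sharing one optimizer would make it impossible to scale the rates differently (for example self._lr/5. for diff_s_s_).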
- - def transfer(self, original, transfer, epochs=1): - # First, make sure that the target network and the current network are the same - self._resetQHat() - # modify the loss of the encoder - #self.encoder=self.learn_and_plan.encoder_model() - #for l in self.encoder.layers[-5:]: - # l.trainable = False # Freeze dense layers # DOES NOT SEEM TO HELP (transfer on catcher) - #print "self.encoder.layers[-1].get_weights()" - #print self.encoder.layers[-1].get_weights() - - optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - self.encoder.compile(optimizer=optimizer4, - loss='mse') - - # Then, train the encoder such that the original and transfer states are mapped into the same abstract representation - x_original=self.encoder.predict(original)#[0] - print "x_original[0:10]" - print x_original[0:10] - for i in range(epochs): - size = original[0].shape[0] - #print size - #print transfer[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] - print "train" - print self.encoder.train_on_batch(transfer[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] ) - #print self.encoder.train_on_batch(original[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] ) - print "validation" - print self.encoder.test_on_batch(transfer[0][int(size*0.8):] , x_original[int(size*0.8):]) - #print self.encoder.test_on_batch(original[0][int(size*0.8):] , x_original[int(size*0.8):] ) - - #print "self.encoder.layers[-1].get_weights()" - #print self.encoder.layers[-1].get_weights() - #for l in self.encoder.layers[-5:]: - # l.trainable = True - # recompile with original loss - self.encoder.compile(optimizer=optimizer4, - loss=mean_squared_error_p) From 58ee03d51d78ffc93f598a28b34401a3a723e79d Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 11:51:57 -0400 Subject: [PATCH 56/96] q_network renamed to more general learning_algo + necessary modifications --- deer/base_classes/Policy.py | 8 +- deer/base_classes/__init__.py | 6 +- .../{QNetwork.py => learning_algo.py} | 7 +- deer/learning_algo/AC_net_keras.py | 2 +- deer/learning_algo/CRAR_keras.py | 11 +- deer/learning_algo/NN_CRAR_keras.py | 4 +- deer/learning_algo/NN_keras.py | 4 +- deer/learning_algo/q_net_keras.py | 4 +- deer/policies/EpsilonGreedyPolicy.py | 4 +- deer/policies/LongerExplorationPolicy.py | 4 +- .../MG_two_storages/run_MG_two_storages.py | 2 +- examples/gym/run_mountain_car.py | 2 +- examples/gym/run_mountain_car_continuous.py | 2 +- examples/gym/run_pendulum.py | 2 +- examples/pendulum/pendulum_env.py | 159 ------------------ examples/pendulum/render_movie.py | 101 ----------- examples/pendulum/run_pendulum.py | 127 -------------- examples/simplest_test_PLI/run_catcher.py | 22 +-- examples/simplest_test_PLI/run_simple_maze.py | 11 +- examples/toy_env/run_toy_env.py | 2 +- examples/toy_env/run_toy_env_simple.py | 2 +- 21 files changed, 49 insertions(+), 437 deletions(-) rename deer/base_classes/{QNetwork.py => learning_algo.py} (91%) delete mode 100644 examples/pendulum/pendulum_env.py delete mode 100644 examples/pendulum/render_movie.py delete mode 100644 examples/pendulum/run_pendulum.py diff --git a/deer/base_classes/Policy.py b/deer/base_classes/Policy.py index 2797f2af..673088a3 100644 --- a/deer/base_classes/Policy.py +++ b/deer/base_classes/Policy.py @@ -6,14 +6,14 @@ class Policy(object): Parameters ----------- - q_network : object from class QNetwork + learning_algo : object from class LearningALgo n_actions : int or list Definition of the action space provided by Environment.nActions() random_state : numpy random number generator """ 
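To show what the renamed argument changes for user code, here is a minimal hypothetical subclass; it is not part of this commit and only relies on the methods visible in this file (bestAction delegates to the learning algorithm's chooseBestAction):

from deer.base_classes import Policy

class GreedyPolicy(Policy):
    """Hypothetical policy that always takes the learning algorithm's best action."""
    def __init__(self, learning_algo, n_actions, random_state):
        Policy.__init__(self, learning_algo, n_actions, random_state)

    def action(self, state, mode=None):
        # bestAction returns the chosen action and its estimated value.
        action, V = self.bestAction(state, mode)
        return action, V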
- def __init__(self, q_network, n_actions,random_state): - self.q_network = q_network + def __init__(self, learning_algo, n_actions,random_state): + self.learning_algo = learning_algo self.n_actions = n_actions self.random_state = random_state @@ -22,7 +22,7 @@ def __init__(self, q_network, n_actions,random_state): def bestAction(self, state, mode=None): """ Returns the best Action for the given state. This is an additional encapsulation for q-network. """ - action,V = self.q_network.chooseBestAction(state, mode) + action,V = self.learning_algo.chooseBestAction(state, mode) return action, V def randomAction(self): diff --git a/deer/base_classes/__init__.py b/deer/base_classes/__init__.py index ff9375ae..34ee3607 100644 --- a/deer/base_classes/__init__.py +++ b/deer/base_classes/__init__.py @@ -1,3 +1,3 @@ -from .Environment import Environment -from .QNetwork import QNetwork -from .Policy import Policy \ No newline at end of file +from .environment import Environment +from .learning_algo import LearningAlgo +from .policy import Policy \ No newline at end of file diff --git a/deer/base_classes/QNetwork.py b/deer/base_classes/learning_algo.py similarity index 91% rename from deer/base_classes/QNetwork.py rename to deer/base_classes/learning_algo.py index bb24145b..f1da26dc 100644 --- a/deer/base_classes/QNetwork.py +++ b/deer/base_classes/learning_algo.py @@ -1,11 +1,12 @@ """ -.. Authors: Vincent Francois-Lavet, David Taralla +This module defines the base class for the learning algorithms. + """ import numpy as np -class QNetwork(object): - """ All the Q-networks and actor-critic networks should inherit this interface. +class LearningAlgo(object): + """ All the Q-networks, actor-critic networks, etc. should inherit this interface. Parameters ----------- diff --git a/deer/learning_algo/AC_net_keras.py b/deer/learning_algo/AC_net_keras.py index bbc2edd0..9f82b92a 100644 --- a/deer/learning_algo/AC_net_keras.py +++ b/deer/learning_algo/AC_net_keras.py @@ -6,7 +6,7 @@ import sys import numpy as np -from ..base_classes import QNetwork as ACNetwork +from ..base_classes import LearningAlgo as ACNetwork from .NN_keras import NN # Default Neural network used from warnings import warn from keras.optimizers import SGD,RMSprop diff --git a/deer/learning_algo/CRAR_keras.py b/deer/learning_algo/CRAR_keras.py index 0628508c..31691b10 100644 --- a/deer/learning_algo/CRAR_keras.py +++ b/deer/learning_algo/CRAR_keras.py @@ -1,15 +1,14 @@ """ -Code for general deep Q-learning using Keras that can take as inputs scalars, vectors and matrices +Code for the CRAR agent using Keras -.. Author: Vincent Francois-Lavet """ import numpy as np np.set_printoptions(threshold=np.nan) from keras.optimizers import SGD,RMSprop from keras import backend as K -from ..base_classes import QNetwork -from .NN_keras_lp_high_int_dim import NN # Default Neural network used +from ..base_classes import LearningAlgo +from .NN_CRAR_keras import NN # Default Neural network used import tensorflow as tf config = tf.ConfigProto() config.gpu_options.allow_growth=True @@ -38,9 +37,9 @@ def cosine_proximity2(y_true, y_pred): def loss_diff_s_s_(y_true, y_pred): return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. 
) ) ) # tend to increase y_pred --> loss -1 -class MyQNetwork(QNetwork): +class CRAR(LearningAlgo): """ - Deep Q-learning network using Keras (with any backend) + Combined Reinforcement learning via Abstract Representations (CRAR) using Keras Parameters ----------- diff --git a/deer/learning_algo/NN_CRAR_keras.py b/deer/learning_algo/NN_CRAR_keras.py index 176a1d33..4bfee42b 100644 --- a/deer/learning_algo/NN_CRAR_keras.py +++ b/deer/learning_algo/NN_CRAR_keras.py @@ -1,6 +1,6 @@ """ -Neural network using Keras (called by q_net_keras) -.. Author: Vincent Francois-Lavet +CRAR Neural network using Keras + """ import numpy as np diff --git a/deer/learning_algo/NN_keras.py b/deer/learning_algo/NN_keras.py index 9256b649..56273aff 100644 --- a/deer/learning_algo/NN_keras.py +++ b/deer/learning_algo/NN_keras.py @@ -5,7 +5,7 @@ import numpy as np from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, merge, Activation, Conv2D, MaxPooling2D, Reshape, Permute +from keras.layers import Input, Layer, Dense, Flatten, concatenate, Activation, Conv2D, MaxPooling2D, Reshape, Permute class NN(): """ @@ -92,7 +92,7 @@ def _buildDQN(self): outs_conv.append(input) if len(outs_conv)>1: - x = merge(outs_conv, mode='concat') + x = concatenate(outs_conv) else: x= outs_conv [0] diff --git a/deer/learning_algo/q_net_keras.py b/deer/learning_algo/q_net_keras.py index 35fe7618..17bb8818 100644 --- a/deer/learning_algo/q_net_keras.py +++ b/deer/learning_algo/q_net_keras.py @@ -7,7 +7,7 @@ import numpy as np from keras.optimizers import SGD,RMSprop from keras import backend as K -from ..base_classes import QNetwork +from ..base_classes import LearningAlgo as QNetwork from .NN_keras import NN # Default Neural network used class MyQNetwork(QNetwork): @@ -147,7 +147,7 @@ def qValues(self, state_val): """ return self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val])[0] - def chooseBestAction(self, state): + def chooseBestAction(self, state, *args, **kwargs): """ Get the best action for a belief state Arguments diff --git a/deer/policies/EpsilonGreedyPolicy.py b/deer/policies/EpsilonGreedyPolicy.py index 547142ae..638ba070 100644 --- a/deer/policies/EpsilonGreedyPolicy.py +++ b/deer/policies/EpsilonGreedyPolicy.py @@ -10,8 +10,8 @@ class EpsilonGreedyPolicy(Policy): epsilon : float Proportion of random steps """ - def __init__(self, q_network, n_actions, random_state, epsilon): - Policy.__init__(self, q_network, n_actions, random_state) + def __init__(self, learning_algo, n_actions, random_state, epsilon): + Policy.__init__(self, learning_algo, n_actions, random_state) self._epsilon = epsilon def action(self, state, mode=None): diff --git a/deer/policies/LongerExplorationPolicy.py b/deer/policies/LongerExplorationPolicy.py index b9c82183..35a1f55b 100644 --- a/deer/policies/LongerExplorationPolicy.py +++ b/deer/policies/LongerExplorationPolicy.py @@ -19,8 +19,8 @@ class LongerExplorationPolicy(Policy): length : int Length of the exploration sequences that will be considered """ - def __init__(self, q_network, n_actions, random_state, epsilon, length=10): - Policy.__init__(self, q_network, n_actions, random_state) + def __init__(self, learning_algo, n_actions, random_state, epsilon, length=10): + Policy.__init__(self, learning_algo, n_actions, random_state) self._epsilon = epsilon self._l = length self._count_down = -1 diff --git a/examples/MG_two_storages/run_MG_two_storages.py b/examples/MG_two_storages/run_MG_two_storages.py index fa4e7c38..6286d3c5 100644 --- 
a/examples/MG_two_storages/run_MG_two_storages.py +++ b/examples/MG_two_storages/run_MG_two_storages.py @@ -16,7 +16,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork +from deer.learning_algo.q_net_keras import MyQNetwork from MG_two_storages_env import MyEnv as MG_two_storages_env import deer.experiment.base_controllers as bc diff --git a/examples/gym/run_mountain_car.py b/examples/gym/run_mountain_car.py index d49958c9..e4e877ea 100644 --- a/examples/gym/run_mountain_car.py +++ b/examples/gym/run_mountain_car.py @@ -11,7 +11,7 @@ import deer.experiment.base_controllers as bc from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork +from deer.learning_algo.q_net_keras import MyQNetwork from mountain_car_env import MyEnv as mountain_car_env class Defaults: diff --git a/examples/gym/run_mountain_car_continuous.py b/examples/gym/run_mountain_car_continuous.py index 0b2a7c0d..128b7570 100644 --- a/examples/gym/run_mountain_car_continuous.py +++ b/examples/gym/run_mountain_car_continuous.py @@ -11,7 +11,7 @@ import deer.experiment.base_controllers as bc from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.AC_net_keras import MyACNetwork +from deer.learning_algo.AC_net_keras import MyACNetwork from mountain_car_continuous_env import MyEnv as mountain_car_continuous_env from deer.policies import LongerExplorationPolicy diff --git a/examples/gym/run_pendulum.py b/examples/gym/run_pendulum.py index 7fd1617a..36372d1a 100644 --- a/examples/gym/run_pendulum.py +++ b/examples/gym/run_pendulum.py @@ -11,7 +11,7 @@ import deer.experiment.base_controllers as bc from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork +from deer.learning_algo.q_net_keras import MyQNetwork from pendulum_env import MyEnv as pendulum_env class Defaults: diff --git a/examples/pendulum/pendulum_env.py b/examples/pendulum/pendulum_env.py deleted file mode 100644 index eef3f302..00000000 --- a/examples/pendulum/pendulum_env.py +++ /dev/null @@ -1,159 +0,0 @@ -""" The environment simulates the behavior of an inverted pendulum. -The goal of the agent, as suggested by the reward function, is -to balance a pole on a cart that can either move left or right. - -Code is based on the following inverted pendulum implementations -in C : http://webdocs.cs.ualberta.ca/%7Esutton/book/code/pole.c -in Python : https://github.com/toddsifleet/inverted_pendulum - -Please refer to the wiki for a complete decription of the problem. - -Author: Aaron Zixiao Qiu -""" - -import numpy as np -import copy - -import theano - -from render_movie import save_mp4 -from deer.base_classes import Environment - -# Physics constants -G = 9.8 -M_CART = 1.0 -M_POLE = 0.1 -L = 0.5 -F = 100 -DELTA_T = 0.02 -PI = np.pi -MU_C = 0.0005 -MU_P = 0.000002 - -class MyEnv(Environment): - def __init__(self, rng): - """ Initialize environment. - - Arguments: - rng - the numpy random number generator - """ - # Defining the type of environment - self._rng = rng - # Observations = (x, x_dot, theta, theta_dot, timestamp) - self._last_observation = [0, 0, 0, 0] - self._input_dim = [(1,), (1,), (1,), (1,)] - self._video = 0 - - def act(self, action): - """ This is the most important function in the environment. - We simulate one time step in the environment. 
Given an input - action, compute the next state of the system (position, speed, - angle, angular speed) and return a reward. - - Argument: - action - 0: move left (F = -10N); 1: move right (F = +10N) - Return: - reward - reward for this transition - """ - # Direction of the force - force = F - if (action == 0): - force = -F - - # Divide DELTA_T into smaller tau's, to better take into account - # the transitions - n_tau = 10 - tau = DELTA_T / n_tau - for i in range(n_tau): - # Physics -> See wiki for the formulas - x, x_dot, theta, theta_dot, = self._last_observation#_ = self._last_observation - cos_theta = np.cos(theta) - sin_theta = np.sin(theta) - - f_cart = MU_C * np.sign(x_dot) - f_pole = MU_P * theta_dot / (M_POLE*L) - - tmp = (force + M_POLE*L*sin_theta*theta_dot**2 - f_cart) \ - / (M_POLE + M_CART) - theta_dd = (G*sin_theta - cos_theta*tmp - f_pole) \ - / (L*(4/3. - M_POLE*cos_theta**2/(M_POLE + M_CART))) - x_dd = tmp - M_POLE*theta_dd*cos_theta/(M_POLE + M_CART) - - # Update observation vector - self._last_observation = [ - x + tau*x_dot, - x_dot + tau*x_dd, - self._to_range(theta + tau*theta_dot), - theta_dot + tau*theta_dd, - ] - - # Simple reward - reward = - abs(theta) - reward -= abs(self._last_observation[0])/2. - - # The cart cannot move beyond -5 or 5 - if(self._last_observation[0]<-5): - self._last_observation[0]=-5 - if(self._last_observation[0]>5): - self._last_observation[0]=5 - - - return reward - - def reset(self, mode=0): - """ Reset environment for a new episode. - - Arguments: - mode - Not used in this example. - """ - # Reset initial observation to a random x and theta - x = self._rng.uniform(-1, 1) - theta = self._rng.uniform(-PI, PI) - self._last_observation = [x, 0, theta, 0] - - return self._last_observation - - def summarizePerformance(self, test_data_set, *args, **kwargs): - """ This function is called at every PERIOD_BTW_SUMMARY_PERFS. - - Arguments: - test_data_set - Simulation data returned by the agent. 
- """ - print ("Summary Perf") - - # Save the data in the correct input format for video generation - observations = test_data_set.observations() - data = np.zeros((len(observations[0]), len(observations))) - for i in range(1, 4): - data[:,i] = observations[i - 1] - data[:,0]=np.arange(len(observations[0]))*0.02 - save_mp4(data, self._video) - self._video += 1 - return - - def _to_range(self, angle): - # Convert theta in the range [-PI, PI] - n = abs(angle) // (2*PI) - if (angle < 0): - angle += n*2*PI - else: - angle -= n*2*PI - - if (angle < -PI): - angle = 2*PI - abs(angle) - elif (angle > PI): - angle = -(2*PI - angle) - - return angle - - def inputDimensions(self): - return self._input_dim - - def nActions(self): - # The environment allows two different actions to be taken - # at each time step - return 2 - - def observe(self): - return copy.deepcopy(self._last_observation) - diff --git a/examples/pendulum/render_movie.py b/examples/pendulum/render_movie.py deleted file mode 100644 index 6784658f..00000000 --- a/examples/pendulum/render_movie.py +++ /dev/null @@ -1,101 +0,0 @@ -""" The script is slightly adapted from: - -https://github.com/toddsifleet/inverted_pendulum/blob/master/render_movie.py - -Author: Aaron Zixiao Qiu -""" - -import os -import numpy as np -from math import sin, cos, pi - -import matplotlib.pyplot as plt - -PI = np.pi - -def save_mp4(data, n): - if (not os.path.exists('./img')): - os.makedirs('./img') - - if (not os.path.exists('./video')): - os.makedirs('./video') - - # Create temporal layout at the bottom - fig = plt.figure(0) - fig.suptitle("Pendulum on Cart") - - cart_time_line = plt.subplot2grid( - (12, 12), - (9, 0), - colspan=12, - rowspan=3 - ) - - # Draw displacement curve - t_max = max(data[:,0]) - cart_time_line.axis([ - 0, - t_max, - min(data[:,1])*1.1, - max(data[:,1])*1.1+.1, - ]) - cart_time_line.set_xlabel('time (s)') - cart_time_line.set_ylabel('x (m)') - cart_time_line.plot(data[:,0], data[:,1],'r-') - - # Draw theta curve - pendulum_time_line = cart_time_line.twinx() - pendulum_time_line.axis([ - 0, - t_max, - min(data[:,3])*1.1-.1, - max(data[:,3])*1.1 - ]) - pendulum_time_line.set_ylabel('theta (rad)') - pendulum_time_line.plot(data[:,0], data[:,3],'g-') - - # Cart layout - cart_plot = plt.subplot2grid( - (12,12), - (0,0), - rowspan=8, - colspan=12 - ) - cart_plot.axes.get_yaxis().set_visible(False) - - # Draw cart and pole - t = 0 - fps = 25. 
- frame_number = 1 - x_min = min([min(data[:,1]), -1.1]) - x_max = max([max(data[:,1]), 1.1]) - - time_bar, = cart_time_line.plot([0,0], [10000, -10000], lw=3) - for point in data: - if point[0] >= t + 1./fps or not t: - _draw_point(point, time_bar, t, x_min, x_max, cart_plot) - t = point[0] - fig.savefig('img/_tmp%03d.png' % frame_number) - frame_number += 1 - - print(os.system("ffmpeg -framerate 25 -i img/_tmp%03d.png " \ - + "-c:v libx264 -pix_fmt yuv420p video/out" + str(n) + ".mp4")) - - return - -def _draw_point(point, time_bar, t, x_min, x_max, cart_plot): - # Draw cart - time_bar.set_xdata([t, t]) - cart_plot.cla() - cart_plot.axis([x_min,x_max,-.5,.5]) - l_cart = 0.05 * (x_max + abs(x_min)) - cart_plot.plot([point[1]-l_cart,point[1]+l_cart], [0,0], 'r-', lw=5) - - # Draw pole - theta = point[3] - x = sin(theta) - y = cos(theta) - l_pole = 0.2 * (x_max + abs(x_min)) - cart_plot.plot([point[1],point[1]+l_pole*x],[0,.4*y],'g-', lw=4) - - return diff --git a/examples/pendulum/run_pendulum.py b/examples/pendulum/run_pendulum.py deleted file mode 100644 index dd4ceaa6..00000000 --- a/examples/pendulum/run_pendulum.py +++ /dev/null @@ -1,127 +0,0 @@ -""" Pendulum environment launcher. -Same principles as run_toy_env. See the docs for more details. - -Authors: Vincent Francois-Lavet, David Taralla -""" - -import sys -import logging -import numpy as np - -import deer.experiment.base_controllers as bc -from deer.default_parser import process_args -from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork -from pendulum_env import MyEnv as pendulum_env - -class Defaults: - # ---------------------- - # Experiment Parameters - # ---------------------- - STEPS_PER_EPOCH = 1000 - EPOCHS = 200 - STEPS_PER_TEST = 1000 - PERIOD_BTW_SUMMARY_PERFS = 10 - - # ---------------------- - # Environment Parameters - # ---------------------- - FRAME_SKIP = 1 - - # ---------------------- - # DQN Agent parameters: - # ---------------------- - UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.005 - LEARNING_RATE_DECAY = 0.99 - DISCOUNT = 0.9 - DISCOUNT_INC = .99 - DISCOUNT_MAX = 0.95 - RMS_DECAY = 0.9 - RMS_EPSILON = 0.0001 - MOMENTUM = 0 - CLIP_DELTA = 1.0 - EPSILON_START = 1.0 - EPSILON_MIN = .2 - EPSILON_DECAY = 10000 - UPDATE_FREQUENCY = 1 - REPLAY_MEMORY_SIZE = 1000000 - BATCH_SIZE = 32 - FREEZE_INTERVAL = 100 - DETERMINISTIC = True - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - # --- Parse parameters --- - parameters = process_args(sys.argv[1:], Defaults) - if parameters.deterministic: - rng = np.random.RandomState(12345) - else: - rng = np.random.RandomState() - - # --- Instantiate environment --- - env = pendulum_env(rng) - - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( - env, - parameters.rms_decay, - parameters.rms_epsilon, - parameters.momentum, - parameters.clip_delta, - parameters.freeze_interval, - parameters.batch_size, - parameters.update_rule, - rng) - - # --- Instantiate agent --- - agent = NeuralAgent( - env, - qnetwork, - parameters.replay_memory_size, - max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), - parameters.batch_size, - rng) - - # --- Bind controllers to the agent --- - # For comments, please refer to run_toy_env.py - agent.attach(bc.VerboseController( - evaluate_on='epoch', - periodicity=1)) - - agent.attach(bc.TrainerController( - evaluate_on='action', - periodicity=parameters.update_frequency, - show_episode_avg_V_value=True, - show_avg_Bellman_residual=True)) - - 
agent.attach(bc.LearningRateController( - initial_learning_rate=parameters.learning_rate, - learning_rate_decay=parameters.learning_rate_decay, - periodicity=1)) - - agent.attach(bc.DiscountFactorController( - initial_discount_factor=parameters.discount, - discount_factor_growth=parameters.discount_inc, - discount_factor_max=parameters.discount_max, - periodicity=1)) - - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - agent.attach(bc.InterleavedTestEpochController( - id=0, - epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], - periodicity=2, - show_score=True, - summarize_every=parameters.period_btw_summary_perfs)) - - # --- Run the experiment --- - agent.run(parameters.epochs, parameters.steps_per_epoch) diff --git a/examples/simplest_test_PLI/run_catcher.py b/examples/simplest_test_PLI/run_catcher.py index bc75386f..cc921e41 100644 --- a/examples/simplest_test_PLI/run_catcher.py +++ b/examples/simplest_test_PLI/run_catcher.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork +from deer.learning_algo.CRAR_keras import CRAR from catcher_env import MyEnv as catcher_env import deer.experiment.base_controllers as bc @@ -70,8 +70,8 @@ class Defaults: # --- Instantiate environment --- env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=False) - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( + # --- Instantiate learning algorithm --- + learning_algo = CRAR( env, parameters.rms_decay, parameters.rms_epsilon, @@ -85,12 +85,12 @@ class Defaults: high_int_dim=HIGH_INT_DIM, internal_dim=3) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1)#1.) + test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)#1.) 
# --- Instantiate agent --- agent = NeuralAgent( env, - qnetwork, + learning_algo, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, @@ -178,12 +178,12 @@ class Defaults: ### # TRANSFER ### - optimized_params=qnetwork.getAllParams() + optimized_params=learning_algo.getAllParams() print "optimized_params" print optimized_params - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( + # --- Instantiate learning_algo --- + learning_algo = CRAR( env, parameters.rms_decay, parameters.rms_epsilon, @@ -196,7 +196,7 @@ class Defaults: double_Q=True, high_int_dim=HIGH_INT_DIM, internal_dim=3) - qnetwork.setAllParams(optimized_params) + learning_algo.setAllParams(optimized_params) samples_transfer=500 rand_ind=np.random.random_integers(0,20000,samples_transfer) @@ -207,7 +207,7 @@ class Defaults: print original[0][0:10], transfer[0][0:10] # Transfer between the two repr - qnetwork.transfer(original, transfer, 5000) + learning_algo.transfer(original, transfer, 5000) # --- Instantiate environment with reverse=True --- @@ -216,7 +216,7 @@ class Defaults: # --- Re instantiate agent --- agent = NeuralAgent( env, - qnetwork, + learning_algo, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, diff --git a/examples/simplest_test_PLI/run_simple_maze.py b/examples/simplest_test_PLI/run_simple_maze.py index ab72a1f7..bbc9fc22 100644 --- a/examples/simplest_test_PLI/run_simple_maze.py +++ b/examples/simplest_test_PLI/run_simple_maze.py @@ -1,6 +1,5 @@ """Simple maze launcher -Authors: Vincent Francois-Lavet """ import sys @@ -11,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork +from deer.learning_algo.CRAR_keras import CRAR from simple_maze_env import MyEnv as simple_maze_env import deer.experiment.base_controllers as bc @@ -70,8 +69,8 @@ class Defaults: # --- Instantiate environment --- env = simple_maze_env(rng, higher_dim_obs=False) - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( + # --- Instantiate learning_algo --- + learning_algo = CRAR( env, parameters.rms_decay, parameters.rms_epsilon, @@ -84,12 +83,12 @@ class Defaults: high_int_dim=False, internal_dim=2) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) + test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) 
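The transfer block of run_catcher.py above snapshots the trained weights, rebuilds the learning algorithm, and restores the snapshot before mapping the reversed observations onto the original abstract states. Condensed into one place, assuming env, parameters, rng, original and transfer are prepared as in that script (HIGH_INT_DIM written out as False here):

from deer.learning_algo.CRAR_keras import CRAR

params = learning_algo.getAllParams()      # snapshot the trained weights

learning_algo = CRAR(
    env,
    parameters.rms_decay, parameters.rms_epsilon, parameters.momentum,
    parameters.clip_delta, parameters.freeze_interval, parameters.batch_size,
    parameters.update_rule, rng,
    double_Q=True, high_int_dim=False, internal_dim=3)
learning_algo.setAllParams(params)         # restore them into the fresh instance

# Train the encoder so that transfer observations map to the original abstract states.
learning_algo.transfer(original, transfer, epochs=5000)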
# --- Instantiate agent --- agent = NeuralAgent( env, - qnetwork, + learning_algo, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, diff --git a/examples/toy_env/run_toy_env.py b/examples/toy_env/run_toy_env.py index 4545766d..acf8b9bf 100644 --- a/examples/toy_env/run_toy_env.py +++ b/examples/toy_env/run_toy_env.py @@ -11,7 +11,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork +from deer.learning_algo.q_net_keras import MyQNetwork from Toy_env import MyEnv as Toy_env import deer.experiment.base_controllers as bc from deer.policies import EpsilonGreedyPolicy diff --git a/examples/toy_env/run_toy_env_simple.py b/examples/toy_env/run_toy_env_simple.py index fc1dcf0c..638bce03 100644 --- a/examples/toy_env/run_toy_env_simple.py +++ b/examples/toy_env/run_toy_env_simple.py @@ -6,7 +6,7 @@ import numpy as np from deer.agent import NeuralAgent -from deer.q_networks.q_net_theano import MyQNetwork +from deer.learning_algo.q_net_keras import MyQNetwork from Toy_env import MyEnv as Toy_env import deer.experiment.base_controllers as bc From bfec3ef1037659f8e8362b768462c772f50895f0 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 11:55:48 -0400 Subject: [PATCH 57/96] fix --- deer/learning_algo/CRAR_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deer/learning_algo/CRAR_keras.py b/deer/learning_algo/CRAR_keras.py index 31691b10..cc4d1198 100644 --- a/deer/learning_algo/CRAR_keras.py +++ b/deer/learning_algo/CRAR_keras.py @@ -70,7 +70,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de """ Initialize environment """ - QNetwork.__init__(self,environment, batch_size) + LearningAlgo.__init__(self,environment, batch_size) self._rho = rho self._rms_epsilon = rms_epsilon From 7d73f57097282e0aa7cba2753ba6d44e268c1ff4 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 12:02:42 -0400 Subject: [PATCH 58/96] removing unnecessary examples --- examples/simplest_test_PLI/run_test.py | 191 ------------ examples/simplest_test_PLI/run_test2.py | 191 ------------ examples/simplest_test_PLI/test_env.py | 336 --------------------- examples/simplest_test_PLI/test_env2.py | 372 ------------------------ 4 files changed, 1090 deletions(-) delete mode 100644 examples/simplest_test_PLI/run_test.py delete mode 100644 examples/simplest_test_PLI/run_test2.py delete mode 100644 examples/simplest_test_PLI/test_env.py delete mode 100644 examples/simplest_test_PLI/test_env2.py diff --git a/examples/simplest_test_PLI/run_test.py b/examples/simplest_test_PLI/run_test.py deleted file mode 100644 index e8925b2f..00000000 --- a/examples/simplest_test_PLI/run_test.py +++ /dev/null @@ -1,191 +0,0 @@ -"""ALE launcher. See Wiki for more details about this experiment. 
- -Authors: Vincent Francois-Lavet, David Taralla -""" - -import sys -import logging -import numpy as np -from joblib import hash, dump -import os - -from deer.default_parser import process_args -from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork -from test_env import MyEnv as test_env -import deer.experiment.base_controllers as bc - -from deer.policies import EpsilonGreedyPolicy - - -class Defaults: - # ---------------------- - # Experiment Parameters - # ---------------------- - STEPS_PER_EPOCH = 500 - EPOCHS = 500 - STEPS_PER_TEST = 20 - PERIOD_BTW_SUMMARY_PERFS = 1 - - # ---------------------- - # Environment Parameters - # ---------------------- - FRAME_SKIP = 2 - - # ---------------------- - # DQN Agent parameters: - # ---------------------- - UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 - LEARNING_RATE_DECAY = 0.99 - DISCOUNT = 0.9 - DISCOUNT_INC = 1 - DISCOUNT_MAX = 0.99 - RMS_DECAY = 0.9 - RMS_EPSILON = 0.0001 - MOMENTUM = 0 - CLIP_DELTA = 1.0 - EPSILON_START = 1.0 - EPSILON_MIN = .1 - EPSILON_DECAY = 10000 - UPDATE_FREQUENCY = 1 - REPLAY_MEMORY_SIZE = 1000000 - BATCH_SIZE = 32 - FREEZE_INTERVAL = 1000 - DETERMINISTIC = True - - - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - # --- Parse parameters --- - parameters = process_args(sys.argv[1:], Defaults) - if parameters.deterministic: - rng = np.random.RandomState(123456) - else: - rng = np.random.RandomState() - - # --- Instantiate environment --- - env = test_env() - - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( - env, - parameters.rms_decay, - parameters.rms_epsilon, - parameters.momentum, - parameters.clip_delta, - parameters.freeze_interval, - parameters.batch_size, - parameters.update_rule, - rng) - - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) - - # --- Instantiate agent --- - agent = NeuralAgent( - env, - qnetwork, - parameters.replay_memory_size, - max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), - parameters.batch_size, - rng, - test_policy=test_policy) - - # --- Create unique filename for FindBestController --- - h = hash(vars(parameters), hash_name="sha1") - fname = "test_" + h - print("The parameters hash is: {}".format(h)) - print("The parameters are: {}".format(parameters)) - - # --- Bind controllers to the agent --- - # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and - # learning rate as well as the training epoch number. - agent.attach(bc.VerboseController( - evaluate_on='epoch', - periodicity=1)) - - # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. - # Plus, we also want to display after each training episode (!= than after every training) the average bellman - # residual and the average of the V values obtained during the last episode, hence the two last arguments. - agent.attach(bc.TrainerController( - evaluate_on='action', - periodicity=parameters.update_frequency, - show_episode_avg_V_value=True, - show_avg_Bellman_residual=True)) - - # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we - # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 
- agent.attach(bc.LearningRateController( - initial_learning_rate=parameters.learning_rate, - learning_rate_decay=parameters.learning_rate_decay, - periodicity=1)) - - # Same for the discount factor. - agent.attach(bc.DiscountFactorController( - initial_discount_factor=parameters.discount, - discount_factor_growth=parameters.discount_inc, - discount_factor_max=parameters.discount_max, - periodicity=1)) - - # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy - # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more - # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every - # episode or epoch (or never, hence the resetEvery='none'). - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one - # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the - # "true generalization score", or "test score". - # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is - # important that the validationID is the same than the id argument of the InterleavedTestEpochController. - # The FindBestController will dump on disk the validation scores for each and every network, as well as the - # structure of the neural network having the best validation score. These dumps can then used to plot the evolution - # of the validation and test scores (see below) or simply recover the resulting neural network for your - # application. - agent.attach(bc.FindBestController( - validationID=test_env.VALIDATION_MODE, - testID=None, - unique_fname=fname)) - - # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a - # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want - # these validation epoch to interfere with the training of the agent, which is well established by the - # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole - # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the - # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards - # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every - # [parameters.period_btw_summary_perfs] *validation* epochs. 
- agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, - epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], - periodicity=2, - show_score=True, - summarize_every=1)) - - # --- Run the experiment --- - try: - os.mkdir("params") - except Exception: - pass - dump(vars(parameters), "params/" + fname + ".jldump") - agent.run(parameters.epochs, parameters.steps_per_epoch) - - # --- Show results --- - basename = "scores/" + fname - scores = joblib.load(basename + "_scores.jldump") - plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') - plt.legend() - plt.xlabel("Number of epochs") - plt.ylabel("Score") - plt.savefig(basename + "_scores.pdf") - plt.show() diff --git a/examples/simplest_test_PLI/run_test2.py b/examples/simplest_test_PLI/run_test2.py deleted file mode 100644 index 5075e14c..00000000 --- a/examples/simplest_test_PLI/run_test2.py +++ /dev/null @@ -1,191 +0,0 @@ -"""ALE launcher. See Wiki for more details about this experiment. - -Authors: Vincent Francois-Lavet, David Taralla -""" - -import sys -import logging -import numpy as np -from joblib import hash, dump -import os - -from deer.default_parser import process_args -from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork -from test_env2 import MyEnv as test_env -import deer.experiment.base_controllers as bc - -from deer.policies import EpsilonGreedyPolicy - - -class Defaults: - # ---------------------- - # Experiment Parameters - # ---------------------- - STEPS_PER_EPOCH = 500 - EPOCHS = 500 - STEPS_PER_TEST = 200 - PERIOD_BTW_SUMMARY_PERFS = 1 - - # ---------------------- - # Environment Parameters - # ---------------------- - FRAME_SKIP = 2 - - # ---------------------- - # DQN Agent parameters: - # ---------------------- - UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 - LEARNING_RATE_DECAY = 0.99 - DISCOUNT = 0.9 - DISCOUNT_INC = 1 - DISCOUNT_MAX = 0.99 - RMS_DECAY = 0.9 - RMS_EPSILON = 0.0001 - MOMENTUM = 0 - CLIP_DELTA = 1.0 - EPSILON_START = 1.0 - EPSILON_MIN = .3 - EPSILON_DECAY = 10000 - UPDATE_FREQUENCY = 1 - REPLAY_MEMORY_SIZE = 1000000 - BATCH_SIZE = 32 - FREEZE_INTERVAL = 1000 - DETERMINISTIC = False - - - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - # --- Parse parameters --- - parameters = process_args(sys.argv[1:], Defaults) - if parameters.deterministic: - rng = np.random.RandomState(123456) - else: - rng = np.random.RandomState() - - # --- Instantiate environment --- - env = test_env() - - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( - env, - parameters.rms_decay, - parameters.rms_epsilon, - parameters.momentum, - parameters.clip_delta, - parameters.freeze_interval, - parameters.batch_size, - parameters.update_rule, - rng) - - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) - - # --- Instantiate agent --- - agent = NeuralAgent( - env, - qnetwork, - parameters.replay_memory_size, - max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), - parameters.batch_size, - rng, - test_policy=test_policy) - - # --- Create unique filename for FindBestController --- - h = hash(vars(parameters), hash_name="sha1") - fname = "test_" + h - print("The parameters hash is: {}".format(h)) - print("The parameters are: {}".format(parameters)) - - # --- Bind controllers to the agent --- - # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and - # learning rate as well as the training 
epoch number. - agent.attach(bc.VerboseController( - evaluate_on='epoch', - periodicity=1)) - - # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. - # Plus, we also want to display after each training episode (!= than after every training) the average bellman - # residual and the average of the V values obtained during the last episode, hence the two last arguments. - agent.attach(bc.TrainerController( - evaluate_on='action', - periodicity=parameters.update_frequency, - show_episode_avg_V_value=True, - show_avg_Bellman_residual=True)) - - # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we - # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. - agent.attach(bc.LearningRateController( - initial_learning_rate=parameters.learning_rate, - learning_rate_decay=parameters.learning_rate_decay, - periodicity=1)) - - # Same for the discount factor. - agent.attach(bc.DiscountFactorController( - initial_discount_factor=parameters.discount, - discount_factor_growth=parameters.discount_inc, - discount_factor_max=parameters.discount_max, - periodicity=1)) - - # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy - # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more - # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every - # episode or epoch (or never, hence the resetEvery='none'). - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one - # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the - # "true generalization score", or "test score". - # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is - # important that the validationID is the same than the id argument of the InterleavedTestEpochController. - # The FindBestController will dump on disk the validation scores for each and every network, as well as the - # structure of the neural network having the best validation score. These dumps can then used to plot the evolution - # of the validation and test scores (see below) or simply recover the resulting neural network for your - # application. - agent.attach(bc.FindBestController( - validationID=test_env.VALIDATION_MODE, - testID=None, - unique_fname=fname)) - - # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a - # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want - # these validation epoch to interfere with the training of the agent, which is well established by the - # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole - # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the - # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards - # obtained, hence the showScore=True. 
Finally, we want to call the summarizePerformance method of ALE_env every - # [parameters.period_btw_summary_perfs] *validation* epochs. - agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, - epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], - periodicity=2, - show_score=True, - summarize_every=1)) - - # --- Run the experiment --- - try: - os.mkdir("params") - except Exception: - pass - dump(vars(parameters), "params/" + fname + ".jldump") - agent.run(parameters.epochs, parameters.steps_per_epoch) - - # --- Show results --- - basename = "scores/" + fname - scores = joblib.load(basename + "_scores.jldump") - plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') - plt.legend() - plt.xlabel("Number of epochs") - plt.ylabel("Score") - plt.savefig(basename + "_scores.pdf") - plt.show() diff --git a/examples/simplest_test_PLI/test_env.py b/examples/simplest_test_PLI/test_env.py deleted file mode 100644 index 4239f4de..00000000 --- a/examples/simplest_test_PLI/test_env.py +++ /dev/null @@ -1,336 +0,0 @@ -""" Interface with the test environment - -Authors: Vincent Francois-Lavet -""" -import numpy as np -import cv2 - -from deer.base_classes import Environment - -import matplotlib -matplotlib.use('qt5agg') -from mpl_toolkits.axes_grid1 import host_subplot -import mpl_toolkits.axisartist as AA -import matplotlib.pyplot as plt - -class MyEnv(Environment): - VALIDATION_MODE = 0 - - def __init__(self): - - self._mode = -1 - self._mode_score = 0.0 - self._mode_episode_count = 0 - - self._actions = [0,1] - self._length_chain=11 - - - def reset(self, mode): - if mode == MyEnv.VALIDATION_MODE: - if self._mode != MyEnv.VALIDATION_MODE: - self._mode = MyEnv.VALIDATION_MODE - self._mode_score = 0.0 - self._mode_episode_count = 0 - else: - self._mode_episode_count += 1 - elif self._mode != -1: # and thus mode == -1 - self._mode = -1 - - self.state=np.zeros(self._length_chain) - self.state[0]=1 - - return self.state - - - def act(self, action): - action = self._actions[action] - - self.reward = 0 - if( self.state[-3]==1 and action==0): - self.reward = 1 - - # self.state[-2] is the end state - # at self.state[-1] the env is reset - if (self.state[-2]==1): - self.state[-2]=0 - self.state[-1]=1 - elif (self.state[-1]==1): - self.state[-1]=0 - self.state[0]=1 - - for i in range(self._length_chain-2): - if(self.state[i]==1): - if (action==0): - self.state[i]=0 - self.state[i+1]=1 - else: - self.state[i]=0 - self.state[-2]=1 - break - - self._mode_score += self.reward - return self.reward - - def summarizePerformance(self, test_data_set, learning_algo): - #print "test_data_set.observations.shape" - #print test_data_set.observations()[0][0:1] - print "print test_data_set.observations()" - print test_data_set.observations() - n=self._length_chain-1 - historics=[] - for i,observ in enumerate(test_data_set.observations()[0][0:n]): - historics.append(np.expand_dims(observ,axis=0)) - historics=np.array(historics) - print "historics" - print historics - abs_states=learning_algo.encoder.predict(historics) - print "abs_states" - print abs_states - actions=test_data_set.actions()[0:n] - print "actions" - print actions - - print actions - print "test_data_set.rewards()[0:n]" - print test_data_set.rewards()[0:n] - print "test_data_set.terminals()[0:n]" - print test_data_set.terminals()[0:n] - if self.inTerminalState() == False: - self._mode_episode_count += 1 - print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / 
(self._mode_episode_count+0.0001), self._mode_episode_count)) - - - import matplotlib.pyplot as plt - from mpl_toolkits.mplot3d import Axes3D - import matplotlib.cm as cm - m = cm.ScalarMappable(cmap=cm.jet) - - x = np.array(abs_states)[:,0] - y = np.array(abs_states)[:,1] - z = np.array(abs_states)[:,2] - - #Colors - #onehot_actions = np.zeros((n, 4)) - #onehot_actions[np.arange(n), actions] = 1 - - fig = plt.figure() - ax = fig.add_subplot(111,projection='3d') - for j in range(3): - # Plot the trajectory - for i in xrange(n-1): - ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) - - # Plot the fitted one-step trajectory from time t=10 - for i in range(n-1): - predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) - predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) - -# for xx in [-2,-1.,0, 1., 2.]: -# for yy in [-2,-1.,0, 1., 2.]: -# for zz in [-2,-1.,0, 1., 2.]: -# predicted1=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[1,0,0]])]) -# predicted2=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,1,0]])]) -# predicted3=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,0,1]])]) -# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted3[0,:1]]), np.concatenate([np.array([yy]),predicted3[0,1:2]]), np.concatenate([np.array([zz]),predicted3[0,2:]]), color="0", alpha=0.5) - #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") - - - # Plot the colorbar for the trajectory - fig.subplots_adjust(right=0.7) - ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) - # Set the colormap and norm to correspond to the data for which the colorbar will be used. - cmap = matplotlib.cm.cool - norm = matplotlib.colors.Normalize(vmin=0, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has - # everything needed for a standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks and labels. 
- cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, - norm=norm, - orientation='vertical') - cb1.set_label('Beginning to end of trajectory') - - - # Plot the dots at each time step depending on the action taken - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) - axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] - zrange=axes_lims[2][1]-axes_lims[2][0] - - # Plot the legend for the dots - from matplotlib.patches import Circle, Rectangle - from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" State representation (action 0, action 1) : ", textprops=dict(color="k")) - - box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k", alpha=0.75) - el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k", alpha=0.75) - #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") - box2.add_artist(el1) - box2.add_artist(el2) - #box2.add_artist(el3) - - box = HPacker(children=[box1, box2], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=box, pad=0., - frameon=True, - bbox_to_anchor=(0., 1.07), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - # Plot the legend for transition estimates - box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) - box2b = DrawingArea(60, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) - el2b = Rectangle((25, 10), 15,2, fc="0.25", alpha=0.75) - box2b.add_artist(el1b) - box2b.add_artist(el2b) - - boxb = HPacker(children=[box1b, box2b], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=boxb, pad=0., - frameon=True, - bbox_to_anchor=(0., 0.98), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') - - - # Plot the Q_vals - c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - #print "actions,C" - #print actions - #print c - #c=np.max(c,axis=1) - m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - - #plt.colorbar(m3) - ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.RdYlGn - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. 
- cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') - #plt.show() - - # fig_visuV - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] - y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] - z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - - c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - c=np.max(c,axis=1) - #print "c" - #print c - - m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) - fig.subplots_adjust(right=0.8) - ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. - cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - - #plt.show() - plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') - - - # fig_visuR - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] - y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] - z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - - coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) - repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) - identity_matrix = np.diag(np.ones(self.nActions())) - tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) - - c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) - c=np.max(np.reshape(c,(125,self.nActions())),axis=1) - #print "c" - #print c - #mini=np.min(c) - #maxi=np.max(c) - - m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) - fig.subplots_adjust(right=0.8) - ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. 
- cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - #plt.show() - plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') - - matplotlib.pyplot.close("all") # avoids memory leaks - - def inputDimensions(self): - return [(1,self._length_chain)] - - def observationType(self, subject): - return np.float32 - - def nActions(self): - return len(self._actions) - - def observe(self): - return [np.array(self.state)] - - def inTerminalState(self): - if (self.state[-1]==1): - return True - else: - return False - - - -if __name__ == "__main__": - pass diff --git a/examples/simplest_test_PLI/test_env2.py b/examples/simplest_test_PLI/test_env2.py deleted file mode 100644 index 6c543fe2..00000000 --- a/examples/simplest_test_PLI/test_env2.py +++ /dev/null @@ -1,372 +0,0 @@ -""" Interface with the test environment - -Authors: Vincent Francois-Lavet -""" -import numpy as np -import cv2 - -from deer.base_classes import Environment - -import matplotlib -matplotlib.use('qt5agg') -from mpl_toolkits.axes_grid1 import host_subplot -import mpl_toolkits.axisartist as AA -import matplotlib.pyplot as plt -import copy - -class MyEnv(Environment): - VALIDATION_MODE = 0 - - def __init__(self): - - self._mode = -1 - self._mode_score = 0.0 - self._mode_episode_count = 0 - - self._actions = [0,1] - self._height=15 - self._width=7 #preferably an odd number so that it's symmetrical - self._nx_block=3 #number of different x positions of the falling blocks - if(self._nx_block==1): - self._x_block=self._width//2 - else: - rand=np.random.randint(self._nx_block) # random selection of the pos for falling block - self._x_block=rand*((self._width-1)//(self._nx_block-1)) # traduction in a number in [0,self._width] of rand - - - def reset(self, mode): - if mode == MyEnv.VALIDATION_MODE: - if self._mode != MyEnv.VALIDATION_MODE: - self._mode = MyEnv.VALIDATION_MODE - self._mode_score = 0.0 - self._mode_episode_count = 0 - else: - self._mode_episode_count += 1 - elif self._mode != -1: # and thus mode == -1 - self._mode = -1 - - self.y=self._height-1 - self.x=self._width//2 - if(self._nx_block==1): - self._x_block=self._width//2 - else: - rand=np.random.randint(self._nx_block) # random selection of the pos for falling block - self._x_block=rand*((self._width-1)//(self._nx_block-1)) # traduction in a number in [0,self._width] of rand - - return np.array([[0,0,0,1,0,1,0]]) #[0,0,1]+[0,1,0] - - - def act(self, action): - action = self._actions[action] - - if(action==0): - self.x = max(self.x-1,0) - if(action==1): - self.x = min(self.x+1,self._width-1) - - self.y = self.y-1 - - if(self.y==0 and self.x==self._x_block): - self.reward = 1 - elif(self.y==0): - self.reward = -1 - else: - self.reward = 0 - - self._mode_score += self.reward - return self.reward - - def summarizePerformance(self, test_data_set, learning_algo): - #print "test_data_set.observations.shape" - #print test_data_set.observations()[0][0:1] - - possib_y = np.zeros((self._height-1, self._height)) - possib_y[np.arange(self._height-1), 1+np.arange(self._height-1)] = 1 - possib_x=np.diag(np.ones(self._width)) - rep_x=np.tile(np.repeat(possib_x,self._height-1,axis=0),(self._nx_block,1)) - rep_y=np.tile(np.tile(possib_y,(self._width,1)),(self._nx_block,1)) - if(self._nx_block==1): - possib_x_block=np.zeros((1,self._width)) - possib_x_block[0,self._width//2]=1 - else: - possib_x_block=[] - for i in range(self._nx_block): - one_hot_x_block=np.zeros((self._width)) - 
j=i*((self._width-1)//(self._nx_block-1)) - one_hot_x_block[j]=1 - possib_x_block.append(one_hot_x_block) - rep_x_block=np.repeat(np.array(possib_x_block),(self._height-1)*self._width,axis=0) - all_possib_inp=np.expand_dims(np.concatenate((rep_y,rep_x,rep_x_block),axis=1),axis=1) - all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) - print "learning_algo.encoder.predict(all_possib_inp)" - print all_possib_abs_states - - print "print test_data_set.observations()" - print test_data_set.observations() - n=self._height-1 - historics=[] - for i,observ in enumerate(test_data_set.observations()[0][0:n]): - historics.append(np.expand_dims(observ,axis=0)) - historics=np.array(historics) - print "historics" - print historics - abs_states=learning_algo.encoder.predict(historics) - print "abs_states" - print abs_states - actions=test_data_set.actions()[0:n] - print "actions" - print actions - - print actions - print "test_data_set.rewards()[0:n]" - print test_data_set.rewards()[0:n] - print "test_data_set.terminals()[0:n]" - print test_data_set.terminals()[0:n] - if self.inTerminalState() == False: - self._mode_episode_count += 1 - print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) - - - import matplotlib.pyplot as plt - from mpl_toolkits.mplot3d import Axes3D - import matplotlib.cm as cm - m = cm.ScalarMappable(cmap=cm.jet) - - x = np.array(abs_states)[:,0] - y = np.array(abs_states)[:,1] - z = np.array(abs_states)[:,2] - - #Colors - #onehot_actions = np.zeros((n, 4)) - #onehot_actions[np.arange(n), actions] = 1 - - fig = plt.figure() - ax = fig.add_subplot(111,projection='3d') - for j in range(3): - # Plot the trajectory - for i in xrange(n-1): - ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) - - # Plot the estimated transitions - for i in range(n-1): - predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) - predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) - -# for xx in np.arange(self._width)-self._width//2: -# for yy in np.arange(self._width)-self._width//2: -# for zz in np.arange(self._width)-self._width//2: -# predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) -# predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) -# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) - - - # Plot the colorbar for the trajectory - fig.subplots_adjust(right=0.7) - ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) - # Set the colormap and norm to correspond to the data for which the colorbar will be used. 
- cmap = matplotlib.cm.cool - norm = matplotlib.colors.Normalize(vmin=0, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has - # everything needed for a standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks and labels. - cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, - norm=norm, - orientation='vertical') - cb1.set_label('Beginning to end of trajectory') - - - # Plot the dots at each time step depending on the action taken - length_block=(self._height-1)*self._width - for i in range(self._nx_block): - line3 = ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.2) - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/2.,axis=1),(1,3))-0.25, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) - axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] - zrange=axes_lims[2][1]-axes_lims[2][0] - - # Plot the legend for the dots - from matplotlib.patches import Circle, Rectangle - from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) - - box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="0.75", edgecolor="k", alpha=0.75) - el2 = Circle((30, 10), 5, fc="0.25", edgecolor="k", alpha=0.75) - #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") - box2.add_artist(el1) - box2.add_artist(el2) - #box2.add_artist(el3) - - - box = HPacker(children=[box1, box2], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=box, pad=0., - frameon=True, - bbox_to_anchor=(0., 1.07), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - - # Plot the legend for transition estimates - box1b = TextArea(" Estimated transitions (action 0, action 1): ", textprops=dict(color="k")) - box2b = DrawingArea(60, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.75", alpha=0.75) - el2b = Rectangle((25, 10), 15,2, fc="0.25", alpha=0.75) - box2b.add_artist(el1b) - box2b.add_artist(el2b) - - boxb = HPacker(children=[box1b, box2b], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=boxb, pad=0., - frameon=True, - bbox_to_anchor=(0., 0.98), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - - - ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - #plt.show() - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') - - - # Plot the Q_vals - c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - #print "actions,C" - #print actions - #print c - #c=np.max(c,axis=1) - m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) - - #plt.colorbar(m3) - ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.RdYlGn - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. 
There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. - cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') - - - # fig_visuV - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] - y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] - z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - - c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - c=np.max(c,axis=1) - #print "c" - #print c - - m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) - fig.subplots_adjust(right=0.8) - ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. - cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - #plt.show() - plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') - - - # fig_visuR - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] - y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] - z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] - - coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) - repeat_nactions_coord=np.repeat(coords,self.nActions(),axis=0) - identity_matrix = np.diag(np.ones(self.nActions())) - tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) - - c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) - c=np.max(np.reshape(c,(125,self.nActions())),axis=1) - #print "c" - #print c - #mini=np.min(c) - #maxi=np.max(c) - - m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) - fig.subplots_adjust(right=0.8) - ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) - cmap = matplotlib.cm.hot - norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar - # in a specified axes, so it has everything needed for a - # standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks - # and labels. 
- cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') - cb1.set_label('Estimated expected return') - - #plt.show() - plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') - - matplotlib.pyplot.close("all") # avoids memory leaks - - def inputDimensions(self): - return [(1,self._height+self._width+self._width)] - - def observationType(self, subject): - return np.float32 - - def nActions(self): - return len(self._actions) - - def observe(self): - one_hot_x=np.zeros(self._width) - one_hot_x[self.x]=1 - one_hot_y=np.zeros(self._height) - one_hot_y[self.y]=1 - one_hot_x_block=np.zeros(self._width) - one_hot_x_block[self._x_block]=1 - return [np.array(list(one_hot_y)+list(one_hot_x)+list(one_hot_x_block))] - - def inTerminalState(self): - if (self.y==0): - return True - else: - return False - - - -if __name__ == "__main__": - pass From 8437e87285ee4a177ffe4e25aa612c5cb4874c92 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 12:05:03 -0400 Subject: [PATCH 59/96] some rename --- examples/{simplest_test_PLI => test_CRAR}/catcher_env.py | 0 examples/{simplest_test_PLI => test_CRAR}/run_catcher.py | 0 examples/{simplest_test_PLI => test_CRAR}/run_simple_maze.py | 0 examples/{simplest_test_PLI => test_CRAR}/simple_maze_env.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename examples/{simplest_test_PLI => test_CRAR}/catcher_env.py (100%) rename examples/{simplest_test_PLI => test_CRAR}/run_catcher.py (100%) rename examples/{simplest_test_PLI => test_CRAR}/run_simple_maze.py (100%) rename examples/{simplest_test_PLI => test_CRAR}/simple_maze_env.py (100%) diff --git a/examples/simplest_test_PLI/catcher_env.py b/examples/test_CRAR/catcher_env.py similarity index 100% rename from examples/simplest_test_PLI/catcher_env.py rename to examples/test_CRAR/catcher_env.py diff --git a/examples/simplest_test_PLI/run_catcher.py b/examples/test_CRAR/run_catcher.py similarity index 100% rename from examples/simplest_test_PLI/run_catcher.py rename to examples/test_CRAR/run_catcher.py diff --git a/examples/simplest_test_PLI/run_simple_maze.py b/examples/test_CRAR/run_simple_maze.py similarity index 100% rename from examples/simplest_test_PLI/run_simple_maze.py rename to examples/test_CRAR/run_simple_maze.py diff --git a/examples/simplest_test_PLI/simple_maze_env.py b/examples/test_CRAR/simple_maze_env.py similarity index 100% rename from examples/simplest_test_PLI/simple_maze_env.py rename to examples/test_CRAR/simple_maze_env.py From 44ae64be2dcce0a38d234074e937da180a3599d9 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 12:21:25 -0400 Subject: [PATCH 60/96] remove some unnecessary comments --- deer/agent.py | 1 - deer/base_classes/{Environment.py => environment.py} | 3 ++- deer/base_classes/{Policy.py => policy.py} | 5 +++++ deer/default_parser.py | 1 - deer/learning_algo/AC_net_keras.py | 1 - deer/policies/LongerExplorationPolicy.py | 2 +- examples/ALE/ALE_env.py | 2 +- examples/ALE/run_ALE.py | 3 +-- examples/MG_two_storages/MG_two_storages_env.py | 1 - examples/MG_two_storages/run_MG_two_storages.py | 1 - examples/PLE/PLE_env.py | 4 ++-- examples/gym/run_mountain_car.py | 1 - examples/gym/run_mountain_car_continuous.py | 1 - examples/test_CRAR/catcher_env.py | 1 - examples/test_CRAR/simple_maze_env.py | 2 -- examples/toy_env/Toy_env.py | 1 - examples/toy_env/run_toy_env.py | 1 - examples/toy_env/run_toy_env_simple.py | 1 - 18 files changed, 12 insertions(+), 20 deletions(-) rename 
deer/base_classes/{Environment.py => environment.py} (99%) rename deer/base_classes/{Policy.py => policy.py} (96%) diff --git a/deer/agent.py b/deer/agent.py index 8d774f10..edd26ca7 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -2,7 +2,6 @@ It relies on the controllers, the chosen training/test policy and the learning algorithm to specify its behavior in the environment. -.. Authors: Vincent Francois-Lavet, David Taralla """ import os diff --git a/deer/base_classes/Environment.py b/deer/base_classes/environment.py similarity index 99% rename from deer/base_classes/Environment.py rename to deer/base_classes/environment.py index e7f7af89..26e2bcba 100644 --- a/deer/base_classes/Environment.py +++ b/deer/base_classes/environment.py @@ -1,5 +1,6 @@ """ -.. Authors: Vincent Francois-Lavet, David Taralla +This module defines the base class for the environments. + """ import numpy as np diff --git a/deer/base_classes/Policy.py b/deer/base_classes/policy.py similarity index 96% rename from deer/base_classes/Policy.py rename to deer/base_classes/policy.py index 673088a3..436eb53a 100644 --- a/deer/base_classes/Policy.py +++ b/deer/base_classes/policy.py @@ -1,3 +1,8 @@ +""" +This module defines the base class for the policies. + +""" + import numpy as np class Policy(object): diff --git a/deer/default_parser.py b/deer/default_parser.py index 9af0976d..e8470a0a 100644 --- a/deer/default_parser.py +++ b/deer/default_parser.py @@ -1,6 +1,5 @@ """This module contains a function to help parse command-line arguments. -Authors: Vincent Francois-Lavet, David Taralla """ diff --git a/deer/learning_algo/AC_net_keras.py b/deer/learning_algo/AC_net_keras.py index 9f82b92a..00dac986 100644 --- a/deer/learning_algo/AC_net_keras.py +++ b/deer/learning_algo/AC_net_keras.py @@ -1,7 +1,6 @@ """ Code for the actor-critic "DDPG" (https://arxiv.org/abs/1509.02971) -.. Author: Vincent Francois-Lavet """ import sys diff --git a/deer/policies/LongerExplorationPolicy.py b/deer/policies/LongerExplorationPolicy.py index 35a1f55b..3ad07aaf 100644 --- a/deer/policies/LongerExplorationPolicy.py +++ b/deer/policies/LongerExplorationPolicy.py @@ -1,5 +1,5 @@ """ Exploration policy for permutation invariant environments -Authors: Vincent Francois-Lavet, Adrien Couetoux + """ from ..base_classes import Policy diff --git a/examples/ALE/ALE_env.py b/examples/ALE/ALE_env.py index 7180a565..8a16f0a9 100644 --- a/examples/ALE/ALE_env.py +++ b/examples/ALE/ALE_env.py @@ -1,7 +1,7 @@ """ Interface with the ALE environment -Authors: Vincent Francois-Lavet, David Taralla """ + import numpy as np import cv2 from ale_python_interface import ALEInterface diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index c210df58..27c4dbd5 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -1,6 +1,5 @@ """ALE launcher. See Wiki for more details about this experiment. 
-Authors: Vincent Francois-Lavet, David Taralla """ import sys @@ -11,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork +from deer.learning_algo.CRAR_keras import CRAR from ALE_env_gym import MyEnv as ALE_env import deer.experiment.base_controllers as bc diff --git a/examples/MG_two_storages/MG_two_storages_env.py b/examples/MG_two_storages/MG_two_storages_env.py index 5d3cfbbf..8561871f 100644 --- a/examples/MG_two_storages/MG_two_storages_env.py +++ b/examples/MG_two_storages/MG_two_storages_env.py @@ -11,7 +11,6 @@ More information can be found in the paper to be published : Efficient decision making in stochastic micro-grids using deep reinforcement learning, Vincent Francois-Lavet, David Taralla, Raphael Fonteneau, Damien Ernst -Authors: Vincent Francois-Lavet, David Taralla """ import numpy as np diff --git a/examples/MG_two_storages/run_MG_two_storages.py b/examples/MG_two_storages/run_MG_two_storages.py index 6286d3c5..36aec0ef 100644 --- a/examples/MG_two_storages/run_MG_two_storages.py +++ b/examples/MG_two_storages/run_MG_two_storages.py @@ -1,6 +1,5 @@ """2-Storage Microgrid launcher. See the docs for more details about this experiment. -Authors: Vincent Francois-Lavet, David Taralla """ import sys diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py index 30b9cc78..b4bd8788 100644 --- a/examples/PLE/PLE_env.py +++ b/examples/PLE/PLE_env.py @@ -1,7 +1,7 @@ """ Interface with the PLE environment -Authors: Vincent Francois-Lavet, David Taralla -Modified by: Norman Tasfi + """ + import numpy as np import cv2 from ple import PLE diff --git a/examples/gym/run_mountain_car.py b/examples/gym/run_mountain_car.py index e4e877ea..3d95f846 100644 --- a/examples/gym/run_mountain_car.py +++ b/examples/gym/run_mountain_car.py @@ -1,7 +1,6 @@ """ Mountain car environment launcher. Same principles as run_toy_env. See the docs for more details. -Authors: Vincent Francois-Lavet, David Taralla """ import sys diff --git a/examples/gym/run_mountain_car_continuous.py b/examples/gym/run_mountain_car_continuous.py index 128b7570..344c819b 100644 --- a/examples/gym/run_mountain_car_continuous.py +++ b/examples/gym/run_mountain_car_continuous.py @@ -1,7 +1,6 @@ """ Launcher for mountain car environment with continuous action space. Same principles as run_toy_env. See the wiki for more details. -Author: Vincent Francois-Lavet """ import sys diff --git a/examples/test_CRAR/catcher_env.py b/examples/test_CRAR/catcher_env.py index 983d80fb..6f726e7d 100644 --- a/examples/test_CRAR/catcher_env.py +++ b/examples/test_CRAR/catcher_env.py @@ -1,6 +1,5 @@ """ Interface with the catcher environment -Authors: Vincent Francois-Lavet """ import numpy as np import cv2 diff --git a/examples/test_CRAR/simple_maze_env.py b/examples/test_CRAR/simple_maze_env.py index 82499be5..fd027a8a 100644 --- a/examples/test_CRAR/simple_maze_env.py +++ b/examples/test_CRAR/simple_maze_env.py @@ -1,7 +1,5 @@ """ Simple maze environment -Authors: Vincent Francois-Lavet - """ import numpy as np import cv2 diff --git a/examples/toy_env/Toy_env.py b/examples/toy_env/Toy_env.py index 7b9ffa50..ddcd59e6 100644 --- a/examples/toy_env/Toy_env.py +++ b/examples/toy_env/Toy_env.py @@ -8,7 +8,6 @@ - Either the agent possesses the good or not (1 or 0) The price signal is build following the same rules for the training and the validation environment. That allows the agent to learn a strategy that exploits this successfully. 
-Authors: Vincent Francois-Lavet, David Taralla """ import numpy as np diff --git a/examples/toy_env/run_toy_env.py b/examples/toy_env/run_toy_env.py index acf8b9bf..cadaf9f2 100644 --- a/examples/toy_env/run_toy_env.py +++ b/examples/toy_env/run_toy_env.py @@ -1,6 +1,5 @@ """Toy environment launcher. See the docs for more details about this environment. -Authors: Vincent Francois-Lavet, David Taralla """ import sys diff --git a/examples/toy_env/run_toy_env_simple.py b/examples/toy_env/run_toy_env_simple.py index 638bce03..14bf68b3 100644 --- a/examples/toy_env/run_toy_env_simple.py +++ b/examples/toy_env/run_toy_env_simple.py @@ -1,6 +1,5 @@ """Toy environment launcher. See the docs for more details about this environment. -Authors: Vincent Francois-Lavet, David Taralla """ import numpy as np From c40851ddb055f69fdc61aa6d0151177b679ab6cd Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 13:24:00 -0400 Subject: [PATCH 61/96] some cleaning --- deer/learning_algo/CRAR_keras.py | 38 +++-------------------------- deer/learning_algo/NN_CRAR_keras.py | 38 +---------------------------- 2 files changed, 4 insertions(+), 72 deletions(-) diff --git a/deer/learning_algo/CRAR_keras.py b/deer/learning_algo/CRAR_keras.py index cc4d1198..7b6d4da9 100644 --- a/deer/learning_algo/CRAR_keras.py +++ b/deer/learning_algo/CRAR_keras.py @@ -16,9 +16,8 @@ import copy def mean_squared_error_p(y_true, y_pred): - return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error - #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = mse error - #return K.mean( K.square( K.clip(K.abs(y_pred - y_true)-1,0.,100.) ) , axis=-1 ) # = mse error + return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = modified mse error L_inf + #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = modified mse error L_2 def exp_dec_error(y_true, y_pred): return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred @@ -28,12 +27,6 @@ def cosine_proximity2(y_true, y_pred): y_pred = K.l2_normalize(y_pred[:,0:2], axis=-1) return -K.sum(y_true * y_pred, axis=-1) -#def rms_from_squared_components(y_true, y_pred): -# return - K.sum( K.sqrt( K.clip(y_pred,0.000001,1)) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1# -# -#def squared_error_from_squared_components(y_true, y_pred): -# return - K.sum( K.clip(y_pred,0.,1) , axis=-1, keepdims=True ) # tend to increase y_pred --> loss -1 - def loss_diff_s_s_(y_true, y_pred): return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. 
) ) ) # tend to increase y_pred --> loss -1 @@ -190,12 +183,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 states_val=list(states_val) next_states_val=list(next_states_val) - #for i,o in enumerate(states_val): - # if(o.ndim==5): #FIXME - # states_val[i]=states_val[i][:,0,:,:,:]/128.-1 - #for i,o in enumerate(next_states_val): - # if(o.ndim==5): #FIXME - # next_states_val[i]=next_states_val[i][:,0,:,:,:]/128.-1 Es_=self.encoder.predict(next_states_val) Es=self.encoder.predict(states_val) @@ -331,7 +318,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if self.update_counter % self._freeze_interval == 0: self._resetQHat() - #next_q_vals = self.next_full_Q.predict([next_states_val[0],np.zeros_like(Es)]) #np.zeros((32,self.learn_and_plan.internal_dim))]) next_q_vals = self.full_Q_target.predict(next_states_val) if(self._double_Q==True): @@ -346,7 +332,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) - #q_vals=self.full_Q.predict([states_val[0],np.zeros_like(Es)]) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))]) q_vals=self.full_Q.predict([states_val[0]]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff @@ -366,8 +351,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals loss=0 #loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) loss=self.full_Q.train_on_batch(states_val , q_vals ) - #print "self.q_vals.optimizer.lr" - #print K.eval(self.q_vals.optimizer.lr) self.loss_Q+=loss if(self.update_counter%100==0): @@ -437,9 +420,6 @@ def qValues(self, state_val): The q values for the provided belief state """ copy_state=copy.deepcopy(state_val) #Required because of the "hack" below - #for i,o in enumerate(state): - # if(o.ndim==4): #FIXME - # copy_state[i]=copy_state[i][0,:,:,:]/128.-1 #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] return self.full_Q.predict([np.expand_dims(state,axis=0) for state in copy_state])[0] @@ -610,8 +590,6 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f r_vals_d0=r_vals_d0.flatten() gamma_vals_d0=np.array(gamma.predict([tile3_encoded_x,repeat_identity])) - #print "r_vals_d0" - #print r_vals_d0 gamma_vals_d0=gamma_vals_d0.flatten() next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) @@ -629,9 +607,6 @@ def chooseBestAction(self, state, mode): The best action : int """ copy_state=copy.deepcopy(state) #Required because of the "hack" below - #for i,o in enumerate(state): - # if(o.ndim==4): #FIXME - # copy_state[i]=copy_state[i][0,:,:,:]/128.-1 if(mode==None): mode=0 @@ -723,15 +698,8 @@ def transfer(self, original, transfer, epochs=1): # First, make sure that the target network and the current network are the same self._resetQHat() # modify the loss of the encoder - #self.encoder=self.learn_and_plan.encoder_model() - #for l in self.encoder.layers[-5:]: - # l.trainable = False # Freeze dense layers # DOES NOT SEEM TO HELP (transfer on catcher) - #print "self.encoder.layers[-1].get_weights()" - #print self.encoder.layers[-1].get_weights() - optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - self.encoder.compile(optimizer=optimizer4, - 
loss='mse') + self.encoder.compile(optimizer=optimizer4, loss='mse') # Then, train the encoder such that the original and transfer states are mapped into the same abstract representation x_original=self.encoder.predict(original)#[0] diff --git a/deer/learning_algo/NN_CRAR_keras.py b/deer/learning_algo/NN_CRAR_keras.py index 4bfee42b..4ceffd40 100644 --- a/deer/learning_algo/NN_CRAR_keras.py +++ b/deer/learning_algo/NN_CRAR_keras.py @@ -363,44 +363,8 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): print "Tx._keras_shape" print Tx._keras_shape - -# input = Input(shape=(self.internal_dim,self._n_actions)) -# inputs.append(input) -# -# #if(high_int_dim==True): -# # Tx_tiled=K.tile(Tx,(self._n_actions,1,1,1)) -# #else: -# # Tx_tiled=K.tile(Tx,(self._n_actions,1)) -# -# for i in range self._n_actions: -# #constants = np.zeros((self._n_actions)) -# #k_constants = K.variable(constants) -# #fixed_input = Input(tensor=k_constants) -# Tx= transition_model([Tx,constants]) -# Tx_tiled=Dot(axes=(-1))([Tx,fixed_input]) -# -# print "Tx_tiled._keras_shape" -# print Tx_tiled._keras_shape - + diff_features = Subtract()([Tx,enc_x]) # Modification of the features after (sequence of) action(s) - - #print "K.eval(diff_features)" - #print diff_features.output - #inputs.append(Input(shape=(self.internal_dim,))) - #cos_proxi=Dot(axes=(-1),normalize=True)([diff_features,inputs[-1]]) # Cosine proximity between diff_features and target_modif_features - - #constants = np.ones((self.internal_dim,))#((self._batch_size*self._n_actions,self.internal_dim,)) - #k_constants = K.variable(constants) - #fixed_input = Input(tensor=k_constants) - #inputs.append(fixed_input) - #print "fixed_input._keras_shape" - #print fixed_input._keras_shape - #cos_proxi_add1=Subtract()([fixed_input,cos_proxi]) - - #print "cos_proxi.output" - #print cos_proxi.output - #print "cos_proxi._keras_shape" - #print cos_proxi._keras_shape model = Model(inputs=inputs, outputs=diff_features ) From 87c2024467926dc0b4fd5ddac0ce05d2aa24448d Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 13:53:15 -0400 Subject: [PATCH 62/96] fixing doc --- deer/agent.py | 59 ++++++++------------- deer/base_classes/learning_algo.py | 4 +- deer/learning_algo/AC_net_keras.py | 4 +- deer/learning_algo/q_net_keras.py | 4 +- docs/index.rst | 2 +- docs/modules/q-networks.rst | 32 ----------- docs/user/environments.rst | 3 +- docs/user/environments/toy_env_pendulum.rst | 59 --------------------- docs/user/tutorial.rst | 4 +- examples/gym/mountain_car_env.py | 3 +- examples/gym/pendulum_env.py | 2 +- examples/toy_env/Toy_env.py | 2 - 12 files changed, 35 insertions(+), 143 deletions(-) delete mode 100644 docs/modules/q-networks.rst delete mode 100644 docs/user/environments/toy_env_pendulum.rst diff --git a/deer/agent.py b/deer/agent.py index edd26ca7..6f7667d7 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -1,4 +1,5 @@ -"""This module contains classes used to define the standard behavior of the agent. +""" +This module contains classes used to define the standard behavior of the agent. It relies on the controllers, the chosen training/test policy and the learning algorithm to specify its behavior in the environment. @@ -16,7 +17,7 @@ from deer.policies import EpsilonGreedyPolicy class NeuralAgent(object): - """The NeuralAgent class wraps a deep Q-network for training and testing in a given environment. + """The NeuralAgent class wraps a learning algorithm (such as a deep Q-network) for training and testing in a given environment. 
Attach controllers to it in order to conduct an experiment (when to train the agent, when to test,...). @@ -24,8 +25,8 @@ class NeuralAgent(object): ----------- environment : object from class Environment The environment in which the agent interacts - q_network : object from class QNetwork - The q_network associated to the agent + learning_algo : object from class LearningAlgo + The learning algorithm associated to the agent replay_memory_size : int Size of the replay memory. Default : 1000000 replay_start_size : int @@ -47,7 +48,7 @@ class NeuralAgent(object): observations before the beginning of the episode """ - def __init__(self, environment, q_network, replay_memory_size=1000000, replay_start_size=None, batch_size=32, random_state=np.random.RandomState(), exp_priority=0, train_policy=None, test_policy=None, only_full_history=True): + def __init__(self, environment, learning_algo, replay_memory_size=1000000, replay_start_size=None, batch_size=32, random_state=np.random.RandomState(), exp_priority=0, train_policy=None, test_policy=None, only_full_history=True): inputDims = environment.inputDimensions() if replay_start_size == None: @@ -57,7 +58,7 @@ def __init__(self, environment, q_network, replay_memory_size=1000000, replay_st self._controllers = [] self._environment = environment - self._network = q_network + self._learning_algo = learning_algo self._replay_memory_size = replay_memory_size self._replay_start_size = replay_start_size self._batch_size = batch_size @@ -77,11 +78,11 @@ def __init__(self, environment, q_network, replay_memory_size=1000000, replay_st for i in range(len(inputDims)): self._state.append(np.zeros(inputDims[i], dtype=float)) if (train_policy==None): - self._train_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.1) + self._train_policy = EpsilonGreedyPolicy(learning_algo, environment.nActions(), random_state, 0.1) else: self._train_policy = train_policy if (test_policy==None): - self._test_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.) + self._test_policy = EpsilonGreedyPolicy(learning_algo, environment.nActions(), random_state, 0.) else: self._test_policy = test_policy self.gathering_data=True # Whether the agent is gathering data or not @@ -96,22 +97,22 @@ def setControllersActive(self, toDisable, active): def setLearningRate(self, lr): """ Set the learning rate for the gradient descent """ - self._network.setLearningRate(lr) + self._learning_algo.setLearningRate(lr) def learningRate(self): """ Get the learning rate """ - return self._network.learningRate() + return self._learning_algo.learningRate() def setDiscountFactor(self, df): """ Set the discount factor """ - self._network.setDiscountFactor(df) + self._learning_algo.setDiscountFactor(df) def discountFactor(self): """ Get the discount factor """ - return self._network.discountFactor() + return self._learning_algo.discountFactor() def overrideNextAction(self, action): """ Possibility to override the chosen action. This possibility should be used on the signal OnActionChosen. 
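To make the renamed interface concrete, here is a rough wiring sketch rather than part of this patch: the environment class MyEnv is a placeholder, the import path of the learning algorithm assumes the q_networks -> learning_algo move, and the constructor values simply mirror the Defaults of the removed run_test2.py launcher.

import numpy as np

from deer.agent import NeuralAgent
from deer.learning_algo.q_net_keras import MyQNetwork   # assumed path after the rename
from deer.policies import EpsilonGreedyPolicy

rng = np.random.RandomState(123456)
env = MyEnv(rng)                  # placeholder: any Environment subclass

# Learning algorithm; the positional arguments follow the pattern of the removed launcher.
learning_algo = MyQNetwork(
    env,
    0.9,          # rms_decay
    0.0001,       # rms_epsilon
    0,            # momentum
    1.0,          # clip_delta
    1000,         # freeze_interval
    32,           # batch_size
    'rmsprop',    # update_rule
    rng)

# Optional explicit test policy; when omitted, the agent builds its own
# EpsilonGreedyPolicy around learning_algo (epsilon 0.1 for training, 0. for testing).
test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.05)

agent = NeuralAgent(
    env,
    learning_algo,                # this argument was called q_network before this patch
    replay_memory_size=1000000,
    batch_size=32,
    random_state=rng,
    test_policy=test_policy)
agent.run(n_epochs=500, epoch_length=500)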
@@ -140,13 +141,6 @@ def totalRewardOverLastTest(self): """ return self._total_mode_reward/self._totalModeNbrEpisode, self._totalModeNbrEpisode -# def bestAction(self): -# """ Returns the best Action -# """ -# action = self._network.chooseBestAction(self._state) -# V = max(self._network.qValues(self._state)) -# return action, V - def attach(self, controller): if (isinstance(controller, controllers.Controller)): self._controllers.append(controller) @@ -179,12 +173,12 @@ def summarizeTestPerformance(self): if self._mode == -1: raise AgentError("Cannot summarize test performance outside test environment.") - self._environment.summarizePerformance(self._tmp_dataset, self._network, train_data_set=self._dataset) + self._environment.summarizePerformance(self._tmp_dataset, self._learning_algo, train_data_set=self._dataset) def train(self): """ This function selects a random batch of data (with self._dataset.randomBatch) and performs a - Q-learning iteration (with self._network.train). + Q-learning iteration (with self._learning_algo.train). """ # We make sure that the number of elements in the replay memory # is strictly superior to self._replay_start_size before taking @@ -193,12 +187,12 @@ def train(self): return try: - if hasattr(self._network, 'nstep'): - observations, actions, rewards, terminals, rndValidIndices = self._dataset.randomBatch_nstep(self._batch_size, self._network.nstep, self._exp_priority) - loss, loss_ind = self._network.train(observations, actions, rewards, terminals) + if hasattr(self._learning_algo, 'nstep'): + observations, actions, rewards, terminals, rndValidIndices = self._dataset.randomBatch_nstep(self._batch_size, self._learning_algo.nstep, self._exp_priority) + loss, loss_ind = self._learning_algo.train(observations, actions, rewards, terminals) else: states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(self._batch_size, self._exp_priority) - loss, loss_ind = self._network.train(states, actions, rewards, next_states, terminals) + loss, loss_ind = self._learning_algo.train(states, actions, rewards, next_states, terminals) self._training_loss_averages.append(loss) if (self._exp_priority): @@ -227,7 +221,7 @@ def dumpNetwork(self, fname, nEpoch=-1): if fname in f: os.remove("nnets/" + f) - all_params = self._network.getAllParams() + all_params = self._learning_algo.getAllParams() if (nEpoch>=0): joblib.dump(all_params, basename + ".epoch={}".format(nEpoch)) @@ -252,7 +246,7 @@ def setNetwork(self, fname, nEpoch=-1): else: all_params = joblib.load(basename) - self._network.setAllParams(all_params) + self._learning_algo.setAllParams(all_params) def run(self, n_epochs, epoch_length): """ @@ -274,16 +268,7 @@ def run(self, n_epochs, epoch_length): while i < n_epochs or self._mode_epochs_length > 0: self._training_loss_averages = [] - if self._mode != -1: - #loss=0 - #for ii in range(10000): - # states, actions, rewards, next_states, terminals, rndValidIndices = self._dataset.randomBatch(self._batch_size, self._exp_priority) - # loss+=self._network.train_model(states, actions, rewards, next_states, terminals) - # if(ii%100==99): - # print "loss T before valid or test" - # print loss/100. 
- # loss=0 - + if self._mode != -1: self._totalModeNbrEpisode=0 while self._mode_epochs_length > 0: self._totalModeNbrEpisode += 1 diff --git a/deer/base_classes/learning_algo.py b/deer/base_classes/learning_algo.py index f1da26dc..80fecefe 100644 --- a/deer/base_classes/learning_algo.py +++ b/deer/base_classes/learning_algo.py @@ -29,12 +29,12 @@ def train(self, states, actions, rewards, nextStates, terminals): raise NotImplementedError() def chooseBestAction(self, state): - """ Get the best action for a belief state + """ Get the best action for a pseudo-state """ raise NotImplementedError() def qValues(self, state): - """ Get the q value for one belief state + """ Get the q value for one pseudo-state """ raise NotImplementedError() diff --git a/deer/learning_algo/AC_net_keras.py b/deer/learning_algo/AC_net_keras.py index 00dac986..b3f5433c 100644 --- a/deer/learning_algo/AC_net_keras.py +++ b/deer/learning_algo/AC_net_keras.py @@ -207,11 +207,11 @@ def gradients(self, states, actions): return out def chooseBestAction(self, state): - """ Get the best action for a belief state + """ Get the best action for a pseudo-state Arguments --------- - state : one belief state + state : one pseudo-state Returns ------- diff --git a/deer/learning_algo/q_net_keras.py b/deer/learning_algo/q_net_keras.py index 17bb8818..ddd41113 100644 --- a/deer/learning_algo/q_net_keras.py +++ b/deer/learning_algo/q_net_keras.py @@ -148,11 +148,11 @@ def qValues(self, state_val): return self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val])[0] def chooseBestAction(self, state, *args, **kwargs): - """ Get the best action for a belief state + """ Get the best action for a pseudo-state Arguments --------- - state : one belief state + state : one pseudo-state Returns ------- diff --git a/docs/index.rst b/docs/index.rst index 52abf77d..46e6f8d0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -65,7 +65,7 @@ If you are looking for information on a specific function, class or method, this modules/agents modules/controllers modules/environments - modules/q-networks + modules/learning-algorithms modules/policies Indices and tables diff --git a/docs/modules/q-networks.rst b/docs/modules/q-networks.rst deleted file mode 100644 index 1d7b7323..00000000 --- a/docs/modules/q-networks.rst +++ /dev/null @@ -1,32 +0,0 @@ -:mod:`Learning algorithms` -========================== - -Q-learning ---------------------- -.. autosummary:: - - deer.base_classes.QNetwork - deer.q_networks.q_net_theano.MyQNetwork - deer.q_networks.q_net_keras.MyQNetwork - -.. _actor-critic: - -Actor-critic learning ---------------------- -.. autosummary:: - deer.q_networks.AC_net_keras.MyACNetwork - -Detailed description --------------------- - -.. autoclass:: deer.base_classes.QNetwork - :members: -.. autoclass:: deer.q_networks.q_net_theano.MyQNetwork - :members: - :show-inheritance: -.. autoclass:: deer.q_networks.AC_net_keras.MyACNetwork - :members: - :show-inheritance: -.. autoclass:: deer.q_networks.q_net_keras.MyQNetwork - :members: - :show-inheritance: diff --git a/docs/user/environments.rst b/docs/user/environments.rst index 2f1b00d7..a4567afa 100644 --- a/docs/user/environments.rst +++ b/docs/user/environments.rst @@ -16,7 +16,7 @@ You can find these examples at the |package_root|. For each example at least two The launcher file performs different actions: -* It instantiates the environment, the agent (along with a q-network). +* It instantiates the environment and the agent along with a learning algorithm (such as a q-network). 
* It binds controllers to the agent * it finally runs the experiment @@ -26,7 +26,6 @@ Examples are better than precepts and the best is to get started with the follow :maxdepth: 2 environments/toy_env_time_series.rst - environments/toy_env_pendulum.rst environments/gym.rst environments/two_storages.rst environments/PLE.rst diff --git a/docs/user/environments/toy_env_pendulum.rst b/docs/user/environments/toy_env_pendulum.rst deleted file mode 100644 index ba25dbd3..00000000 --- a/docs/user/environments/toy_env_pendulum.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. _toy_env_pendulum: - -:mod:`The pendulum on a cart` -============================= - - -Description -########### - -The environment simulates the behavior of an inverted pendulum. The theoretical system with its equations are as described in |barto-sutton-anderson|: - -.. |barto-sutton-anderson| raw:: html - - Barto et al. (1983) - - -* A cart of mass :math:`M` that can move horizontally; -* A pole of mass :math:`m` and length :math:`l` attached to the cart, with :math:`\theta` in :math:`[0, -\pi]` for the lefthand plane, and :math:`[0, \pi]` for the righthand side. We are supposing that the cart is moving on a rail and the pole can go under it. - -.. image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Cart-pendulum.svg/2000px-Cart-pendulum.svg.png - :width: 200 px - :alt: Inverted Pendulum - :align: center - - -The goal of the agent is to balance the pole above its supporting cart (:math:`\theta=0`), by displacing the cart left or right - thus, 2 actions are possible. To do so, the environment communicates to the agent: - -* A vector (position, speed, angle, angular speed); -* The reward associated to the action chosen by the agent. - -Results -######## - -In a terminal windown go to the folder ``examples/pendulum``. The example can then be run with - -.. code-block:: bash - - python run_pendulum.py - -Here are the outputs of the agent after respectively 20 and 70 learning epochs, with 1000 steps in each. We clearly see the final success of the agent in controlling the inverted pendulum. - -Note: a MP4 is generated every `PERIOD_BTW_SUMMARY_PERFS` epochs and you need the [FFmpeg](https://www.ffmpeg.org/) library to do so. If you do not want to install this library or to generate the videos, just set `PERIOD_BTW_SUMMARY_PERFS = -1`. - -.. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/output2.gif - :width: 500 px - :align: center - -.. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/output7.gif - :width: 500 px - :align: center - -Details on the implementation -############################## - -The main focus in the environment is to implement `act(self, action)` which specifies how the cart-pole system behaves in response to an input action. So first, we transcript the physical laws that rule the motion of the pole and the cart. The simulation timestep of the agent is :math:`\Delta_t=0.02` second. But we discretize this value even further in `act(self, action)`, in order to obtain dynamics that are closer to the exact differential equations. -Secondly, we chose the reward function as the sum of : - -* :math:`- |\theta|` such that the agent receives 0 when the pole is standing up, and a negative reward proportional to the angle otherwise. -* :math:`- \frac{|x|}{2}` such that the agent receives a negative reward when it is far from :math:`x=0`. 
\ No newline at end of file diff --git a/docs/user/tutorial.rst b/docs/user/tutorial.rst index a36d02dd..4cfe5b6b 100644 --- a/docs/user/tutorial.rst +++ b/docs/user/tutorial.rst @@ -46,9 +46,9 @@ How can I get started? First, make sure you have installed the package properly by following the steps described in :ref:`installation`. -The general idea of this framework is that you need to instantiate an agent (along with a q-network) and an environment. In order to perform an experiment, you also need to attach to the agent some controllers for controlling the training and the various parameters of your agent. +The general idea of this framework is that you need to instantiate an agent (along with a learning algorithm) and an environment. In order to perform an experiment, you also need to attach to the agent some controllers for controlling the training and the various parameters of your agent. -The environment should be built specifically for any specific task while q-networks, the DQN agent and many controllers are provided within this package. +The environment should be built specifically for any particular task, while learning algorithms (such as q-networks) and many controllers are provided within this package. The best way to get started is to have a look at the :ref:`examples` and in particular the first two environments, which are simple to understand: diff --git a/examples/gym/mountain_car_env.py b/examples/gym/mountain_car_env.py index b46eef20..06be69e5 100644 --- a/examples/gym/mountain_car_env.py +++ b/examples/gym/mountain_car_env.py @@ -17,7 +17,8 @@ def __init__(self, rng): self._last_observation = self.env.reset() self.is_terminal=False self._input_dim = [(1,), (1,)] # self.env.observation_space.shape is equal to 2 - # and we use only the current value in the belief state + # and we use only the current observation in the pseudo-state + def act(self, action): """ Simulate one time step in the environment. """ diff --git a/examples/gym/pendulum_env.py b/examples/gym/pendulum_env.py index d552eb55..4aa63147 100644 --- a/examples/gym/pendulum_env.py +++ b/examples/gym/pendulum_env.py @@ -16,7 +16,7 @@ def __init__(self, rng): self._last_observation = self.env.reset() self.is_terminal=False self._input_dim = [(1,), (1,), (1,), (1,)] # self.env.observation_space.shape is equal to 4 - # and we use only the current value in the belief state + # and we use only the current observations in the pseudo-state def act(self, action): """ Simulate one time step in the environment.
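To make the launcher pattern described above in ``docs/user/tutorial.rst`` concrete, here is a minimal sketch of such a script. It is loosely modeled on ``examples/toy_env/run_toy_env_simple.py``; the module path ``deer.learning_algo`` is the one in use at this point of the series (it is renamed to ``deer.learning_algos`` in a later patch), and the exact set of controllers and the reliance on default constructor arguments are assumptions for illustration rather than something fixed by these patches.

.. code-block:: python

    import numpy as np

    from deer.agent import NeuralAgent
    from deer.learning_algo.q_net_keras import MyQNetwork
    from Toy_env import MyEnv as Toy_env
    import deer.experiment.base_controllers as bc

    rng = np.random.RandomState(123456)

    # Instantiate the environment and the learning algorithm (here a Q-network).
    env = Toy_env(rng)
    qnetwork = MyQNetwork(environment=env, random_state=rng)

    # Instantiate the agent and bind a few controllers to it.
    agent = NeuralAgent(env, qnetwork, random_state=rng)
    agent.attach(bc.VerboseController())   # prints training progress (learning rate, discount factor, epsilon)
    agent.attach(bc.TrainerController())   # triggers the training of the learning algorithm
    agent.attach(bc.EpsilonController())   # anneals the exploration rate of the training policy

    # Run the experiment: 10 epochs of 1000 steps each.
    agent.run(10, 1000)

A full launcher such as ``run_toy_env.py`` additionally attaches ``LearningRateController``, ``DiscountFactorController``, ``InterleavedTestEpochController`` and ``FindBestController`` to schedule the hyper-parameters and to keep track of the best network found so far.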
diff --git a/examples/toy_env/Toy_env.py b/examples/toy_env/Toy_env.py index ddcd59e6..9b33935d 100644 --- a/examples/toy_env/Toy_env.py +++ b/examples/toy_env/Toy_env.py @@ -14,8 +14,6 @@ from mpl_toolkits.axes_grid1 import host_subplot import mpl_toolkits.axisartist as AA import matplotlib.pyplot as plt -import theano -import copy from deer.base_classes import Environment From 9d44775a61fb992b0379f146b16d91db38bf6f30 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 14:41:01 -0400 Subject: [PATCH 63/96] additional fixes --- examples/ALE/ALE_env_gym.py | 122 ++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 examples/ALE/ALE_env_gym.py diff --git a/examples/ALE/ALE_env_gym.py b/examples/ALE/ALE_env_gym.py new file mode 100644 index 00000000..39b012b7 --- /dev/null +++ b/examples/ALE/ALE_env_gym.py @@ -0,0 +1,122 @@ +""" Interface with the ALE environment + +Authors: Vincent Francois-Lavet +""" +import numpy as np +np.set_printoptions(threshold=np.nan) +import cv2 +#from ale_python_interface import ALEInterface +import gym +from deer.base_classes import Environment + +import matplotlib +matplotlib.use('qt5agg') +from mpl_toolkits.axes_grid1 import host_subplot +import mpl_toolkits.axisartist as AA +import matplotlib.pyplot as plt +from PIL import Image + +import gym + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self, rng, **kwargs): + """ Initialize environment. + + Arguments: + rng - the numpy random number generator + """ + self.env = gym.make('SpaceInvaders-v4')#Breakout-v4')#BeamRider-v4')#Qbert-v4')#Seaquest-v4')#Freeway-v4') + self._random_state=rng + self.env.reset() + frame_skip=kwargs.get('frame_skip',1) + self._frame_skip = frame_skip if frame_skip >= 1 else 1 + + self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) + #plt.imshow(self._reduced_screen, cmap='gray') + #plt.show() + + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + + + + def reset(self, mode): + if mode == self._mode: + # already in the right mode + self._mode_episode_count += 1 + else: + # switching mode + self._mode = mode + self._mode_score = 0.0 + self._mode_episode_count = 0 + + self.env.reset() + for _ in range(self._random_state.randint(15)): + action = self.env.action_space.sample() + + # this executes the environment with an action, + # and returns the observation of the environment, + # the reward, if the env is over, and other info. 
+ observation, reward, self.terminal, info = self.env.step(action) + + self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) + self.state=np.zeros((84,84), dtype=np.uint8) #FIXME + + return [4 * [84 * [84 * [0]]]] + + + def act(self, action): + #print "action" + #print action + + self.state=np.zeros((84,84), dtype=np.uint8) + reward=0 + for t in range(4): + observation, r, self.terminal, info = self.env.step(action) + #print "observation, reward, self.terminal" + #print observation, reward, self.terminal + reward+=r + if self.inTerminalState(): + break + + self._screen=np.average(observation,axis=-1) # Gray levels + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 + #plt.imshow(self._screen, cmap='gray') + #plt.show() + self.state=self._reduced_screen + + self._mode_score += reward + return np.sign(reward) + + def summarizePerformance(self, test_data_set, learning_algo): + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) + + + def inputDimensions(self): + return [(4, 84, 84)] #FIXME + + def observationType(self, subject): + return np.float #np.uint8 + + def nActions(self): + print("self.env.action_space") + print(self.env.action_space) + return self.env.action_space.n + + def observe(self): + return [(np.array(self.state)-128.)/128.] + + def inTerminalState(self): + return self.terminal + + + +if __name__ == "__main__": + pass \ No newline at end of file From c38f1e97572f24696a28273414c3feaef7b806d6 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 14:50:11 -0400 Subject: [PATCH 64/96] improve doc and fixes --- README.rst | 12 ++---------- deer/base_classes/environment.py | 6 +++--- deer/experiment/base_controllers.py | 12 ++++++------ examples/ALE/run_ALE.py | 2 +- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/README.rst b/README.rst index 34a5daa6..e6ad2144 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ Dependencies This framework is tested to work under Python 2.7, and Python 3.5. It should also work with Python 3.3 and 3.4. -The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need theano >= 0.8 or tensorflow >= 0.9 along with the keras library. +The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need Keras>=2.1. For running the examples, Matplotlib >= 1.1.1 is required. For running the atari games environment, you need to install ALE >= 0.4. @@ -35,12 +35,4 @@ For running the atari games environment, you need to install ALE >= 0.4. Full Documentation ================== -The documentation is available at : http://deer.readthedocs.io/ - -Here are a few examples : - -.. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/seaquest.gif - :width: 200 px - -.. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/output7.gif - :width: 200 px +The documentation is available at : http://deer.readthedocs.io/ \ No newline at end of file diff --git a/deer/base_classes/environment.py b/deer/base_classes/environment.py index 26e2bcba..0c83d78e 100644 --- a/deer/base_classes/environment.py +++ b/deer/base_classes/environment.py @@ -80,13 +80,13 @@ def inTerminalState(self): that occurred was terminal).
As the majority of control tasks considered have no end (a continuous control should be operated), by default - this returns always False. But in the context of a video game for instance, terminal states can occurs and - these cases this method should be overriden. + this returns always False. But in the context of a video game for instance, terminal states can happen and in + these cases, this method should be overridden. Returns ------- isTerminal : bool - + Whether or not the current state is terminal """ return False diff --git a/deer/experiment/base_controllers.py b/deer/experiment/base_controllers.py index 32d11d8a..5ea5bf87 100644 --- a/deer/experiment/base_controllers.py +++ b/deer/experiment/base_controllers.py @@ -134,7 +134,7 @@ def onStart(self, agent): return self._epoch_count = 0 - agent._network.setLearningRate(self._init_lr) + agent._learning_algo.setLearningRate(self._init_lr) self._lr = self._init_lr * self._lr_decay def onEpochEnd(self, agent): @@ -143,7 +143,7 @@ def onEpochEnd(self, agent): self._epoch_count += 1 if self._periodicity <= 1 or self._epoch_count % self._periodicity == 0: - agent._network.setLearningRate(self._lr) + agent._learning_algo.setLearningRate(self._lr) self._lr *= self._lr_decay class EpsilonController(Controller): @@ -265,7 +265,7 @@ def onStart(self, agent): return self._epoch_count = 0 - agent._network.setDiscountFactor(self._init_df) + agent._learning_algo.setDiscountFactor(self._init_df) if (self._init_df < self._df_max): self._df = 1 - (1 - self._init_df) * self._df_growth else: @@ -278,7 +278,7 @@ def onEpochEnd(self, agent): self._epoch_count += 1 if self._periodicity <= 1 or self._epoch_count % self._periodicity == 0: if (self._df < self._df_max): - agent._network.setDiscountFactor(self._df) + agent._learning_algo.setDiscountFactor(self._df) self._df = 1 - (1 - self._df) * self._df_growth @@ -488,8 +488,8 @@ def onActionTaken(self, agent): def _print(self, agent): if self._periodicity <= 1 or self._count % self._periodicity == 0: print("{} {}:".format(self._string, self._count + 1)) - print("Learning rate: {}".format(agent._network.learningRate())) - print("Discount factor: {}".format(agent._network.discountFactor())) + print("Learning rate: {}".format(agent._learning_algo.learningRate())) + print("Discount factor: {}".format(agent._learning_algo.discountFactor())) print("Epsilon: {}".format(agent._train_policy.epsilon())) self._count += 1 diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index 27c4dbd5..5fb03a14 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -74,7 +74,7 @@ class Defaults: env = ALE_env(rng, frame_skip=parameters.frame_skip) # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( + qnetwork = CRAR( env, parameters.rms_decay, parameters.rms_epsilon, From f4f53f9e33e0f9756195304103f7006c93742a29 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 19 Jul 2018 18:35:59 -0400 Subject: [PATCH 65/96] improve docs and folder learning_algo becomes learning_algos --- deer/{learning_algo => learning_algos}/AC_net_keras.py | 4 ++-- deer/{learning_algo => learning_algos}/CRAR_keras.py | 7 ++++--- deer/{learning_algo => learning_algos}/NN_CRAR_keras.py | 0 deer/{learning_algo => learning_algos}/NN_keras.py | 0 deer/{learning_algo => learning_algos}/NN_keras_LSTM.py | 0 deer/{learning_algo => learning_algos}/__init__.py | 0 deer/{learning_algo => learning_algos}/q_net_keras.py | 2 +- docs/conf.py | 2 +- docs/index.rst | 6 +++++- examples/ALE/run_ALE.py | 2 +- 
examples/MG_two_storages/run_MG_two_storages.py | 2 +- examples/gym/run_mountain_car.py | 2 +- examples/gym/run_mountain_car_continuous.py | 2 +- examples/gym/run_pendulum.py | 2 +- examples/test_CRAR/run_catcher.py | 2 +- examples/test_CRAR/run_simple_maze.py | 2 +- examples/toy_env/run_toy_env.py | 2 +- examples/toy_env/run_toy_env_simple.py | 2 +- 18 files changed, 22 insertions(+), 17 deletions(-) rename deer/{learning_algo => learning_algos}/AC_net_keras.py (98%) rename deer/{learning_algo => learning_algos}/CRAR_keras.py (99%) rename deer/{learning_algo => learning_algos}/NN_CRAR_keras.py (100%) rename deer/{learning_algo => learning_algos}/NN_keras.py (100%) rename deer/{learning_algo => learning_algos}/NN_keras_LSTM.py (100%) rename deer/{learning_algo => learning_algos}/__init__.py (100%) rename deer/{learning_algo => learning_algos}/q_net_keras.py (99%) diff --git a/deer/learning_algo/AC_net_keras.py b/deer/learning_algos/AC_net_keras.py similarity index 98% rename from deer/learning_algo/AC_net_keras.py rename to deer/learning_algos/AC_net_keras.py index b3f5433c..705874ab 100644 --- a/deer/learning_algo/AC_net_keras.py +++ b/deer/learning_algos/AC_net_keras.py @@ -46,9 +46,9 @@ class MyACNetwork(ACNetwork): Activate or not the double_Q learning. More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. neural_network_critic : object, optional - default is deer.qnetworks.NN_keras + default is deer.learning_algos.NN_keras neural_network_actor : object, optional - default is deer.qnetworks.NN_keras + default is deer.learning_algos.NN_keras """ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network_critic=NN, neural_network_actor=NN): diff --git a/deer/learning_algo/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py similarity index 99% rename from deer/learning_algo/CRAR_keras.py rename to deer/learning_algos/CRAR_keras.py index 7b6d4da9..f351415f 100644 --- a/deer/learning_algo/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -31,8 +31,7 @@ def loss_diff_s_s_(y_true, y_pred): return K.square( 1. - K.sqrt( K.clip( K.sum(y_pred,axis=-1,keepdims=True), 0.000001 , 1. ) ) ) # tend to increase y_pred --> loss -1 class CRAR(LearningAlgo): - """ - Combined Reinforcement learning via Abstract Representations (CRAR) using Keras + """ Combined Reinforcement learning via Abstract Representations (CRAR) using Keras Parameters ----------- @@ -56,7 +55,7 @@ class CRAR(LearningAlgo): Activate or not the double_Q learning. More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. 
neural_network : object, optional - default is deer.qnetworks.NN_keras + default is deer.learning_algos.NN_keras """ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): @@ -147,6 +146,8 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._resetQHat() def getAllParams(self): + """ Provides all parameters used by the learning algorithm + """ params_value=[] for i,p in enumerate(self.params): params_value.append(K.get_value(p)) diff --git a/deer/learning_algo/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py similarity index 100% rename from deer/learning_algo/NN_CRAR_keras.py rename to deer/learning_algos/NN_CRAR_keras.py diff --git a/deer/learning_algo/NN_keras.py b/deer/learning_algos/NN_keras.py similarity index 100% rename from deer/learning_algo/NN_keras.py rename to deer/learning_algos/NN_keras.py diff --git a/deer/learning_algo/NN_keras_LSTM.py b/deer/learning_algos/NN_keras_LSTM.py similarity index 100% rename from deer/learning_algo/NN_keras_LSTM.py rename to deer/learning_algos/NN_keras_LSTM.py diff --git a/deer/learning_algo/__init__.py b/deer/learning_algos/__init__.py similarity index 100% rename from deer/learning_algo/__init__.py rename to deer/learning_algos/__init__.py diff --git a/deer/learning_algo/q_net_keras.py b/deer/learning_algos/q_net_keras.py similarity index 99% rename from deer/learning_algo/q_net_keras.py rename to deer/learning_algos/q_net_keras.py index ddd41113..9c49d4c7 100644 --- a/deer/learning_algo/q_net_keras.py +++ b/deer/learning_algos/q_net_keras.py @@ -36,7 +36,7 @@ class MyQNetwork(QNetwork): Activate or not the double_Q learning. More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. neural_network : object, optional - default is deer.qnetworks.NN_keras + default is deer.learning_algos.NN_keras """ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): diff --git a/docs/conf.py b/docs/conf.py index 7bd606a0..1b908e93 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -69,7 +69,7 @@ 'sphinx.ext.mathjax', # 'sphinx.ext.viewcode', # create HTML file of source code and link to it # 'sphinx.ext.linkcode', # link to github, see linkcode_resolve() below - 'numpydoc', +## 'numpydoc', # !Generates unwanted tables with autoclass! # 'sphinx.ext.napoleon', # alternative to numpydoc -- looks a bit worse. ] diff --git a/docs/index.rst b/docs/index.rst index 46e6f8d0..9a12724c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,6 +19,11 @@ It is a work in progress and input is welcome. Please submit any contribution vi What is new ------------ +Version 0.4 +************ +- Integration of CRAR that allows to combine the model-free and the model-based approaches via abstract representations (see :ref:`CRAR`) +- Augmented documentation and some interfaces have been updated. + Version 0.3 ************ - Integration of different exploration/exploitation policies and possibility to easily built your own (see :ref:`policies`) @@ -26,7 +31,6 @@ Version 0.3 - :ref:`naming_conv` and some interfaces have been updated. This may cause broken backward compatibility. 
In that case, make the changes to the new convention by looking at the API in this documentation or by looking at the current version of the examples. - Additional automated tests - Version 0.2 *********** - Standalone python package (you can simply do ``pip install deer``) diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index 5fb03a14..b5c06e5b 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.CRAR_keras import CRAR +from deer.learning_algos.CRAR_keras import CRAR from ALE_env_gym import MyEnv as ALE_env import deer.experiment.base_controllers as bc diff --git a/examples/MG_two_storages/run_MG_two_storages.py b/examples/MG_two_storages/run_MG_two_storages.py index 36aec0ef..11bc1d44 100644 --- a/examples/MG_two_storages/run_MG_two_storages.py +++ b/examples/MG_two_storages/run_MG_two_storages.py @@ -15,7 +15,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.q_net_keras import MyQNetwork +from deer.learning_algos.q_net_keras import MyQNetwork from MG_two_storages_env import MyEnv as MG_two_storages_env import deer.experiment.base_controllers as bc diff --git a/examples/gym/run_mountain_car.py b/examples/gym/run_mountain_car.py index 3d95f846..6f338a06 100644 --- a/examples/gym/run_mountain_car.py +++ b/examples/gym/run_mountain_car.py @@ -10,7 +10,7 @@ import deer.experiment.base_controllers as bc from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.q_net_keras import MyQNetwork +from deer.learning_algos.q_net_keras import MyQNetwork from mountain_car_env import MyEnv as mountain_car_env class Defaults: diff --git a/examples/gym/run_mountain_car_continuous.py b/examples/gym/run_mountain_car_continuous.py index 344c819b..67220f6e 100644 --- a/examples/gym/run_mountain_car_continuous.py +++ b/examples/gym/run_mountain_car_continuous.py @@ -10,7 +10,7 @@ import deer.experiment.base_controllers as bc from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.AC_net_keras import MyACNetwork +from deer.learning_algos.AC_net_keras import MyACNetwork from mountain_car_continuous_env import MyEnv as mountain_car_continuous_env from deer.policies import LongerExplorationPolicy diff --git a/examples/gym/run_pendulum.py b/examples/gym/run_pendulum.py index 36372d1a..6c8ae2a2 100644 --- a/examples/gym/run_pendulum.py +++ b/examples/gym/run_pendulum.py @@ -11,7 +11,7 @@ import deer.experiment.base_controllers as bc from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.q_net_keras import MyQNetwork +from deer.learning_algos.q_net_keras import MyQNetwork from pendulum_env import MyEnv as pendulum_env class Defaults: diff --git a/examples/test_CRAR/run_catcher.py b/examples/test_CRAR/run_catcher.py index cc921e41..0320a56c 100644 --- a/examples/test_CRAR/run_catcher.py +++ b/examples/test_CRAR/run_catcher.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.CRAR_keras import CRAR +from deer.learning_algos.CRAR_keras import CRAR from catcher_env import MyEnv as catcher_env import deer.experiment.base_controllers as bc diff --git a/examples/test_CRAR/run_simple_maze.py b/examples/test_CRAR/run_simple_maze.py index bbc9fc22..2a7eaa5a 100644 --- 
a/examples/test_CRAR/run_simple_maze.py +++ b/examples/test_CRAR/run_simple_maze.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.CRAR_keras import CRAR +from deer.learning_algos.CRAR_keras import CRAR from simple_maze_env import MyEnv as simple_maze_env import deer.experiment.base_controllers as bc diff --git a/examples/toy_env/run_toy_env.py b/examples/toy_env/run_toy_env.py index cadaf9f2..739d3697 100644 --- a/examples/toy_env/run_toy_env.py +++ b/examples/toy_env/run_toy_env.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algo.q_net_keras import MyQNetwork +from deer.learning_algos.q_net_keras import MyQNetwork from Toy_env import MyEnv as Toy_env import deer.experiment.base_controllers as bc from deer.policies import EpsilonGreedyPolicy diff --git a/examples/toy_env/run_toy_env_simple.py b/examples/toy_env/run_toy_env_simple.py index 14bf68b3..89e32ac7 100644 --- a/examples/toy_env/run_toy_env_simple.py +++ b/examples/toy_env/run_toy_env_simple.py @@ -5,7 +5,7 @@ import numpy as np from deer.agent import NeuralAgent -from deer.learning_algo.q_net_keras import MyQNetwork +from deer.learning_algos.q_net_keras import MyQNetwork from Toy_env import MyEnv as Toy_env import deer.experiment.base_controllers as bc From 5e57c1434c87cd34b21a1ed8a690551e434cd76a Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 23 Jul 2018 15:41:24 -0400 Subject: [PATCH 66/96] towards 0.4 in the docs --- docs/conf.py | 4 ++-- docs/index.rst | 4 ++-- docs/modules/agents.rst | 3 --- docs/modules/controllers.rst | 3 --- docs/modules/environments.rst | 3 --- docs/modules/policies.rst | 3 --- 6 files changed, 4 insertions(+), 16 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 1b908e93..775dfbf0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -95,9 +95,9 @@ # built documents. # # The short X.Y version. -version = '0.3.2' +version = '0.4' # The full version, including alpha/beta/rc tags. -release = '0.3.2' +release = '0.4' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 9a12724c..fbf05760 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,12 +21,12 @@ What is new ------------ Version 0.4 ************ -- Integration of CRAR that allows to combine the model-free and the model-based approaches via abstract representations (see :ref:`CRAR`) +- Integration of CRAR, which allows combining the model-free and the model-based approaches via abstract representations. - Augmented documentation and some interfaces have been updated. Version 0.3 ************ -- Integration of different exploration/exploitation policies and possibility to easily built your own (see :ref:`policies`) +- Integration of different exploration/exploitation policies and the possibility to easily build your own. - Integration of DDPG for continuous action spaces (see :ref:`actor-critic`) - :ref:`naming_conv` and some interfaces have been updated. This may cause broken backward compatibility. In that case, make the changes to the new convention by looking at the API in this documentation or by looking at the current version of the examples.
- Additional automated tests diff --git a/docs/modules/agents.rst b/docs/modules/agents.rst index 25644faa..756067ac 100644 --- a/docs/modules/agents.rst +++ b/docs/modules/agents.rst @@ -10,9 +10,6 @@ NeuralAgent DataSet -Detailed description --------------------- - .. autoclass:: NeuralAgent :members: .. autoclass:: DataSet diff --git a/docs/modules/controllers.rst b/docs/modules/controllers.rst index 2c119a9b..703f1ca9 100644 --- a/docs/modules/controllers.rst +++ b/docs/modules/controllers.rst @@ -16,9 +16,6 @@ InterleavedTestEpochController FindBestController -Detailed description --------------------- - .. autoclass:: Controller :members: .. autoclass:: LearningRateController diff --git a/docs/modules/environments.rst b/docs/modules/environments.rst index 50613eb3..38f3d275 100644 --- a/docs/modules/environments.rst +++ b/docs/modules/environments.rst @@ -5,9 +5,6 @@ .. automodule:: deer.base_classes.Environment -Detailed description --------------------- - .. autoclass:: deer.base_classes.Environment :members: diff --git a/docs/modules/policies.rst b/docs/modules/policies.rst index e83fdaa6..c36d047b 100644 --- a/docs/modules/policies.rst +++ b/docs/modules/policies.rst @@ -9,9 +9,6 @@ deer.policies.EpsilonGreedyPolicy deer.policies.LongerExplorationPolicy -Detailed description --------------------- - .. autoclass:: deer.base_classes.Policy :members: .. autoclass:: deer.policies.EpsilonGreedyPolicy From e0f3c8a020f58d3d6278c1dbb6c5d6902d7c6dd4 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 23 Jul 2018 15:58:48 -0400 Subject: [PATCH 67/96] clean simple_maze_env --- examples/test_CRAR/simple_maze_env.py | 122 ++++---------------------- 1 file changed, 16 insertions(+), 106 deletions(-) diff --git a/examples/test_CRAR/simple_maze_env.py b/examples/test_CRAR/simple_maze_env.py index fd027a8a..040480f1 100644 --- a/examples/test_CRAR/simple_maze_env.py +++ b/examples/test_CRAR/simple_maze_env.py @@ -36,27 +36,14 @@ def create_map(self): self._map[:,-1]=1 self._map[:,self._size_maze//2]=1 self._map[self._size_maze//2,self._size_maze//2]=0 - #self._map[:,self._size_maze//3]=1 - #self._map[-2,self._size_maze//3]=0 - #self._map[:,2*self._size_maze//3]=1 - #self._map[2,2*self._size_maze//3]=0 self._pos_agent=[2,2] self._pos_goal=[self._size_maze-2,self._size_maze-2] - #self._map[3,6]=0.66 def reset(self, mode): self.create_map() self._map[self._size_maze//2,self._size_maze//2]=0 - #if mode == -1: - # i=np.random.randint(2) - # if(i==0): - # self._map[self._size_maze//2-1,self._size_maze//2]=0 - # if(i==1): - # self._map[self._size_maze//2+1,self._size_maze//2]=0 - #else: - # self._map[self._size_maze//2+1,self._size_maze//2]=0 if mode == MyEnv.VALIDATION_MODE: if self._mode != MyEnv.VALIDATION_MODE: @@ -69,10 +56,8 @@ def reset(self, mode): elif self._mode != -1: self._mode = -1 - #if self._mode == -1: + # Setting the starting position of the agent self._pos_agent=[self._size_maze//2,self._size_maze//2] - #else: - # self._pos_agent=[1,1] print "reset mode" print mode @@ -98,19 +83,16 @@ def act(self, action): if(self._map[self._pos_agent[0],self._pos_agent[1]+1]==0): self._pos_agent[1]=self._pos_agent[1]+1 + # There is no reward in this simple environment self.reward = 0 - #if (self._pos_agent==self._pos_goal): - # self.reward = 1 - #if (self._pos_agent[1]>=self._size_maze-2 and action==3): - # self.reward = 1 # used to delete those transitions self._mode_score += self.reward return self.reward def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): - #print 
"test_data_set.observations.shape" - #print test_data_set.observations()[0][0:1] - + """ Plot of the low-dimensional representation of the environment built by the model + """ + for i in range(1): all_possib_inp=[] self.create_map() @@ -127,8 +109,6 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): all_possib_inp=np.expand_dims(np.array(all_possib_inp,dtype='float'),axis=1) - #print "all_possib_inp[0:2]" - #print all_possib_inp[0:2] print "all_possib_inp.shape" print all_possib_inp.shape print all_possib_inp.dtype @@ -141,15 +121,11 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): print "learning_algo.encoder.predict(all_possib_inp)[0:2]" print all_possib_abs_states[0:2] - #print "print test_data_set.observations()[0:2]" - #print test_data_set.observations()[0][0:2] n=500 historics=[] for i,observ in enumerate(test_data_set.observations()[0][0:n]): historics.append(np.expand_dims(observ,axis=0)) historics=np.array(historics) - #print "historics[0:2]" - #print historics[0:2] print "historics.shape" print historics.shape print historics.dtype @@ -187,11 +163,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): y = np.array(abs_states)[:,1] if(self.intern_dim>2): z = np.array(abs_states)[:,2] - - #Colors - #onehot_actions = np.zeros((n, 4)) - #onehot_actions[np.arange(n), actions] = 1 - + fig = plt.figure() if(self.intern_dim==2): ax = fig.add_subplot(111) @@ -202,13 +174,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): ax.set_xlabel(r'$X_1$') ax.set_ylabel(r'$X_2$') ax.set_zlabel(r'$X_3$') - - #for j in range(3): - # # Plot the trajectory - # for i in xrange(n-1): - # #ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) - # ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) - + # Plot the estimated transitions for i in range(n-1): predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) @@ -224,32 +190,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.65", alpha=0.75) ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.4", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.15", alpha=0.75) - -# for xx in np.arange(self._size_maze)-self._size_maze//2: -# for yy in np.arange(self._size_maze)-self._size_maze//2: -# for zz in np.arange(self._size_maze)-self._size_maze//2: -# predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) -# predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) -# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), 
np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) - - - ## Plot the colorbar for the trajectory - #fig.subplots_adjust(right=0.7) - #ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) - ## Set the colormap and norm to correspond to the data for which the colorbar will be used. - #cmap = matplotlib.cm.cool - #norm = matplotlib.colors.Normalize(vmin=0, vmax=1) - # - ## ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has - ## everything needed for a standalone colorbar. There are many more kwargs, but the - ## following gives a basic continuous colorbar with ticks and labels. - #cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, - # norm=norm, - # orientation='vertical') - #cb1.set_label('Beginning to end of trajectory') - + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.15", alpha=0.75) # Plot the dots at each time step depending on the action taken length_block=[[0,18],[18,19],[19,31]] @@ -258,45 +199,15 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) else: line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) - #line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75, depthshade=True) - #line2 = ax.scatter(x, y, c=np.tile(np.expand_dims(1-actions/4.,axis=1),(1,3))-0.125, s=50, marker='o', edgecolors='k', alpha=0.75) + if(self.intern_dim==2): axes_lims=[ax.get_xlim(),ax.get_ylim()] else: axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] - #zrange=axes_lims[2][1]-axes_lims[2][0] - # Plot the legend for the dots from matplotlib.patches import Circle, Rectangle - from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker -# box1 = TextArea(" State representation (action 0, action 1): ", textprops=dict(color="k")) -# -# box2 = DrawingArea(80, 20, 0, 0) -# el1 = Circle((10, 10), 5, fc="0.9", edgecolor="k", alpha=0.75) -# el2 = Circle((25, 10), 5, fc="0.65", edgecolor="k", alpha=0.75) -# el3 = Circle((40, 10), 5, fc="0.4", edgecolor="k", alpha=0.75) -# el4 = Circle((55, 10), 5, fc="0.15", edgecolor="k", alpha=0.75) -# #el3 = Circle((50, 10), 5, fc="0", edgecolor="k") -# box2.add_artist(el1) -# box2.add_artist(el2) -# box2.add_artist(el3) -# box2.add_artist(el4) -# -# -# box = HPacker(children=[box1, box2], -# align="center", -# pad=0, sep=5) -# -# anchored_box = AnchoredOffsetbox(loc=3, -# child=box, pad=0., -# frameon=True, -# bbox_to_anchor=(0., 1.07), -# bbox_transform=ax.transAxes, -# borderpad=0., -# ) -# ax.add_artist(anchored_box) - + from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker # Plot the legend for transition estimates box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) @@ -324,10 +235,6 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): ax.add_artist(anchored_box) - - #ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - #ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 
0.99)) - #ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) #plt.show() plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') @@ -478,7 +385,6 @@ def get_higher_dim_obs(self,indices_agent,indices_reward): #reward_obs[5,2:4]=0.8 for i in indices_reward: - #print self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6] obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs print indices_agent @@ -492,10 +398,14 @@ def get_higher_dim_obs(self,indices_agent,indices_reward): def inTerminalState(self): -# if((self._pos_agent[0]<=1 and self._cur_action==0) ):#((self._pos_agent==[4,1] and self._cur_action==1) or (self._pos_agent==[5,2] and (self._cur_action==1 or self._cur_action==2)) or (self._pos_agent==[6,3] and self._cur_action==2))): -# #(self._pos_agent[1]>=self._size_maze-2 and self._cur_action==1) ): + # Uncomment the following lines to add some cases where the episode terminates. + # This is used to show how the environment representation interprets cases where + # part of the environment could not be explored. +# if((self._pos_agent[0]<=1 and self._cur_action==0) ): # return True return False + + # If there is a goal, then terminate the episode when the goal is reached. #if (self._pos_agent==self._pos_goal): # return True #else: From 3bcd2b1d9933cfdf907087e7e2ce47b079d05be0 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 24 Jul 2018 09:44:59 -0400 Subject: [PATCH 68/96] clean CRAR_keras and NN_CRAR_keras --- deer/learning_algos/CRAR_keras.py | 51 +--------------------------- deer/learning_algos/NN_CRAR_keras.py | 29 ---------------- 2 files changed, 1 insertion(+), 79 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index f351415f..bc8ee546 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -95,7 +95,6 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.Q = self.learn_and_plan.Q_model() self.gamma = self.learn_and_plan.R_model() self.transition = self.learn_and_plan.transition_model() -# self.transition2 = self.learn_and_plan.transition_model2() self.full_Q=self.learn_and_plan.full_Q_model(self.encoder,self.Q,0,self._df) @@ -363,52 +362,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals return np.sqrt(loss),loss_ind
-## for i in range (100): -## loss+=self.transition2.train_on_batch([Es,onehot_actions], Es_) -## if(i%10==0): -## print "loss/(i+1)" -## print loss/(i+1) -## print "loss/100." -## print loss/100. -# #print K.get_value(self.transition2.optimizer.lr) -# #print [ K.get_value(param) -# # for layer in self.encoder.layers -# # for param in layer.trainable_weights ][0][0] -# return self.loss_T2 - - - def qValues(self, state_val): """ Get the q values for one belief state (without planning) @@ -630,8 +583,7 @@ def _compile(self): self.full_Q.compile(optimizer=optimizer, loss='mse') optimizer1=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # Different optimizers for each network; otherwise not possible to modify each - optimizer2=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately (e.g. lr) - optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) + optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately (e.g. lr) optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) @@ -639,7 +591,6 @@ def _compile(self): optimizer8=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) self.diff_Tx_x_.compile(optimizer=optimizer1, loss='mse') # Fit transitions - #self.transition2.compile(optimizer=optimizer2, loss='mse') # Fit accurate transitions without encoders self.full_R.compile(optimizer=optimizer3, loss='mse') # Fit rewards self.full_gamma.compile(optimizer=optimizer3, loss='mse') # Fit discount diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index 4ceffd40..24914fbc 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -243,35 +243,6 @@ def transition_model(self): return model - def transition_model2(self): - """ - - Parameters - ----------- - x - a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] #x - - x = Concatenate()(inputs)#,axis=-1) - x = Dense(10, activation='tanh')(x) - x = BatchNormalization()(x) - x = Dense(50, activation='tanh')(x) - x = BatchNormalization()(x) - x = Dense(10, activation='tanh')(x) - x = BatchNormalization()(x) - x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - x = Add()([inputs[0],x]) - - model = Model(inputs=inputs, outputs=x) - - return model - def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): """ Used to fit the transitions From 19f390551c038de366a1c21bd4b45c31f49e2dfc Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 24 Jul 2018 12:29:21 -0400 Subject: [PATCH 69/96] modify shape actions in train and Dataset.randomBatch, improve doc method train in learning_algo, clean CRAR_keras, fix summarize_performance in environment and a few other doc improvements --- deer/agent.py | 121 +++++++++++++-------------- deer/base_classes/environment.py | 16 ++-- deer/base_classes/learning_algo.py | 3 +- deer/learning_algos/AC_net_keras.py | 19 +++-- deer/learning_algos/CRAR_keras.py | 32 +++---- deer/learning_algos/NN_CRAR_keras.py | 33 +------- deer/learning_algos/q_net_keras.py | 21 +++-- 7 files changed, 113 insertions(+), 132 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index 6f7667d7..c5317e9f 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -486,42 +486,39 @@ def updatePriorities(self, priorities, rndValidIndices): for i in range( len(rndValidIndices) ): 
self._prioritiy_tree.update(rndValidIndices[i], priorities[i]) - def randomBatch(self, size, use_priority): - """Return corresponding states, actions, rewards, terminal status, and next_states for size randomly + def randomBatch(self, batch_size, use_priority): + """Returns a batch of states, actions, rewards, terminal status, and next_states for a number batch_size of randomly chosen transitions. Note that if terminal[i] == True, then next_states[s][i] == np.zeros_like(states[s][i]) for - each subject s. + each s. Parameters ----------- - size : int + batch_size : int Number of transitions to return. use_priority : Boolean Whether to use prioritized replay or not Returns ------- - states : ndarray - An ndarray(size=number_of_subjects, dtype='object), where states[s] is a 2+D matrix of dimensions - size x s.memorySize x "shape of a given observation for this subject". States were taken randomly in - the data with the only constraint that they are complete regarding the histories for each observed - subject. - actions : ndarray - An ndarray(size=number_of_subjects, dtype='int32') where actions[i] is the action taken after - having observed states[:][i]. - rewards : ndarray - An ndarray(size=number_of_subjects, dtype='float32') where rewards[i] is the reward obtained for - taking actions[i-1]. - next_states : ndarray - Same structure than states, but next_states[s][i] is guaranteed to be the information - concerning the state following the one described by states[s][i] for each subject s. - terminals : ndarray - An ndarray(size=number_of_subjects, dtype='bool') where terminals[i] is True if actions[i] lead - to terminal states and False otherwise + states : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + States are taken randomly in the data with the only constraint that they are complete regarding the history size + for each observation. + actions : numpy array of integers [batch_size] + actions[i] is the action taken after having observed states[:][i]. + rewards : numpy array of floats [batch_size] + rewards[i] is the reward obtained for taking actions[i-1]. + next_states : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + terminals : numpy array of booleans [batch_size] + terminals[i] is True if the transition leads to a terminal state and False otherwise Throws ------- SliceError - If a batch of this size could not be built based on current data set (not enough data or all + If a batch of this batch_size could not be built based on current data set (not enough data or all trajectories are too short). """ @@ -533,20 +530,20 @@ def randomBatch(self, size, use_priority): if (self._use_priority): #FIXME : take into account the case where self._only_full_history is false - rndValidIndices, rndValidIndices_tree = self._randomPrioritizedBatch(size) + rndValidIndices, rndValidIndices_tree = self._randomPrioritizedBatch(batch_size) if (rndValidIndices.size == 0): raise SliceError("Could not find a state with full histories") else: - rndValidIndices = np.zeros(size, dtype='int32') + rndValidIndices = np.zeros(batch_size, dtype='int32') if (self._only_full_history): - for i in range(size): # TODO: multithread this loop? + for i in range(batch_size): # TODO: multithread this loop? 
rndValidIndices[i] = self._randomValidStateIndex(self._max_history_size+self.sticky_action-1) else: - for i in range(size): # TODO: multithread this loop? + for i in range(batch_size): # TODO: multithread this loop? rndValidIndices[i] = self._randomValidStateIndex(minimum_without_terminal=self.sticky_action) - actions = np.vstack( self._actions.getSliceBySeq(rndValidIndices) ) + actions = self._actions.getSliceBySeq(rndValidIndices) rewards = self._rewards.getSliceBySeq(rndValidIndices) terminals = self._terminals.getSliceBySeq(rndValidIndices) @@ -564,9 +561,9 @@ def randomBatch(self, size, use_priority): first_terminals.append(first_terminal) for input in range(len(self._batch_dimensions)): - states[input] = np.zeros((size,) + self._batch_dimensions[input], dtype=self._observations[input].dtype) + states[input] = np.zeros((batch_size,) + self._batch_dimensions[input], dtype=self._observations[input].dtype) next_states[input] = np.zeros_like(states[input]) - for i in range(size): + for i in range(batch_size): slice=self._observations[input].getSlice(rndValidIndices[i]-self.sticky_action+2-min(self._batch_dimensions[input][0],first_terminals[i]+self.sticky_action-1), rndValidIndices[i]+1) if (len(slice)==len(states[input][i])): states[input][i] = slice @@ -590,15 +587,15 @@ def randomBatch(self, size, use_priority): else: return states, actions, rewards, next_states, terminals, rndValidIndices - def randomBatch_nstep(self, size, nstep, use_priority): - """Return corresponding states, actions, rewards, terminal status, and next_states for size randomly + def randomBatch_nstep(self, batch_size, nstep, use_priority): + """Return corresponding states, actions, rewards, terminal status, and next_states for a number batch_size of randomly chosen transitions. Note that if terminal[i] == True, then next_states[s][i] == np.zeros_like(states[s][i]) for - each subject s. + each s. Parameters ----------- - size : int - Batch size + batch_size : int + Number of transitions to return. nstep : int Number of transitions to be considered for each element use_priority : Boolean @@ -606,23 +603,20 @@ def randomBatch_nstep(self, size, nstep, use_priority): Returns ------- - states : ndarray - An ndarray(size=number_of_subjects, dtype='object), where states[s] is a 2+D matrix of dimensions - size x s.memorySize x "shape of a given observation for this subject". States were taken randomly in - the data with the only constraint that they are complete regarding the histories for each observed - subject. - actions : ndarray - An ndarray(size=number_of_subjects, dtype='int32') where actions[i] is the action taken after - having observed states[:][i]. - rewards : ndarray - An ndarray(size=number_of_subjects, dtype='float32') where rewards[i] is the reward obtained for - taking actions[i-1]. - next_states : ndarray - Same structure than states, but next_states[s][i] is guaranteed to be the information - concerning the state following the one described by states[s][i] for each subject s. - terminals : ndarray - An ndarray(size=number_of_subjects, dtype='bool') where terminals[i] is True if actions[i] lead - to terminal states and False otherwise + states : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * (history size+nstep-1) * size of punctual observation (which is 2D,1D or scalar)]). + States are taken randomly in the data with the only constraint that they are complete regarding the history size + for each observation. 
+ actions : numpy array of integers [batch_size, nstep] + actions[i] is the action taken after having observed states[:][i]. + rewards : numpy array of floats [batch_size, nstep] + rewards[i] is the reward obtained for taking actions[i-1]. + next_states : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * (history size+nstep-1) * size of punctual observation (which is 2D,1D or scalar)]). + terminals : numpy array of booleans [batch_size, nstep] + terminals[i] is True if the transition leads to a terminal state and False otherwise Throws ------- @@ -639,23 +633,23 @@ def randomBatch_nstep(self, size, nstep, use_priority): if (self._use_priority): #FIXME : take into account the case where self._only_full_history is false - rndValidIndices, rndValidIndices_tree = self._randomPrioritizedBatch(size) + rndValidIndices, rndValidIndices_tree = self._randomPrioritizedBatch(batch_size) if (rndValidIndices.size == 0): raise SliceError("Could not find a state with full histories") else: - rndValidIndices = np.zeros(size, dtype='int32') + rndValidIndices = np.zeros(batch_size, dtype='int32') if (self._only_full_history): - for i in range(size): # TODO: multithread this loop? + for i in range(batch_size): # TODO: multithread this loop? rndValidIndices[i] = self._randomValidStateIndex(self._max_history_size+self.sticky_action*nstep-1) else: - for i in range(size): # TODO: multithread this loop? + for i in range(batch_size): # TODO: multithread this loop? rndValidIndices[i] = self._randomValidStateIndex(minimum_without_terminal=self.sticky_action*nstep) - actions=np.zeros((size,(nstep)*self.sticky_action), dtype=int) - rewards=np.zeros((size,(nstep)*self.sticky_action)) - terminals=np.zeros((size,(nstep)*self.sticky_action)) - for i in range(size): + actions=np.zeros((batch_size,(nstep)*self.sticky_action), dtype=int) + rewards=np.zeros((batch_size,(nstep)*self.sticky_action)) + terminals=np.zeros((batch_size,(nstep)*self.sticky_action)) + for i in range(batch_size): actions[i] = self._actions.getSlice(rndValidIndices[i]-self.sticky_action*nstep+1,rndValidIndices[i]+self.sticky_action) rewards[i] = self._rewards.getSlice(rndValidIndices[i]-self.sticky_action*nstep+1,rndValidIndices[i]+self.sticky_action) terminals[i] = self._terminals.getSlice(rndValidIndices[i]-self.sticky_action*nstep+1,rndValidIndices[i]+self.sticky_action) @@ -675,8 +669,8 @@ def randomBatch_nstep(self, size, nstep, use_priority): batch_dimensions=copy.deepcopy(self._batch_dimensions) for input in range(len(self._batch_dimensions)): batch_dimensions[input]=tuple( x + y for x, y in zip(self._batch_dimensions[input],(self.sticky_action*(nstep+1)-1,0,0)) ) - observations[input] = np.zeros((size,) + batch_dimensions[input], dtype=self._observations[input].dtype) - for i in range(size): + observations[input] = np.zeros((batch_size,) + batch_dimensions[input], dtype=self._observations[input].dtype) + for i in range(batch_size): slice=self._observations[input].getSlice(rndValidIndices[i]-self.sticky_action*nstep+2-min(self._batch_dimensions[input][0],first_terminals[i]-self.sticky_action*nstep+1), rndValidIndices[i]+self.sticky_action+1) if (len(slice)==len(observations[input][i])): observations[input][i] = slice @@ -726,9 +720,8 @@ def _randomValidStateIndex(self, minimum_without_terminal): # else index was ok according to terminals return index - def _randomPrioritizedBatch(self, size): - indices_tree = self._prioritiy_tree.getBatch( - size, self._random_state, self) + 
def _randomPrioritizedBatch(self, batch_size): + indices_tree = self._prioritiy_tree.getBatch(batch_size, self._random_state, self) indices_replay_mem=np.zeros(indices_tree.size,dtype='int32') for i in range(len(indices_tree)): indices_replay_mem[i]= int(self._translation_array[indices_tree[i]] \ diff --git a/deer/base_classes/environment.py b/deer/base_classes/environment.py index 0c83d78e..6e802ec1 100644 --- a/deer/base_classes/environment.py +++ b/deer/base_classes/environment.py @@ -55,14 +55,14 @@ def act(self, action): def inputDimensions(self): """Gets the shape of the input space for this environment. - This returns a list whose length is the number of subjects observed on the environment. Each element of the - list is a tuple: the first integer is always the history size considered for this subject and the rest describes - the shape of a single observation on this subject: - - () or (1,) means each observation on this subject is a single number and the history size is 1 (= only current + This returns a list whose length is the number of observations in the environment. Each element of the + list is a tuple: the first integer is always the history size considered for this observation and the rest describes + the shape of the observation at a given time step: + - () or (1,) means each observation at a given time step is a single scalar and the history size is 1 (= only current observation) - - (N,) means each observation on this subject is a single number and the history size is N - - (N, M) means each observation on this subject is a vector of length M and the history size is N - - (N, M1, M2) means each observation on this subject is a matrix with M1 rows and M2 columns and the history + - (N,) means each observation at a given time step is a single scalar and the history size is N + - (N, M) means each observation at a given time step is a vector of length M and the history size is N + - (N, M1, M2) means each observation at a given time step is a 2D matrix with M1 rows and M2 columns and the history size is N """ @@ -104,7 +104,7 @@ def observe(self): raise NotImplementedError() - def summarizePerformance(self, test_data_set): + def summarizePerformance(self, test_data_set, *args, **kwargs): """Optional hook that can be used to show a summary of the performance of the agent on the environment in the current mode. diff --git a/deer/base_classes/learning_algo.py b/deer/base_classes/learning_algo.py index 80fecefe..f65fa0f2 100644 --- a/deer/base_classes/learning_algo.py +++ b/deer/base_classes/learning_algo.py @@ -24,7 +24,8 @@ def __init__(self, environment, batch_size): self._batch_size = batch_size def train(self, states, actions, rewards, nextStates, terminals): - """ This method performs the Bellman iteration for one batch of tuples. + """ This method performs the training step (e.g. using Bellman iteration in a deep Q-network) + for one batch of tuples. 
""" raise NotImplementedError() diff --git a/deer/learning_algos/AC_net_keras.py b/deer/learning_algos/AC_net_keras.py index 705874ab..41bf0882 100644 --- a/deer/learning_algos/AC_net_keras.py +++ b/deer/learning_algos/AC_net_keras.py @@ -127,11 +127,18 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Parameters ----------- - states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - actions_val : b x 1 numpy array of objects (lists of floats) - rewards_val : b x 1 numpy array - next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - terminals_val : b x 1 numpy boolean array (currently ignored) + states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + actions_val : numpy array of integers with size [self._batch_size] + actions[i] is the action taken after having observed states[:][i]. + rewards_val : numpy array of floats with size [self._batch_size] + rewards[i] is the reward obtained for taking actions[i-1]. + next_states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + terminals_val : numpy array of booleans with size [self._batch_size] + terminals[i] is True if the transition leads to a terminal state and False otherwise Returns @@ -206,7 +213,7 @@ def gradients(self, states, actions): return out - def chooseBestAction(self, state): + def chooseBestAction(self, state, *args, **kwargs): """ Get the best action for a pseudo-state Arguments diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index bc8ee546..c3e28d02 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -165,11 +165,18 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Parameters ----------- - states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - actions_val : b x 1 numpy array of integers - rewards_val : b x 1 numpy array - next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - terminals_val : b x 1 numpy boolean array + states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + actions_val : numpy array of integers with size [self._batch_size] + actions[i] is the action taken after having observed states[:][i]. + rewards_val : numpy array of floats with size [self._batch_size] + rewards[i] is the reward obtained for taking actions[i-1]. + next_states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 
+ terminals_val : numpy array of booleans with size [self._batch_size] + terminals[i] is True if the transition leads to a terminal state and False otherwise Returns ------- @@ -178,7 +185,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals """ onehot_actions = np.zeros((self._batch_size, self._n_actions)) - onehot_actions[np.arange(self._batch_size), actions_val[:,0]] = 1 + onehot_actions[np.arange(self._batch_size), actions_val] = 1 onehot_actions_rand = np.zeros((self._batch_size, self._n_actions)) onehot_actions_rand[np.arange(self._batch_size), np.random.randint(0,2,(32))] = 1 states_val=list(states_val) @@ -303,17 +310,14 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disambiguate1=0 self.loss_disambiguate2=0 - print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) - print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) + #print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" + #print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) + #print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((32,self.learn_and_plan.internal_dim))) print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((32,self.learn_and_plan.internal_dim))) - print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) - if self.update_counter % self._freeze_interval == 0: self._resetQHat() @@ -335,11 +339,11 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals q_vals=self.full_Q.predict([states_val[0]]) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff - q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) + q_val=q_vals[np.arange(self._batch_size), actions_val]#.reshape((-1, 1)) diff = - q_val + target loss_ind=pow(diff,2) - q_vals[ np.arange(self._batch_size), actions_val.reshape((-1,)) ] = target + q_vals[ np.arange(self._batch_size), actions_val ] = target # Is it possible to use something more flexible than this? # Only some elements of next_q_vals are actual value that I target. 
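Note: the docstrings above spell out the batch layout that train() expects, but the shapes are easier to see in a concrete example. The following minimal NumPy sketch builds one batch in that layout and reproduces the one-hot encoding now applied to the 1-D actions array; the batch size, number of actions, history size and the 8x8 frame are arbitrary placeholder values, not anything fixed by the library.

import numpy as np

batch_size, n_actions, history_size = 32, 4, 1
rng = np.random.RandomState(0)

# One observation "subject" with history size 1 and an 8x8 frame per time step;
# states_val is a list with one such array per observation subject.
states_val = [rng.rand(batch_size, history_size, 8, 8)]
actions_val = rng.randint(n_actions, size=batch_size)    # 1-D integers, one action per transition
rewards_val = rng.rand(batch_size)                       # 1-D floats, one reward per transition
terminals_val = np.zeros(batch_size, dtype=bool)         # 1-D booleans, True at episode ends

# Same indexing as in the hunk above, now that actions_val is 1-D:
onehot_actions = np.zeros((batch_size, n_actions))
onehot_actions[np.arange(batch_size), actions_val] = 1
print(onehot_actions.sum(axis=1))                        # each row holds exactly one 1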
diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index 24914fbc..6b0e5dc9 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -57,8 +57,6 @@ def encoder_model(self): for i, dim in enumerate(self._input_dimensions): # - observation[i] is a FRAME - print "dim enc" - print dim if len(dim) == 3 or len(dim) == 4: input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) @@ -210,8 +208,6 @@ def transition_model(self): if(self._high_int_dim==True): dim=self._input_dimensions[0] #FIXME inputs = [ Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' - print inputs[0]._keras_shape - print inputs[1]._keras_shape layers_action=inputs[1] layers_action=RepeatVector(-(-dim[-2] // self._pooling_encoder)*-(-dim[-1] // self._pooling_encoder))(layers_action)#K.repeat_elements(layers_action,rep=dim[-2]*dim[-1],axis=1) @@ -281,11 +277,7 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): for d in range(plan_depth+1): inputs.append(Input(shape=(self._n_actions,))) Tx= transition_model([Tx,inputs[-1]]) - - print "Tx._keras_shape" - print Tx._keras_shape - print enc_x_._keras_shape - + x = Subtract()([Tx,enc_x_]) input = Input(shape=(1,)) # 1-terminals (0 if transition is terminal) @@ -332,9 +324,6 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): inputs.append(Input(shape=(self._n_actions,))) Tx= transition_model([Tx,inputs[-1]]) - print "Tx._keras_shape" - print Tx._keras_shape - diff_features = Subtract()([Tx,enc_x]) # Modification of the features after (sequence of) action(s) model = Model(inputs=inputs, outputs=diff_features ) @@ -432,11 +421,6 @@ def diff_sa_sa(self,encoder_model,transition_model): x = Subtract()([Tx,rand_Tx]) else: x = Subtract()([Tx,rand_Tx]) - print "x._keras_shape" - print x._keras_shape - #x = Dot(axes=-1, normalize=False)([x,x]) - #print "x._keras_shape" - #print x._keras_shape model = Model(inputs=inputs, outputs=x ) @@ -493,10 +477,8 @@ def R_model(self): layers_action=inputs[1] layers_action=RepeatVector(-(-dim[-2] // self._pooling_encoder)*-(-dim[-1] // self._pooling_encoder))(layers_action) - print layers_action._keras_shape layers_action=Reshape((self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) #data_format='channels_last' - print layers_action._keras_shape x = Concatenate(axis=-1)([layers_action,inputs[0]]) @@ -575,14 +557,11 @@ def Q_model(self): outs_conv=[] for i, dim in enumerate(self._input_dimensions): # - observation[i] is a FRAME - print "dim Q mod" - print dim if len(dim) == 3 or len(dim) == 4: input = Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)) #data_format is already 'channels_last' inputs.append(input) #reshaped=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) x = input #data_format is already 'channels_last' - print x._keras_shape x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) @@ -614,16 +593,6 @@ def Q_model(self): else: inputs = [ Input( shape=(self.internal_dim,) ) ] #x x = Dense(20, 
activation='tanh')(inputs[0]) - - - #if (self._action_as_input==True): - # if ( isinstance(self._n_actions,int)): - # print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") - # else: - # input = Input(shape=(len(self._n_actions),)) - # inputs.append(input) - - #x = Add()([x,inputs[-1]]) #???? # we stack a deep fully-connected network on top x = Dense(50, activation='tanh')(x) diff --git a/deer/learning_algos/q_net_keras.py b/deer/learning_algos/q_net_keras.py index 9c49d4c7..fe223381 100644 --- a/deer/learning_algos/q_net_keras.py +++ b/deer/learning_algos/q_net_keras.py @@ -85,11 +85,18 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Parameters ----------- - states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - actions_val : b x 1 numpy array of integers - rewards_val : b x 1 numpy array - next_states_val : list of batch_size * [list of max_num_elements* [list of k * [element 2D,1D or scalar]]) - terminals_val : b x 1 numpy boolean array + states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + actions_val : numpy array of integers with size [self._batch_size] + actions[i] is the action taken after having observed states[:][i]. + rewards_val : numpy array of floats with size [self._batch_size] + rewards[i] is the reward obtained for taking actions[i-1]. + next_states_val : numpy array of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + terminals_val : numpy array of booleans with size [self._batch_size] + terminals[i] is True if the transition leads to a terminal state and False otherwise Returns ------- @@ -116,11 +123,11 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals q_vals=self.q_vals.predict(states_val.tolist()) # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff - q_val=q_vals[np.arange(self._batch_size), actions_val.reshape((-1,))]#.reshape((-1, 1)) + q_val=q_vals[np.arange(self._batch_size), actions_val] diff = - q_val + target loss_ind=pow(diff,2) - q_vals[ np.arange(self._batch_size), actions_val.reshape((-1,)) ] = target + q_vals[ np.arange(self._batch_size), actions_val ] = target # Is it possible to use something more flexible than this? # Only some elements of next_q_vals are actual value that I target. From 76287a9493788fc2e3d320c140d398012d78494b Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 24 Jul 2018 15:03:30 -0400 Subject: [PATCH 70/96] modif doc --- README.rst | 2 +- docs/index.rst | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index e6ad2144..59602735 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ DeeR is a python library for Deep Reinforcement. It is build with modularity in Dependencies ============ -This framework is tested to work under Python 2.7, and Python 3.5. It should also work with Python 3.3 and 3.4. +This framework is tested to work under Python 2.7, and Python 3.5. The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need Keras>=2.1. 
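Note: as a rough illustration of the Bellman-target construction that the train() method documented above performs, here is a self-contained NumPy sketch with the Keras network outputs replaced by random placeholder arrays; discount stands in for the discount factor kept by the learning algorithm.

import numpy as np

batch_size, n_actions, discount = 32, 4, 0.95
rng = np.random.RandomState(1)

q_vals = rng.rand(batch_size, n_actions)        # Q(s, .) from the online network
next_q_vals = rng.rand(batch_size, n_actions)   # Q(s', .) from the target network
actions_val = rng.randint(n_actions, size=batch_size)
rewards_val = rng.rand(batch_size)
terminals_val = rng.rand(batch_size) < 0.05

# Bellman target: r + gamma * max_a' Q_target(s', a'), cut off at terminal transitions.
target = rewards_val + (1.0 - terminals_val) * discount * np.max(next_q_vals, axis=1)

# Individual squared TD errors, mirroring the loss_ind computation in the hunk above.
diff = target - q_vals[np.arange(batch_size), actions_val]
loss_ind = diff ** 2

# Only the entries of the taken actions are overwritten with the target, so fitting
# q_vals with an MSE loss leaves the other action values unchanged.
q_vals[np.arange(batch_size), actions_val] = target
print(np.sqrt(np.mean(loss_ind)))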
diff --git a/docs/index.rst b/docs/index.rst index fbf05760..54e49536 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,7 +13,14 @@ In addition, the framework is made in such a way that it is easy to * build any environment * modify any part of the learning process -* use your favorite python-based framework to code your own neural network architecture. The provided neural network architectures are based on Keras (or pure Theano) but you may easily use others. +* use your favorite python-based framework to code your own learning algorithm or neural network architecture. The provided learning algorithms and neural network architectures are based on Keras. + +.. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/schema_deer.png + :scale: 50 % + :alt: alternate text + :align: right + +:Figure: General schema of the different elements implement in DeeR. It is a work in progress and input is welcome. Please submit any contribution via pull request. From 00670281c9b5379eb3546ade102fe0c4a6d5f222 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 31 Jul 2018 09:57:08 -0400 Subject: [PATCH 71/96] Pass the dataset to policies --- deer/agent.py | 4 ++-- deer/base_classes/policy.py | 4 ++-- deer/learning_algos/CRAR_keras.py | 2 +- deer/learning_algos/q_net_keras.py | 4 ++-- deer/policies/EpsilonGreedyPolicy.py | 4 ++-- deer/policies/LongerExplorationPolicy.py | 4 ++-- examples/test_CRAR/run_simple_maze.py | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index c5317e9f..7169a05b 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -374,11 +374,11 @@ def _chooseAction(self): if self._mode != -1: # Act according to the test policy if not in training mode - action, V = self._test_policy.action(self._state, mode=self._mode) + action, V = self._test_policy.action(self._state, mode=self._mode, dataset=self._dataset) else: if self._dataset.n_elems > self._replay_start_size: # follow the train policy - action, V = self._train_policy.action(self._state) #is self._state the only way to store/pass the state? + action, V = self._train_policy.action(self._state, mode=None, dataset=self._dataset) #is self._state the only way to store/pass the state? else: # Still gathering initial data: choose dummy action action, V = self._train_policy.randomAction() diff --git a/deer/base_classes/policy.py b/deer/base_classes/policy.py index 436eb53a..a72e7c90 100644 --- a/deer/base_classes/policy.py +++ b/deer/base_classes/policy.py @@ -24,10 +24,10 @@ def __init__(self, learning_algo, n_actions,random_state): pass - def bestAction(self, state, mode=None): + def bestAction(self, state, mode=None, *args, **kwargs): """ Returns the best Action for the given state. This is an additional encapsulation for q-network. 
""" - action,V = self.learning_algo.chooseBestAction(state, mode) + action,V = self.learning_algo.chooseBestAction(state, mode, *args, **kwargs) return action, V def randomAction(self): diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index c3e28d02..0a0ad2fb 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -553,7 +553,7 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) return r_vals_d0+gamma_vals_d0*np.amax(self.qValues_planning_abstr(next_x_predicted,R,gamma,T,Q,d=d-1,branching_factor=branching_factor).reshape(len(state_abstr_val)*this_branching_factor,branching_factor[0]),axis=1).flatten() - def chooseBestAction(self, state, mode): + def chooseBestAction(self, state, mode, *args, **kwargs): """ Get the best action for a belief state Arguments diff --git a/deer/learning_algos/q_net_keras.py b/deer/learning_algos/q_net_keras.py index fe223381..a4eac61d 100644 --- a/deer/learning_algos/q_net_keras.py +++ b/deer/learning_algos/q_net_keras.py @@ -87,14 +87,14 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals ----------- states_val : numpy array of objects Each object is a numpy array that relates to one of the observations - with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]. actions_val : numpy array of integers with size [self._batch_size] actions[i] is the action taken after having observed states[:][i]. rewards_val : numpy array of floats with size [self._batch_size] rewards[i] is the reward obtained for taking actions[i-1]. next_states_val : numpy array of objects Each object is a numpy array that relates to one of the observations - with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]. 
terminals_val : numpy array of booleans with size [self._batch_size] terminals[i] is True if the transition leads to a terminal state and False otherwise diff --git a/deer/policies/EpsilonGreedyPolicy.py b/deer/policies/EpsilonGreedyPolicy.py index 638ba070..854b2aa4 100644 --- a/deer/policies/EpsilonGreedyPolicy.py +++ b/deer/policies/EpsilonGreedyPolicy.py @@ -14,11 +14,11 @@ def __init__(self, learning_algo, n_actions, random_state, epsilon): Policy.__init__(self, learning_algo, n_actions, random_state) self._epsilon = epsilon - def action(self, state, mode=None): + def action(self, state, mode=None, *args, **kwargs): if self.random_state.rand() < self._epsilon: action, V = self.randomAction() else: - action, V = self.bestAction(state, mode) + action, V = self.bestAction(state, mode, *args, **kwargs) return action, V diff --git a/deer/policies/LongerExplorationPolicy.py b/deer/policies/LongerExplorationPolicy.py index 3ad07aaf..e0726ee4 100644 --- a/deer/policies/LongerExplorationPolicy.py +++ b/deer/policies/LongerExplorationPolicy.py @@ -26,7 +26,7 @@ def __init__(self, learning_algo, n_actions, random_state, epsilon, length=10): self._count_down = -1 self._action_sequence = [] - def action(self, state): + def action(self, state, mode=None, *args, **kwargs): if self._count_down >= 0: # Take the next exploration action in the sequence V = 0 @@ -42,7 +42,7 @@ def action(self, state): self._count_down -= 1 else: # Simply act greedily with respect to what is currently believed to be the best action - action, V = self.bestAction(state) + action, V = self.bestAction(state, mode, args, kwargs) return np.array(action), V diff --git a/examples/test_CRAR/run_simple_maze.py b/examples/test_CRAR/run_simple_maze.py index 2a7eaa5a..d3b74e2e 100644 --- a/examples/test_CRAR/run_simple_maze.py +++ b/examples/test_CRAR/run_simple_maze.py @@ -36,7 +36,7 @@ class Defaults: # ---------------------- UPDATE_RULE = 'rmsprop' LEARNING_RATE = 0.0005 - LEARNING_RATE_DECAY = 0.98 + LEARNING_RATE_DECAY = 0.9 DISCOUNT = 0.9 DISCOUNT_INC = 1 DISCOUNT_MAX = 0.99 From 5e3117253b2afaaeba54f76872b550d6f54cbab8 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 1 Aug 2018 11:09:16 -0400 Subject: [PATCH 72/96] clean comments+code, introduce clip_norm instead of unused clip_delta in all learning algos --- deer/default_parser.py | 6 +- deer/learning_algos/AC_net_keras.py | 31 ++- deer/learning_algos/CRAR_keras.py | 245 ++++++------------ deer/learning_algos/NN_CRAR_keras.py | 31 --- deer/learning_algos/q_net_keras.py | 32 ++- examples/ALE/run_ALE.py | 4 +- .../MG_two_storages/run_MG_two_storages.py | 4 +- examples/gym/run_mountain_car.py | 4 +- examples/gym/run_mountain_car_continuous.py | 4 +- examples/gym/run_pendulum.py | 4 +- examples/test_CRAR/run_catcher.py | 6 +- examples/test_CRAR/run_simple_maze.py | 4 +- examples/toy_env/run_toy_env.py | 4 +- 13 files changed, 143 insertions(+), 236 deletions(-) diff --git a/deer/default_parser.py b/deer/default_parser.py index e8470a0a..9b300883 100644 --- a/deer/default_parser.py +++ b/deer/default_parser.py @@ -50,9 +50,9 @@ def process_args(args, defaults): parser.add_argument('--momentum', type=float, default=defaults.MOMENTUM, help=('Momentum term for Nesterov momentum. '+ '(default: %(default)s)')) - parser.add_argument('--clip-delta', dest="clip_delta", type=float, - default=defaults.CLIP_DELTA, - help=('Max absolute value for Q-update delta value. 
' + + parser.add_argument('--clip-norm', dest="clip_norm", type=float, + default=defaults.CLIP_NORM, + help=('Max L2 norm for the gradient. ' + '(default: %(default)s)')) parser.add_argument('--discount', type=float, default=defaults.DISCOUNT, help='Discount rate init') diff --git a/deer/learning_algos/AC_net_keras.py b/deer/learning_algos/AC_net_keras.py index 41bf0882..5b93a3b0 100644 --- a/deer/learning_algos/AC_net_keras.py +++ b/deer/learning_algos/AC_net_keras.py @@ -30,9 +30,9 @@ class MyACNetwork(ACNetwork): rms_epsilon : float Parameter for rmsprop. Default : 0.0001 momentum : float - Default : 0 - clip_delta : float - Not implemented. + Momentum for SGD. Default : 0 + clip_norm : float + The gradient tensor will be clipped to a maximum L2 norm given by this value. freeze_interval : int Period during which the target network is freezed and after which the target network is updated. Default : 1000 batch_size : int @@ -51,7 +51,7 @@ class MyACNetwork(ACNetwork): default is deer.learning_algos.NN_keras """ - def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network_critic=NN, neural_network_actor=NN): + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network_critic=NN, neural_network_actor=NN): """ Initialize environment """ @@ -60,6 +60,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._rho = rho self._rms_epsilon = rms_epsilon self._momentum = momentum + self._clip_norm = clip_norm self._freeze_interval = freeze_interval self._double_Q = double_Q self._random_state = random_state @@ -74,9 +75,9 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.q_vals, self.params, self.inputsQ = Q_net._buildDQN() if (update_rule=="sgd"): - optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False) + optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) elif (update_rule=="rmsprop"): - optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon) + optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) else: raise Exception('The update_rule '+update_rule+ 'is not implemented.') @@ -104,6 +105,12 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de def getAllParams(self): + """ Get all parameters used by the learning algorithm + + Returns + ------- + Values of the parameters: list of numpy arrays + """ params_value=[] for i,p in enumerate(self.params): params_value.append(K.get_value(p)) @@ -113,6 +120,13 @@ def getAllParams(self): return params_value def setAllParams(self, list_of_values): + """ Set all parameters used by the learning algorithm + + Arguments + --------- + list_of_values : list of numpy arrays + list of the parameters to be set (same order than given by getAllParams()). + """ for i,p in enumerate(self.params): K.set_value(p,list_of_values[i]) for j,p in enumerate(self.params_policy): @@ -120,10 +134,7 @@ def setAllParams(self, list_of_values): def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): """ - Train one batch. - - 1. 
Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared - 2. perform batch training + Train the actor-critic algorithm from one batch of data. Parameters ----------- diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 0a0ad2fb..1de6df15 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -16,6 +16,8 @@ import copy def mean_squared_error_p(y_true, y_pred): + """ Modified mean square error that clips + """ return K.clip(K.max( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = modified mse error L_inf #return K.clip(K.mean( K.square( y_pred - y_true ) , axis=-1 )-1,0.,100.) # = modified mse error L_2 @@ -41,9 +43,9 @@ class CRAR(LearningAlgo): rms_epsilon : float Parameter for rmsprop. Default : 0.0001 momentum : float - Default : 0 - clip_delta : float - Not implemented. + Momentum for SGD. Default : 0 + clip_norm : float + The gradient tensor will be clipped to a maximum L2 norm given by this value. freeze_interval : int Period during which the target network is freezed and after which the target network is updated. Default : 1000 batch_size : int @@ -58,8 +60,8 @@ class CRAR(LearningAlgo): default is deer.learning_algos.NN_keras """ - def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): - """ Initialize environment + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): + """ Initialize the environment """ LearningAlgo.__init__(self,environment, batch_size) @@ -67,6 +69,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._rho = rho self._rms_epsilon = rms_epsilon self._momentum = momentum + self._clip_norm = clip_norm self._update_rule = update_rule self._freeze_interval = freeze_interval self._double_Q = double_Q @@ -84,7 +87,6 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self.loss_disambiguate1=0 self.loss_disambiguate2=0 self.loss_gamma=0 - self.learn_and_plan = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) @@ -113,11 +115,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de # constraint on consecutive t self.diff_s_s_ = self.learn_and_plan.encoder_diff_model(self.encoder) -# self.diff_Tx = self.learn_and_plan.diff_Tx(self.transition) - # used to disentangle actions - self.diff_sa_sa = self.learn_and_plan.diff_sa_sa(self.encoder,self.transition) - layers=self.encoder.layers+self.Q.layers+self.R.layers+self.gamma.layers+self.transition.layers # Grab all the parameters together. 
self.params = [ param @@ -146,6 +144,10 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de def getAllParams(self): """ Provides all parameters used by the learning algorithm + + Returns + ------- + Values of the parameters: list of numpy arrays """ params_value=[] for i,p in enumerate(self.params): @@ -153,15 +155,19 @@ def getAllParams(self): return params_value def setAllParams(self, list_of_values): + """ Set all parameters used by the learning algorithm + + Arguments + --------- + list_of_values : list of numpy arrays + list of the parameters to be set (same order than given by getAllParams()). + """ for i,p in enumerate(self.params): K.set_value(p,list_of_values[i]) def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): """ - Train one batch. - - 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared - 2. perform batch training + Train CRAR from one batch of data. Parameters ----------- @@ -180,8 +186,8 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals Returns ------- - Average loss of the batch training (RMSE) - Individual (square) losses for each tuple + Average loss of the batch training for the Q-values (RMSE) + Individual (square) losses for the Q-values for each tuple """ onehot_actions = np.zeros((self._batch_size, self._n_actions)) @@ -213,11 +219,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print R[0] # Fit transition - #print "states_val+next_states_val+[onehot_actions]+[(1-terminals_val)]" - #print states_val+next_states_val+[onehot_actions]+[(1-terminals_val)] - l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim)) - #print "l" - #print l + l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) self.loss_T+=l # Interpretable AI @@ -250,41 +252,20 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 # reduce the squared value of the abstract features - #print "states_val[0][0:2]" - #print states_val[0][0:2] - #print "self.encoder.predict(states_val)" - #print self.encoder.predict(states_val) - l=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - #print l - self.loss_disambiguate1+=l + self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) # Increase the entropy in the abstract features of two states # This is done only when states_val is made up of only one observation --> FIXME rolled=np.roll(states_val[0],1,axis=0) -# for i in range(self._batch_size): -# j=0 -# l=0 -# while((states_val[0][i]==rolled[i+j-l]).all()): -# if(i+j==31): -# l=self._batch_size -# if(j==31): -# break -# j=j+1 -# rolled[i]=rolled[i+j-l] - self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - - - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1)))#np.ones(self._batch_size)) #np.ones((self._batch_size,3))*2) + 
self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - # Disentangle actions - self.loss_disentangle_a+=self.diff_sa_sa.train_on_batch(states_val+[onehot_actions,onehot_actions_rand], np.reshape(np.zeros_like(Es),(self._batch_size,-1)))#np.ones(self._batch_size)) + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1))) # # # Loss to have all s' following s,a with a to a distance 1 of s,a) # tiled_x=np.tile(Es,(self._n_actions,1)) # tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) # tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) -# #self.loss_disentangle_a+=self.diff_Tx.train_on_batch([tiled_x,tiled_onehot_actions,tiled_x,tiled_onehot_actions2], np.ones(self._batch_size*self._n_actions)) @@ -310,13 +291,9 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disambiguate1=0 self.loss_disambiguate2=0 - #print "self.encoder.train_on_batch([states_val[0]],np.zeros((32,self.learn_and_plan.internal_dim)))" - #print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) - #print self.encoder.train_on_batch([states_val[0]],np.zeros_like(Es)) #np.zeros((32,self.learn_and_plan.internal_dim))) - print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((32,self.learn_and_plan.internal_dim))) - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) #np.zeros((32,self.learn_and_plan.internal_dim))) + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) + print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) if self.update_counter % self._freeze_interval == 0: @@ -325,7 +302,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals next_q_vals = self.full_Q_target.predict(next_states_val) if(self._double_Q==True): - #next_q_vals_current_qnet=self.full_Q.predict(next_states_val+[np.zeros_like(Es)]) next_q_vals_current_qnet=self.full_Q.predict(next_states_val) argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) @@ -349,20 +325,16 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Only some elements of next_q_vals are actual value that I target. # My loss should only take these into account. 
# Workaround here is that many values are already "exact" in this update - #if (self.update_counter<10000): - noise_to_be_robust=np.zeros_like(Es) #np.random.normal(size=(self._batch_size,self.learn_and_plan.internal_dim))*0.#25 loss=0 - #loss=self.full_Q.train_on_batch([states_val[0],noise_to_be_robust] , q_vals ) loss=self.full_Q.train_on_batch(states_val , q_vals ) self.loss_Q+=loss if(self.update_counter%100==0): - print self.update_counter + print ("Number of training steps:"+str(self.update_counter)+".") self.update_counter += 1 - # loss*self._n_actions = np.average(loss_ind) return np.sqrt(loss),loss_ind @@ -371,7 +343,7 @@ def qValues(self, state_val): Arguments --------- - state_val : one belief state + state_val : one pseudo state Returns ------- @@ -387,58 +359,49 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): Arguments --------- - state_val : one belief state + state_val : one pseudo state d : planning depth Returns ------- The q values with planning depth d for the provided belief state """ - #print "state_val[0]" - #print state_val[0] - #print len(state_val) -# print "state_val[0][0]" -# print state_val[0][0] -# print state_val[0].shape print "self.full_Q.predict(state_val)[0]" print self.full_Q.predict(state_val)[0] encoded_x = self.encoder.predict(state_val) - ## DEBUG PURPOSES -# print "encoded_x[0]" -# print encoded_x[0] - - identity_matrix = np.diag(np.ones(self._n_actions)) - if(encoded_x.ndim==2): - tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) - elif(encoded_x.ndim==4): - tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) - else: - print ("error") - - repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) - ##print tile3_encoded_x - ##print repeat_identity - r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) - #print "r_vals_d0" - #print r_vals_d0 - r_vals_d0=r_vals_d0.flatten() - print "r_vals_d0" - print r_vals_d0 - next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) - #print "next_x_predicted" - #print next_x_predicted - one_hot_first_action=np.zeros((1,self._n_actions)) - one_hot_first_action[0]=1 - next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) - next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) - next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) - #print "next_x_predicted action 0 t4" - #print next_x_predicted - ## END DEBUG PURPOSES + +# ## DEBUG PURPOSES +# identity_matrix = np.diag(np.ones(self._n_actions)) +# if(encoded_x.ndim==2): +# tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) +# elif(encoded_x.ndim==4): +# tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1,1,1)) +# else: +# print ("error") +# +# repeat_identity=np.repeat(identity_matrix,len(encoded_x),axis=0) +# ##print tile3_encoded_x +# ##print repeat_identity +# r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) +# #print "r_vals_d0" +# #print r_vals_d0 +# r_vals_d0=r_vals_d0.flatten() +# print "r_vals_d0" +# print r_vals_d0 +# next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) +# #print "next_x_predicted" +# #print next_x_predicted +# one_hot_first_action=np.zeros((1,self._n_actions)) +# one_hot_first_action[0]=1 +# next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) +# next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) +# next_x_predicted=T.predict([next_x_predicted[0:1],one_hot_first_action]) +# #print "next_x_predicted action 0 t4" +# #print next_x_predicted +# ## END DEBUG PURPOSES 
QD_plan=0 - for i in range(d+1): #TO DO: improve planning algorithm - #print encoded_x + for i in range(d+1): Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=[self._n_actions,2,2,2,2,2,2,2]).reshape(len(encoded_x),-1) print "Qd,i" print Qd,i @@ -449,43 +412,6 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): print QD_plan return QD_plan - -# def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): -# """ -# """ -# branching_factor=self._n_actions #TO IMPROVE, use MCTS, etc... -# n=len(state_abstr_val) -# identity_matrix = np.diag(np.ones(self._n_actions)) -# -# this_branching_factor=branching_factor -# -# if (d==0): -# return Q.predict([state_abstr_val]) # no change in the order of the actions -# else: -# # All actions are considered in the tree -# repeat_identity=np.repeat(identity_matrix,len(state_abstr_val),axis=0) # no change in the order of the actions -# if(state_abstr_val.ndim==2): -# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1)) -# elif(state_abstr_val.ndim==4): -# tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1,1,1)) -# else: -# print ("error") -# -# #print tile3_encoded_x -# #print repeat_identity -# r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) -# #print "r_vals_d0" -# #print r_vals_d0 -# r_vals_d0=r_vals_d0.flatten() -# -# gamma_vals_d0=np.array(gamma.predict([tile3_encoded_x,repeat_identity])) -# #print "r_vals_d0" -# #print r_vals_d0 -# gamma_vals_vals_d0=gamma_vals_d0.flatten() -# -# next_x_predicted=T.predict([tile3_encoded_x,repeat_identity]) -# return r_vals_d0+gamma_vals_vals_d0*np.amax(self.qValues_planning_abstr(next_x_predicted,R,gamma,T,Q,d=d-1,branching_factor=branching_factor).reshape(len(state_abstr_val)*this_branching_factor,branching_factor),axis=1).flatten() - def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): """ @@ -575,49 +501,46 @@ def chooseBestAction(self, state, mode, *args, **kwargs): return np.argmax(q_vals),np.max(q_vals) def _compile(self): - """ compile self.q_vals + """ Compile all the optimizers for the different losses """ if (self._update_rule=="sgd"): - optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False) + optimizer=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) + optimizer1=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) # Different optimizers for each network; + optimizer3=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) # to possibly modify them separately + optimizer4=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) + optimizer5=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) + optimizer6=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) + optimizer7=SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) elif (self._update_rule=="rmsprop"): - optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon) + optimizer=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) + optimizer1=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) # Different optimizers for each network; + optimizer3=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) # to possibly modify them separately + 
optimizer4=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) + optimizer5=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) + optimizer6=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) + optimizer7=RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) + else: raise Exception('The update_rule '+self._update_rule+' is not implemented.') self.full_Q.compile(optimizer=optimizer, loss='mse') - optimizer1=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # Different optimizers for each network; otherwise not possible to modify each - optimizer3=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) # separately (e.g. lr) - optimizer4=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer5=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer6=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer7=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) - optimizer8=RMSprop(lr=self._lr, rho=0.9, epsilon=1e-06) self.diff_Tx_x_.compile(optimizer=optimizer1, loss='mse') # Fit transitions self.full_R.compile(optimizer=optimizer3, loss='mse') # Fit rewards self.full_gamma.compile(optimizer=optimizer3, loss='mse') # Fit discount if(self._high_int_dim==False): - self.force_features.compile(optimizer=optimizer8, + self.force_features.compile(optimizer=optimizer7, loss=cosine_proximity2) self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_p) self.encoder_diff.compile(optimizer=optimizer5, loss=exp_dec_error) - #metrics=['accuracy']) self.diff_s_s_.compile(optimizer=optimizer6, - loss=exp_dec_error)#'mse')#loss_diff_s_s_) - #metrics=['accuracy']) - - self.diff_sa_sa.compile(optimizer=optimizer7, - loss=exp_dec_error)#loss_diff_s_s_) - -# self.diff_Tx.compile(optimizer=optimizer, -# loss=mean_squared_error) -# #metrics=['accuracy']) + loss=exp_dec_error) def _resetQHat(self): for i,(param,next_param) in enumerate(zip(self.params, self.params_target)): @@ -632,7 +555,7 @@ def setLearningRate(self, lr): The learning rate that has to be set """ self._lr = lr - print "modif lr" + print ("New learning rate set to "+str(self._lr)+".") # Changing the learning rates (NB:recompiling seems to lead to memory leaks!) K.set_value(self.full_Q.optimizer.lr, self._lr) @@ -647,8 +570,6 @@ def setLearningRate(self, lr): K.set_value(self.encoder_diff.optimizer.lr, self._lr) K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby - K.set_value(self.diff_sa_sa.optimizer.lr, 0) # 0 ! -# K.set_value(self.diff_Tx.optimizer.lr, self._lr/10.) 
def transfer(self, original, transfer, epochs=1): # First, make sure that the target network and the current network are the same @@ -663,14 +584,10 @@ def transfer(self, original, transfer, epochs=1): print x_original[0:10] for i in range(epochs): size = original[0].shape[0] - #print size - #print transfer[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] print "train" print self.encoder.train_on_batch(transfer[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] ) - #print self.encoder.train_on_batch(original[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] ) print "validation" print self.encoder.test_on_batch(transfer[0][int(size*0.8):] , x_original[int(size*0.8):]) - #print self.encoder.test_on_batch(original[0][int(size*0.8):] , x_original[int(size*0.8):] ) self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_p) diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index 6b0e5dc9..5ae4de33 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -426,37 +426,6 @@ def diff_sa_sa(self,encoder_model,transition_model): return model - def diff_Tx(self,transition_model): - """ - - Parameters - ----------- - x - a - x - a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ), Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) )] #x,a,x,a - - #identity_mat=inputs[2]#K.constant(np.diag(np.ones(self._n_actions)), name="identity_mat") - Tx = transition_model(inputs[:2]) - Tx2 = transition_model(inputs[2:]) - - #tile_x=K.tile(inputs[0],(self._n_actions,1)) - #Tx_ = transition_model([tile_x]+[identity_mat]) - - x = Subtract()([Tx,Tx2]) - x = Dot(axes=-1, normalize=False)([x,x]) - - model = Model(inputs=inputs, outputs=x ) - - return model - def R_model(self): """ Build a network consistent with each type of inputs diff --git a/deer/learning_algos/q_net_keras.py b/deer/learning_algos/q_net_keras.py index a4eac61d..0f08c8cf 100644 --- a/deer/learning_algos/q_net_keras.py +++ b/deer/learning_algos/q_net_keras.py @@ -22,9 +22,9 @@ class MyQNetwork(QNetwork): rms_epsilon : float Parameter for rmsprop. Default : 0.0001 momentum : float - Default : 0 - clip_delta : float - Not implemented. + Momentum for SGD. Default : 0 + clip_norm : float + The gradient tensor will be clipped to a maximum L2 norm given by this value. freeze_interval : int Period during which the target network is freezed and after which the target network is updated. 
Default : 1000 batch_size : int @@ -39,7 +39,7 @@ class MyQNetwork(QNetwork): default is deer.learning_algos.NN_keras """ - def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): + def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): """ Initialize environment """ @@ -49,8 +49,8 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._rho = rho self._rms_epsilon = rms_epsilon self._momentum = momentum + self._clip_norm = clip_norm self._update_rule = update_rule - #self.clip_delta = clip_delta self._freeze_interval = freeze_interval self._double_Q = double_Q self._random_state = random_state @@ -67,21 +67,31 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_de self._resetQHat() def getAllParams(self): + """ Get all parameters used by the learning algorithm + + Returns + ------- + Values of the parameters: list of numpy arrays + """ params_value=[] for i,p in enumerate(self.params): params_value.append(K.get_value(p)) return params_value def setAllParams(self, list_of_values): + """ Set all parameters used by the learning algorithm + + Arguments + --------- + list_of_values : list of numpy arrays + list of the parameters to be set (same order than given by getAllParams()). + """ for i,p in enumerate(self.params): K.set_value(p,list_of_values[i]) def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): """ - Train one batch. - - 1. Set shared variable in states_shared, next_states_shared, actions_shared, rewards_shared, terminals_shared - 2. perform batch training + Train the Q-network from one batch of data. 
Parameters ----------- @@ -173,9 +183,9 @@ def _compile(self): """ compile self.q_vals """ if (self._update_rule=="sgd"): - optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False) + optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) elif (self._update_rule=="rmsprop"): - optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon) + optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) else: raise Exception('The update_rule '+self._update_rule+' is not implemented.') diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index b5c06e5b..391c835c 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -42,7 +42,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = .1 EPSILON_DECAY = 100000 @@ -79,7 +79,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/MG_two_storages/run_MG_two_storages.py b/examples/MG_two_storages/run_MG_two_storages.py index 11bc1d44..e6913b7a 100644 --- a/examples/MG_two_storages/run_MG_two_storages.py +++ b/examples/MG_two_storages/run_MG_two_storages.py @@ -45,7 +45,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = .3 EPSILON_DECAY = 500000 @@ -86,7 +86,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/gym/run_mountain_car.py b/examples/gym/run_mountain_car.py index 6f338a06..9848972d 100644 --- a/examples/gym/run_mountain_car.py +++ b/examples/gym/run_mountain_car.py @@ -39,7 +39,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = 0.2 EPSILON_DECAY = 10000 @@ -68,7 +68,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/gym/run_mountain_car_continuous.py b/examples/gym/run_mountain_car_continuous.py index 67220f6e..5c8219b8 100644 --- a/examples/gym/run_mountain_car_continuous.py +++ b/examples/gym/run_mountain_car_continuous.py @@ -41,7 +41,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = 0.2 EPSILON_DECAY = 10000 @@ -70,7 +70,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/gym/run_pendulum.py b/examples/gym/run_pendulum.py index 6c8ae2a2..5baaeb06 100644 --- a/examples/gym/run_pendulum.py +++ b/examples/gym/run_pendulum.py @@ -40,7 +40,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = 0.2 EPSILON_DECAY = 10000 @@ -69,7 +69,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, 
parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/test_CRAR/run_catcher.py b/examples/test_CRAR/run_catcher.py index 0320a56c..2428912e 100644 --- a/examples/test_CRAR/run_catcher.py +++ b/examples/test_CRAR/run_catcher.py @@ -43,7 +43,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = 1.0 EPSILON_DECAY = 10000 @@ -76,7 +76,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, @@ -188,7 +188,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/test_CRAR/run_simple_maze.py b/examples/test_CRAR/run_simple_maze.py index d3b74e2e..1928cb08 100644 --- a/examples/test_CRAR/run_simple_maze.py +++ b/examples/test_CRAR/run_simple_maze.py @@ -43,7 +43,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = 1.0 EPSILON_DECAY = 10000 @@ -75,7 +75,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, diff --git a/examples/toy_env/run_toy_env.py b/examples/toy_env/run_toy_env.py index 739d3697..6dbbda1e 100644 --- a/examples/toy_env/run_toy_env.py +++ b/examples/toy_env/run_toy_env.py @@ -42,7 +42,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = .1 EPSILON_DECAY = 10000 @@ -72,7 +72,7 @@ class Defaults: parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, From d3d6ba612977d7c836b86c06519c699d4b6faadd Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 1 Aug 2018 14:57:02 -0400 Subject: [PATCH 73/96] cleaning+swith to version 0.4 --- README.rst | 2 +- deer/learning_algos/CRAR_keras.py | 62 +++--- deer/learning_algos/NN_CRAR_keras.py | 7 +- docs/modules/environments.rst | 2 +- docs/user/installation.rst | 54 ++--- examples/test_CRAR/catcher_env.py | 78 ++----- examples/test_CRAR/run_catcher.py | 294 +++++++++++++------------- examples/test_CRAR/run_simple_maze.py | 22 -- examples/test_CRAR/simple_maze_env.py | 282 ++++++++++++------------ setup.py | 8 +- 10 files changed, 359 insertions(+), 452 deletions(-) diff --git a/README.rst b/README.rst index 59602735..d96330d0 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ DeeR is a python library for Deep Reinforcement. It is build with modularity in Dependencies ============ -This framework is tested to work under Python 2.7, and Python 3.5. +This framework is tested to work under Python 3.6. The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need Keras>=2.1. 
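Note: the clip_norm value threaded through the parser and the example launchers above ends up as the clipnorm argument of the Keras optimizers in the _compile hunks of this patch. The snippet below is only a standalone illustration of the clipping rule itself; clip_by_l2_norm is a hypothetical helper, not part of the library.

import numpy as np

def clip_by_l2_norm(grad, clip_norm):
    """Rescale grad when its L2 norm exceeds clip_norm, leave it untouched otherwise."""
    norm = np.sqrt(np.sum(grad ** 2))
    if norm > clip_norm:
        return grad * (clip_norm / norm)
    return grad

g = np.array([3.0, 4.0])           # L2 norm is 5
print(clip_by_l2_norm(g, 1.0))     # rescaled to norm 1 -> [0.6 0.8]
print(clip_by_l2_norm(g, 10.0))    # unchanged -> [3. 4.]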
diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 1de6df15..857eb39e 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -203,20 +203,20 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals R=self.R.predict([Es,onehot_actions]) if(self.update_counter%500==0): - print states_val[0][0] - print "len(states_val)" - print len(states_val) - print "next_states_val[0][0]" - print next_states_val[0][0] - print "actions_val[0], rewards_val[0], terminals_val[0]" - print actions_val[0], rewards_val[0], terminals_val[0] - print "Es[0],ETs[0],Es_[0]" + print ("Printing a few elements useful for debugging:") + print ("states_val[0][0]") + print (states_val[0][0]) + print ("next_states_val[0][0]") + print (next_states_val[0][0]) + print ("actions_val[0], rewards_val[0], terminals_val[0]") + print (actions_val[0], rewards_val[0], terminals_val[0]) + print ("Es[0],ETs[0],Es_[0]") if(Es.ndim==4): - print np.transpose(Es, (0, 3, 1, 2))[0],np.transpose(ETs, (0, 3, 1, 2))[0],np.transpose(Es_, (0, 3, 1, 2))[0] # data_format='channels_last' --> 'channels_first' + print (np.transpose(Es, (0, 3, 1, 2))[0],np.transpose(ETs, (0, 3, 1, 2))[0],np.transpose(Es_, (0, 3, 1, 2))[0]) # data_format='channels_last' --> 'channels_first' else: - print Es[0],ETs[0],Es_[0] - print "R[0]" - print R[0] + print (Es[0],ETs[0],Es_[0]) + print ("R[0]") + print (R[0]) # Fit transition l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) @@ -270,14 +270,12 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%500==0): - print "self.loss_Q" - print self.loss_Q - print "self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100." - print self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100. + print ("self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100.") + print (self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100.) if(self._high_int_dim==False): - print "self.loss_interpret/100." - print self.loss_interpret/100. + print ("self.loss_interpret/100.") + print (self.loss_interpret/100.) 
self.lossR=0 self.loss_gamma=0 @@ -291,10 +289,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disambiguate1=0 self.loss_disambiguate2=0 - print "self.encoder_diff.train_on_batch([states_val[0],np.roll(states_val[0],1,axis=0)],np.zeros((32,self.learn_and_plan.internal_dim)))" - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - print self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - if self.update_counter % self._freeze_interval == 0: self._resetQHat() @@ -366,11 +360,11 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): ------- The q values with planning depth d for the provided belief state """ - print "self.full_Q.predict(state_val)[0]" - print self.full_Q.predict(state_val)[0] encoded_x = self.encoder.predict(state_val) # ## DEBUG PURPOSES +# print ( "self.full_Q.predict(state_val)[0]" ) +# print ( self.full_Q.predict(state_val)[0] ) # identity_matrix = np.diag(np.ones(self._n_actions)) # if(encoded_x.ndim==2): # tile3_encoded_x=np.tile(encoded_x,(self._n_actions,1)) @@ -403,13 +397,13 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): QD_plan=0 for i in range(d+1): Qd=self.qValues_planning_abstr(encoded_x, R, gamma, T, Q, d=i, branching_factor=[self._n_actions,2,2,2,2,2,2,2]).reshape(len(encoded_x),-1) - print "Qd,i" - print Qd,i + print ("Qd,i") + print (Qd,i) QD_plan+=Qd QD_plan=QD_plan/(d+1) - print "QD_plan" - print QD_plan + print ("QD_plan") + print (QD_plan) return QD_plan @@ -580,14 +574,14 @@ def transfer(self, original, transfer, epochs=1): # Then, train the encoder such that the original and transfer states are mapped into the same abstract representation x_original=self.encoder.predict(original)#[0] - print "x_original[0:10]" - print x_original[0:10] + print ("x_original[0:10]") + print (x_original[0:10]) for i in range(epochs): size = original[0].shape[0] - print "train" - print self.encoder.train_on_batch(transfer[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] ) - print "validation" - print self.encoder.test_on_batch(transfer[0][int(size*0.8):] , x_original[int(size*0.8):]) + print ( "train" ) + print ( self.encoder.train_on_batch(transfer[0][0:int(size*0.8)] , x_original[0:int(size*0.8)] ) ) + print ( "validation" ) + print ( self.encoder.test_on_batch(transfer[0][int(size*0.8):] , x_original[int(size*0.8):]) ) self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_p) diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index 5ae4de33..f033b09e 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -180,7 +180,7 @@ def encoder_diff_model(self,encoder_model): input = Input(shape=(dim[-3],)) inputs.append(input) - half = len(inputs)/2 + half = len(inputs)//2 x1 = encoder_model(inputs[:half]) x2 = encoder_model(inputs[half:]) @@ -269,7 +269,7 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): input = Input(shape=(dim[-3],)) inputs.append(input) - half = len(inputs)/2 + half = len(inputs)//2 enc_x = encoder_model(inputs[:half]) #s --> x enc_x_ = encoder_model(inputs[half:]) #s --> x @@ -362,7 +362,7 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): # input = Input(shape=(dim[-3],)) # inputs.append(input) # -# half = len(inputs)/2 +# half = len(inputs)//2 # enc_x = encoder_model(inputs[:half]) #s --> x #FIXME # enc_x_ = encoder_model(inputs[half:]) 
#s --> x # @@ -614,7 +614,6 @@ def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=No disc_rewards=[] for d in range(plan_depth): inputs.append(Input(shape=(self._n_actions,))) - print inputs[-1:] reward=R_model([out]+inputs[-1:]) if(disc_plan == None): disc_rewards.append(reward) diff --git a/docs/modules/environments.rst b/docs/modules/environments.rst index 38f3d275..46dfe758 100644 --- a/docs/modules/environments.rst +++ b/docs/modules/environments.rst @@ -3,7 +3,7 @@ :mod:`Environment` ============================= -.. automodule:: deer.base_classes.Environment +.. automodule:: deer.base_classes.environment .. autoclass:: deer.base_classes.Environment :members: diff --git a/docs/user/installation.rst b/docs/user/installation.rst index 870fe01c..4f5aefc2 100644 --- a/docs/user/installation.rst +++ b/docs/user/installation.rst @@ -7,36 +7,14 @@ Installation Dependencies -------------- -This framework is tested to work under Python 2.7, and Python 3.5. It should also work with Python 3.3 and 3.4. +This framework is tested to work under Python 3.6. -The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need theano >= 0.7 (lasagne is optional) or you can write your own neural network using your favorite framework. +The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need keras or you can write your own learning algorithms using your favorite deep learning framework. For running some of the examples, Matplotlib >= 1.1.1 is required. You also sometimes need to install specific dependencies (e.g. for the atari games, you need to install ALE >= 0.4). -User install instructions --------------------------- - -You can install the framework with pip: - -.. code-block:: bash - - pip install deer - -For the bleeding edge version (recommanded), you can simply use - -.. code-block:: bash - - pip install git+git://github.com/VINF/deer.git@master - - -.. - If you want to update it to the bleeding edge version you can use pip for this with the command line below: - - .. code-block:: bash - - pip install --upgrade --no-deps git+git://github.com/VinF/deer - +We recommend to use the bleeding-edge version and to install it by following the :ref:`dev-install`. If you want a simpler installation procedure and do not intend to modify yourself the learning algorithms etc., you can look at the :ref:`user-install`. .. _dev-install: @@ -63,3 +41,29 @@ And you can install the framework as a package using the mode ``develop`` so tha python setup.py develop +.. _user-install: + +User install instructions +-------------------------- + +You can install the framework with pip: + +.. code-block:: bash + + pip install deer + +For the bleeding edge version (recommanded), you can simply use + +.. code-block:: bash + + pip install git+git://github.com/VINF/deer.git@master + + +.. + If you want to update it to the bleeding edge version you can use pip for this with the command line below: + + .. 
code-block:: bash + + pip install --upgrade --no-deps git+git://github.com/VinF/deer + + diff --git a/examples/test_CRAR/catcher_env.py b/examples/test_CRAR/catcher_env.py index 6f726e7d..ffb20f22 100644 --- a/examples/test_CRAR/catcher_env.py +++ b/examples/test_CRAR/catcher_env.py @@ -23,7 +23,6 @@ def __init__(self, rng, **kwargs): self._mode_score = 0.0 self._mode_episode_count = 0 - self._actions = [0,1] self._height=10#15 self._width=10 #preferably an odd number so that it's symmetrical self._width_paddle=1 @@ -62,8 +61,15 @@ def reset(self, mode): def act(self, action): - action = self._actions[action] - + """Applies the agent action [action] on the environment. + + Parameters + ----------- + action : int + The action selected by the agent to operate on the environment. Should be an identifier + included between 0 included and nActions() excluded. + """ + if(action==0): self.x = max(self.x-1,0) if(action==1): @@ -82,8 +88,8 @@ def act(self, action): return self.reward def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): - #print "test_data_set.observations.shape" - #print test_data_set.observations()[0][0:1] + """ Plot of the low-dimensional representation of the environment built by the model + """ all_possib_inp=[] for x_b in range(self._nx_block):#[1]:#range(self._nx_block): @@ -93,44 +99,15 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): all_possib_inp.append(state) all_possib_inp=np.expand_dims(all_possib_inp,axis=1) - print "all_possib_inp" - print all_possib_inp[0] - print all_possib_inp[self._height*(self._width-self._width_paddle+1)-1] - print all_possib_inp[self._height*(self._width-self._width_paddle+1)] - print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)-1] - #print all_possib_inp[2*self._height*(self._width-self._width_paddle+1)] - #print all_possib_inp[3*self._height*(self._width-self._width_paddle+1)-1] - print "all_possib_inp.shape" - print all_possib_inp.shape - #print all_possib_inp[self._height*self._width] - #print "all_possib_inp[2*self._height*self._width]" - #print all_possib_inp[2*self._height*self._width] - #print all_possib_inp[2*self._height*self._width-1] all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) - print "learning_algo.encoder.predict(all_possib_inp)" - print all_possib_abs_states - #print "print test_data_set.observations()" - #print test_data_set.observations() n=self._height-1 historics=[] for i,observ in enumerate(test_data_set.observations()[0][0:n]): historics.append(np.expand_dims(observ,axis=0)) historics=np.array(historics) - print "historics" - print historics abs_states=learning_algo.encoder.predict(historics) - print "abs_states" - print abs_states actions=test_data_set.actions()[0:n] - print "actions" - print actions - - print actions - print "test_data_set.rewards()[0:n]" - print test_data_set.rewards()[0:n] - print "test_data_set.terminals()[0:n]" - print test_data_set.terminals()[0:n] if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) @@ -145,10 +122,6 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): y = np.array(abs_states)[:,1] z = np.array(abs_states)[:,2] - #Colors - #onehot_actions = np.zeros((n, 4)) - #onehot_actions[np.arange(n), actions] = 1 - fig = plt.figure() ax = fig.add_subplot(111,projection='3d') ax.set_xlabel(r'$X_1$') @@ 
-157,7 +130,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): for j in range(3): # Plot the trajectory - for i in xrange(30):#(n-1): + for i in range(30):#(n-1): ax.plot(x[j*24+i:j*24+i+2], y[j*24+i:j*24+i+2], z[j*24+i:j*24+i+2], color=plt.cm.cool(255*i/n), alpha=0.5) # Plot the estimated transitions @@ -165,16 +138,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0]])]) predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1]])]) ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.75", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) - -# for xx in np.arange(self._width)-self._width//2: -# for yy in np.arange(self._width)-self._width//2: -# for zz in np.arange(self._width)-self._width//2: -# predicted1=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[1,0]])]) -# predicted2=learning_algo.transition.predict([np.array([[xx,yy,zz]]),np.array([[0,1]])]) -# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) - + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.25", alpha=0.75) # Plot the colorbar for the trajectory fig.subplots_adjust(right=0.7) @@ -259,10 +223,6 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): # Plot the Q_vals c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) - #print "actions,C" - #print actions - #print c - #c=np.max(c,axis=1) m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) @@ -280,7 +240,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): cb1.set_label('Estimated expected return') plt.show() - for ii in xrange(-15,345,30): + for ii in range(-15,345,30): ax.view_init(elev=20., azim=ii) plt.savefig('fig_w_V_div5_'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') @@ -295,11 +255,8 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) c=np.max(c,axis=1) - #print "c" - #print c m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) fig.subplots_adjust(right=0.8) ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) cmap = matplotlib.cm.hot @@ -332,13 +289,8 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): c = learning_algo.R.predict([repeat_nactions_coord,tile_identity_matrix]) c=np.max(np.reshape(c,(125,self.nActions())),axis=1) - #print "c" - #print c - #mini=np.min(c) - #maxi=np.max(c) m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) - #plt.colorbar(m) 
fig.subplots_adjust(right=0.8) ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) cmap = matplotlib.cm.hot @@ -368,7 +320,7 @@ def observationType(self, subject): return np.float32 def nActions(self): - return len(self._actions) + return 2 def observe(self): obs=self.get_observation(self.y,self._x_block,self.x) diff --git a/examples/test_CRAR/run_catcher.py b/examples/test_CRAR/run_catcher.py index 2428912e..ae383b3c 100644 --- a/examples/test_CRAR/run_catcher.py +++ b/examples/test_CRAR/run_catcher.py @@ -129,7 +129,7 @@ class Defaults: pass dump(vars(parameters), "params/" + fname + ".jldump") - #agent.run(n_epochs=1, epoch_length=20000) + #agent.run(n_epochs=1, epoch_length=20000) #For collecting data off-policy #print "end gathering data" # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. @@ -175,150 +175,150 @@ class Defaults: agent.run(parameters.epochs, parameters.steps_per_epoch) - ### - # TRANSFER - ### - optimized_params=learning_algo.getAllParams() - print "optimized_params" - print optimized_params - - # --- Instantiate learning_algo --- - learning_algo = CRAR( - env, - parameters.rms_decay, - parameters.rms_epsilon, - parameters.momentum, - parameters.clip_norm, - parameters.freeze_interval, - parameters.batch_size, - parameters.update_rule, - rng, - double_Q=True, - high_int_dim=HIGH_INT_DIM, - internal_dim=3) - learning_algo.setAllParams(optimized_params) - - samples_transfer=500 - rand_ind=np.random.random_integers(0,20000,samples_transfer) - original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - - print "original[0][0:10], transfer[0][0:10]" - print original[0][0:10], transfer[0][0:10] - - # Transfer between the two repr - learning_algo.transfer(original, transfer, 5000) - - - # --- Instantiate environment with reverse=True --- - env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) - - # --- Re instantiate agent --- - agent = NeuralAgent( - env, - learning_algo, - parameters.replay_memory_size, - max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), - parameters.batch_size, - rng, - test_policy=test_policy) - - # --- Bind controllers to the agent --- - # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and - # learning rate as well as the training epoch number. - agent.attach(bc.VerboseController( - evaluate_on='epoch', - periodicity=1)) - - # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy - # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more - # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every - # episode or epoch (or never, hence the resetEvery='none'). - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 
- # Plus, we also want to display after each training episode (!= than after every training) the average bellman - # residual and the average of the V values obtained during the last episode, hence the two last arguments. - agent.attach(bc.TrainerController( - evaluate_on='action', - periodicity=parameters.update_frequency, - show_episode_avg_V_value=True, - show_avg_Bellman_residual=True)) - - # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we - # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. - agent.attach(bc.LearningRateController( - initial_learning_rate=parameters.learning_rate, - learning_rate_decay=parameters.learning_rate_decay, - periodicity=1)) - - # Same for the discount factor. - agent.attach(bc.DiscountFactorController( - initial_discount_factor=parameters.discount, - discount_factor_growth=parameters.discount_inc, - discount_factor_max=parameters.discount_max, - periodicity=1)) - - # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a - # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want - # these validation epoch to interfere with the training of the agent, which is well established by the - # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole - # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the - # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards - # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every - # [parameters.period_btw_summary_perfs] *validation* epochs. 
- agent.attach(bc.InterleavedTestEpochController( - id=catcher_env.VALIDATION_MODE, - epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], - periodicity=2, - show_score=True, - summarize_every=1)) - - - #agent.gathering_data=False - agent.run(parameters.epochs, parameters.steps_per_epoch) - - - #print "agent.DataSet.self._terminals" - #print "agent._dataset.terminals()" - #print agent._dataset.terminals() - #print agent._dataset._terminals._data[0:2000] - #print agent._dataset._actions._data[0:2000] -# r=agent._dataset._rewards._data[0:2000] -# print "r before" -# print r - print agent._dataset._observations[0]._data[0:10] -# ind=np.argwhere(r>0) -# print "agent._dataset._observations[0]._data[ind[0]]" -# print agent._dataset._observations[0]._data[ind[0]] -# print ind -# agent._dataset._rewards._data=np.delete(agent._dataset._rewards._data,ind) -# agent._dataset._terminals._data=np.delete(agent._dataset._terminals._data,ind) -# agent._dataset._actions._data=np.delete(agent._dataset._actions._data,ind) -# agent._dataset._observations[0]._data=np.delete(agent._dataset._observations[0]._data,ind,axis=0) -# r=agent._dataset._rewards._data[0:2000] -# print "r after" -# print r -# print "agent._dataset._observations[0]._data[ind[0]] after" -# print agent._dataset._observations[0]._data[ind[0]] +# ### +# # TRANSFER +# ### +# optimized_params=learning_algo.getAllParams() +# print ("The optimized_params are") +# print (optimized_params) # - - - - - # --- Show results --- - basename = "scores/" + fname - scores = joblib.load(basename + "_scores.jldump") - plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') - plt.legend() - plt.xlabel("Number of epochs") - plt.ylabel("Score") - plt.savefig(basename + "_scores.pdf") - plt.show() +# # --- Instantiate learning_algo --- +# learning_algo = CRAR( +# env, +# parameters.rms_decay, +# parameters.rms_epsilon, +# parameters.momentum, +# parameters.clip_norm, +# parameters.freeze_interval, +# parameters.batch_size, +# parameters.update_rule, +# rng, +# double_Q=True, +# high_int_dim=HIGH_INT_DIM, +# internal_dim=3) +# learning_algo.setAllParams(optimized_params) +# +# samples_transfer=500 +# rand_ind=np.random.random_integers(0,20000,samples_transfer) +# original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] +# transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] +# +# print ("original[0][0:10], transfer[0][0:10]") +# print (original[0][0:10], transfer[0][0:10]) +# +# # Transfer between the two repr +# learning_algo.transfer(original, transfer, 5000) +# +# +# # --- Instantiate environment with reverse=True --- +# env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) +# +# # --- Re instantiate agent --- +# agent = NeuralAgent( +# env, +# learning_algo, +# parameters.replay_memory_size, +# max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), +# parameters.batch_size, +# rng, +# test_policy=test_policy) +# +# # --- Bind controllers to the agent --- +# # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and +# # learning rate as well as the training epoch number. 
+# agent.attach(bc.VerboseController( +# evaluate_on='epoch', +# periodicity=1)) +# +# # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy +# # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more +# # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every +# # episode or epoch (or never, hence the resetEvery='none'). +# agent.attach(bc.EpsilonController( +# initial_e=parameters.epsilon_start, +# e_decays=parameters.epsilon_decay, +# e_min=parameters.epsilon_min, +# evaluate_on='action', +# periodicity=1, +# reset_every='none')) +# +# # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. +# # Plus, we also want to display after each training episode (!= than after every training) the average bellman +# # residual and the average of the V values obtained during the last episode, hence the two last arguments. +# agent.attach(bc.TrainerController( +# evaluate_on='action', +# periodicity=parameters.update_frequency, +# show_episode_avg_V_value=True, +# show_avg_Bellman_residual=True)) +# +# # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we +# # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. +# agent.attach(bc.LearningRateController( +# initial_learning_rate=parameters.learning_rate, +# learning_rate_decay=parameters.learning_rate_decay, +# periodicity=1)) +# +# # Same for the discount factor. +# agent.attach(bc.DiscountFactorController( +# initial_discount_factor=parameters.discount, +# discount_factor_growth=parameters.discount_inc, +# discount_factor_max=parameters.discount_max, +# periodicity=1)) +# +# # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a +# # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want +# # these validation epoch to interfere with the training of the agent, which is well established by the +# # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole +# # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the +# # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards +# # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every +# # [parameters.period_btw_summary_perfs] *validation* epochs. 
+# agent.attach(bc.InterleavedTestEpochController( +# id=catcher_env.VALIDATION_MODE, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4], +# periodicity=2, +# show_score=True, +# summarize_every=1)) +# +# +# #agent.gathering_data=False +# agent.run(parameters.epochs, parameters.steps_per_epoch) +# +# +# #print "agent.DataSet.self._terminals" +# #print "agent._dataset.terminals()" +# #print agent._dataset.terminals() +# #print agent._dataset._terminals._data[0:2000] +# #print agent._dataset._actions._data[0:2000] +## r=agent._dataset._rewards._data[0:2000] +## print "r before" +## print r +# print agent._dataset._observations[0]._data[0:10] +## ind=np.argwhere(r>0) +## print "agent._dataset._observations[0]._data[ind[0]]" +## print agent._dataset._observations[0]._data[ind[0]] +## print ind +## agent._dataset._rewards._data=np.delete(agent._dataset._rewards._data,ind) +## agent._dataset._terminals._data=np.delete(agent._dataset._terminals._data,ind) +## agent._dataset._actions._data=np.delete(agent._dataset._actions._data,ind) +## agent._dataset._observations[0]._data=np.delete(agent._dataset._observations[0]._data,ind,axis=0) +## r=agent._dataset._rewards._data[0:2000] +## print "r after" +## print r +## print "agent._dataset._observations[0]._data[ind[0]] after" +## print agent._dataset._observations[0]._data[ind[0]] +## +# +# +# +# +# # --- Show results --- +# basename = "scores/" + fname +# scores = joblib.load(basename + "_scores.jldump") +# plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') +# plt.legend() +# plt.xlabel("Number of epochs") +# plt.ylabel("Score") +# plt.savefig(basename + "_scores.pdf") +# plt.show() diff --git a/examples/test_CRAR/run_simple_maze.py b/examples/test_CRAR/run_simple_maze.py index 1928cb08..88bde72b 100644 --- a/examples/test_CRAR/run_simple_maze.py +++ b/examples/test_CRAR/run_simple_maze.py @@ -136,28 +136,6 @@ class Defaults: agent.run(10, 100) #(5, 50) print("end gathering data") - #print "agent.DataSet.self._terminals" - #print "agent._dataset.terminals()" - #print agent._dataset.terminals() - #print agent._dataset._terminals._data[0:2000] - #print agent._dataset._actions._data[0:2000] -# r=agent._dataset._rewards._data[0:2000] -# print "r before" -# print r -# #print agent._dataset._observations[0]._data[0:10] -# ind=np.argwhere(r>0) -# print "agent._dataset._observations[0]._data[ind[0]]" -# print agent._dataset._observations[0]._data[ind[0]] -# print ind -# agent._dataset._rewards._data=np.delete(agent._dataset._rewards._data,ind) -# agent._dataset._terminals._data=np.delete(agent._dataset._terminals._data,ind) -# agent._dataset._actions._data=np.delete(agent._dataset._actions._data,ind) -# agent._dataset._observations[0]._data=np.delete(agent._dataset._observations[0]._data,ind,axis=0) -# r=agent._dataset._rewards._data[0:2000] -# print "r after" -# print r -# print "agent._dataset._observations[0]._data[ind[0]] after" -# print agent._dataset._observations[0]._data[ind[0]] # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 
# Plus, we also want to display after each training episode (!= than after every training) the average bellman diff --git a/examples/test_CRAR/simple_maze_env.py b/examples/test_CRAR/simple_maze_env.py index 040480f1..6d17c936 100644 --- a/examples/test_CRAR/simple_maze_env.py +++ b/examples/test_CRAR/simple_maze_env.py @@ -12,6 +12,10 @@ from mpl_toolkits.axes_grid1 import host_subplot import mpl_toolkits.axisartist as AA import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.cm as cm +from matplotlib.patches import Circle, Rectangle +from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker import copy class MyEnv(Environment): @@ -22,7 +26,6 @@ def __init__(self, rng, **kwargs): self._mode = -1 self._mode_score = 0.0 self._mode_episode_count = 0 - self._actions = [0,1,2,3] self._size_maze=8 self._higher_dim_obs=kwargs["higher_dim_obs"] self.create_map() @@ -59,16 +62,24 @@ def reset(self, mode): # Setting the starting position of the agent self._pos_agent=[self._size_maze//2,self._size_maze//2] - print "reset mode" - print mode - print "self._map" - print self._map - + print ("new map:") + print (self._map) + print ("reset mode") + print (mode) + return [1 * [self._size_maze * [self._size_maze * [0]]]] def act(self, action): - action = self._actions[action] + """Applies the agent action [action] on the environment. + + Parameters + ----------- + action : int + The action selected by the agent to operate on the environment. Should be an identifier + included between 0 included and nActions() excluded. + """ + self._cur_action=action if(action==0): if(self._map[self._pos_agent[0]-1,self._pos_agent[1]]==0): @@ -93,150 +104,119 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): """ Plot of the low-dimensional representation of the environment built by the model """ - for i in range(1): - all_possib_inp=[] - self.create_map() - for y_a in range(self._size_maze): - for x_a in range(self._size_maze): - state=copy.deepcopy(self._map) - state[self._size_maze//2,self._size_maze//2]=0 - if(state[x_a,y_a]==0): - if(self._higher_dim_obs==True): - all_possib_inp.append(self.get_higher_dim_obs([[x_a,y_a]],[self._pos_goal])) - else: - state[x_a,y_a]=0.5 - all_possib_inp.append(state) + all_possib_inp=[] # Will store all possible inputs (=observation) for the CRAR agent + self.create_map() + for y_a in range(self._size_maze): + for x_a in range(self._size_maze): + state=copy.deepcopy(self._map) + state[self._size_maze//2,self._size_maze//2]=0 + if(state[x_a,y_a]==0): + if(self._higher_dim_obs==True): + all_possib_inp.append(self.get_higher_dim_obs([[x_a,y_a]],[self._pos_goal])) + else: + state[x_a,y_a]=0.5 + all_possib_inp.append(state) + + + all_possib_inp=np.expand_dims(np.array(all_possib_inp,dtype='float'),axis=1) - - all_possib_inp=np.expand_dims(np.array(all_possib_inp,dtype='float'),axis=1) - print "all_possib_inp.shape" - print all_possib_inp.shape - print all_possib_inp.dtype - print all_possib_inp[0,0,:] - print "learning_algo.encoder.predict(all_possib_inp[0:1,0:1,:])" - print learning_algo.encoder.predict(all_possib_inp[0:1,0:1,:]) - all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) - if(all_possib_abs_states.ndim==4): - all_possib_abs_states=np.transpose(all_possib_abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' - print "learning_algo.encoder.predict(all_possib_inp)[0:2]" - print all_possib_abs_states[0:2] - - n=500 - historics=[] - for i,observ in 
enumerate(test_data_set.observations()[0][0:n]): - historics.append(np.expand_dims(observ,axis=0)) - historics=np.array(historics) - print "historics.shape" - print historics.shape - print historics.dtype - print historics[0,0,:] - print "learning_algo.encoder.predict(historics[0:1,0:1,:])" - print learning_algo.encoder.predict(historics[0:1,0:1,:]) - print learning_algo.encoder.predict(all_possib_inp[0:1,0:1,:]) - print "all_possib_inp[0:1,0:1,:]==historics[0:1,0:1,:]" - print all_possib_inp[0:1,0:1,:]==historics[0:1,0:1,:] - abs_states=learning_algo.encoder.predict(historics) - if(abs_states.ndim==4): - abs_states=np.transpose(abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' - print "abs_states[0:2]" - print abs_states[0:2] - print abs_states.shape - actions=test_data_set.actions()[0:n] - print "actions[0:10]" - print actions[0:10] - - print "test_data_set.rewards()[0:10]" - print test_data_set.rewards()[0:10] - print "test_data_set.terminals()[0:10]" - print test_data_set.terminals()[0:10] - if self.inTerminalState() == False: - self._mode_episode_count += 1 - print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) + if(all_possib_abs_states.ndim==4): + all_possib_abs_states=np.transpose(all_possib_abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + + n=500 + historics=[] + for i,observ in enumerate(test_data_set.observations()[0][0:n]): + historics.append(np.expand_dims(observ,axis=0)) + historics=np.array(historics) + + abs_states=learning_algo.encoder.predict(historics) + if(abs_states.ndim==4): + abs_states=np.transpose(abs_states, (0, 3, 1, 2)) # data_format='channels_last' --> 'channels_first' + + actions=test_data_set.actions()[0:n] + + if self.inTerminalState() == False: + self._mode_episode_count += 1 + print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / (self._mode_episode_count+0.0001), self._mode_episode_count)) + + + m = cm.ScalarMappable(cmap=cm.jet) + + x = np.array(abs_states)[:,0] + y = np.array(abs_states)[:,1] + if(self.intern_dim>2): + z = np.array(abs_states)[:,2] - - import matplotlib.pyplot as plt - from mpl_toolkits.mplot3d import Axes3D - import matplotlib.cm as cm - m = cm.ScalarMappable(cmap=cm.jet) - - x = np.array(abs_states)[:,0] - y = np.array(abs_states)[:,1] - if(self.intern_dim>2): - z = np.array(abs_states)[:,2] - - fig = plt.figure() + fig = plt.figure() + if(self.intern_dim==2): + ax = fig.add_subplot(111) + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') + else: + ax = fig.add_subplot(111,projection='3d') + ax.set_xlabel(r'$X_1$') + ax.set_ylabel(r'$X_2$') + ax.set_zlabel(r'$X_3$') + + # Plot the estimated transitions + for i in range(n-1): + predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) + predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0,0]])]) + predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) + predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) if(self.intern_dim==2): - ax = fig.add_subplot(111) - ax.set_xlabel(r'$X_1$') - ax.set_ylabel(r'$X_2$') + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), 
np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.15", alpha=0.75) else: - ax = fig.add_subplot(111,projection='3d') - ax.set_xlabel(r'$X_1$') - ax.set_ylabel(r'$X_2$') - ax.set_zlabel(r'$X_3$') - - # Plot the estimated transitions - for i in range(n-1): - predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0,0]])]) - predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0,0]])]) - predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1,0]])]) - predicted4=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,0,1]])]) - if(self.intern_dim==2): - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), color="0.9", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), color="0.65", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), color="0.4", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), color="0.15", alpha=0.75) - else: - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.65", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.4", alpha=0.75) - ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.15", alpha=0.75) - - # Plot the dots at each time step depending on the action taken - length_block=[[0,18],[18,19],[19,31]] - for i in range(3): - if(self.intern_dim==2): - line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) - else: - line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) - + ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.65", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.4", alpha=0.75) + ax.plot(np.concatenate([x[i:i+1],predicted4[0,:1]]), np.concatenate([y[i:i+1],predicted4[0,1:2]]), np.concatenate([z[i:i+1],predicted4[0,2:3]]), color="0.15", alpha=0.75) + + # Plot the dots at each time step depending on the action taken + length_block=[[0,18],[18,19],[19,31]] + for i in range(3): 
if(self.intern_dim==2): - axes_lims=[ax.get_xlim(),ax.get_ylim()] + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) else: - axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] - - # Plot the legend for the dots - from matplotlib.patches import Circle, Rectangle - from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - - # Plot the legend for transition estimates - box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) - box2b = DrawingArea(90, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) - el2b = Rectangle((25, 10), 15,2, fc="0.65", alpha=0.75) - el3b = Rectangle((45, 10), 15,2, fc="0.4", alpha=0.75) - el4b = Rectangle((65, 10), 15,2, fc="0.15", alpha=0.75) - box2b.add_artist(el1b) - box2b.add_artist(el2b) - box2b.add_artist(el3b) - box2b.add_artist(el4b) - - boxb = HPacker(children=[box1b, box2b], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=boxb, pad=0., - frameon=True, - bbox_to_anchor=(0., 0.98), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - - #plt.show() - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) + + if(self.intern_dim==2): + axes_lims=[ax.get_xlim(),ax.get_ylim()] + else: + axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] + + # Plot the legend for transition estimates + box1b = TextArea(" Estimated transitions (action 0, 1, 2 and 3): ", textprops=dict(color="k")) + box2b = DrawingArea(90, 20, 0, 0) + el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) + el2b = Rectangle((25, 10), 15,2, fc="0.65", alpha=0.75) + el3b = Rectangle((45, 10), 15,2, fc="0.4", alpha=0.75) + el4b = Rectangle((65, 10), 15,2, fc="0.15", alpha=0.75) + box2b.add_artist(el1b) + box2b.add_artist(el2b) + box2b.add_artist(el3b) + box2b.add_artist(el4b) + + boxb = HPacker(children=[box1b, box2b], + align="center", + pad=0, sep=5) + + anchored_box = AnchoredOffsetbox(loc=3, + child=boxb, pad=0., + frameon=True, + bbox_to_anchor=(0., 0.98), + bbox_transform=ax.transAxes, + borderpad=0., + ) + ax.add_artist(anchored_box) + + + #plt.show() + plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') # # Plot the Q_vals @@ -331,9 +311,9 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): # # and labels. # cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') # cb1.set_label('Estimated expected return') - - #plt.show() - plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') +# +# #plt.show() +# plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') matplotlib.pyplot.close("all") # avoids memory leaks @@ -347,7 +327,7 @@ def observationType(self, subject): return np.float def nActions(self): - return len(self._actions) + return 4 def observe(self): obs=copy.deepcopy(self._map) @@ -361,6 +341,8 @@ def observe(self): return [obs] def get_higher_dim_obs(self,indices_agent,indices_reward): + """ Obtain the high-dimensional observation from indices of the agent position and the indices of the reward positions. 
+ """ obs=copy.deepcopy(self._map) obs=obs/1. obs=np.repeat(np.repeat(obs, 6, axis=0),6, axis=1) @@ -387,9 +369,7 @@ def get_higher_dim_obs(self,indices_agent,indices_reward): for i in indices_reward: obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs - print indices_agent for i in indices_agent: - print i obs[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs #plt.imshow(obs, cmap='gray_r') diff --git a/setup.py b/setup.py index 151e9b0c..0287190c 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,9 @@ import deer NAME = 'deer' -VERSION = '0.3.2' +VERSION = '0.4' AUTHOR = "Vincent Francois-Lavet" -AUTHOR_EMAIL = "v.francois@ulg.ac.be" +AUTHOR_EMAIL = "vincent.francois@gmail.com" URL = 'https://github.com/VinF/deer' DESCRIPTION = 'Framework for deep reinforcement learning' with open('README.rst') as f: @@ -20,8 +20,8 @@ 'Operating System :: OS Independent', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Scientific/Engineering', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries', From cbdeb624aadcbe30fc69ce7d27510e25e35241b5 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 1 Aug 2018 17:21:16 -0400 Subject: [PATCH 74/96] cleaning and some doc improvement --- deer/learning_algos/CRAR_keras.py | 37 +++++++++++++--------------- deer/learning_algos/NN_CRAR_keras.py | 8 ------ deer/learning_algos/q_net_keras.py | 6 ++++- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 857eb39e..7a47c809 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -79,11 +79,9 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no self._internal_dim = kwargs.get('internal_dim',2) self.loss_interpret=0 self.loss_T=0 - self.loss_T2=0 - self.loss_disentangle_t=0 - self.loss_disentangle_a=0 self.lossR=0 self.loss_Q=0 + self.loss_disentangle_t=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 self.loss_gamma=0 @@ -107,7 +105,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no self.full_gamma = self.learn_and_plan.full_R_model(self.encoder,self.gamma) # used to fit transitions - self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition)#full_transition_model(self.encoder,self.transition) + self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition) # used to force features variations if(self._high_int_dim==False): @@ -219,8 +217,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals print (R[0]) # Fit transition - l=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) - self.loss_T+=l + self.loss_T+=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) # Interpretable AI if(self._high_int_dim==False): @@ -261,17 +258,11 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1))) -# -# # Loss to have all s' following s,a with a to a distance 1 of s,a) -# tiled_x=np.tile(Es,(self._n_actions,1)) -# 
tiled_onehot_actions=np.tile(onehot_actions,(self._n_actions,1)) -# tiled_onehot_actions2=np.repeat(np.diag(np.ones(self._n_actions)),self._batch_size,axis=0) - if(self.update_counter%500==0): - print ("self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100.") - print (self.loss_T/100.,self.lossR/100.,self.loss_gamma/100.,self.loss_Q/100.,self.loss_disentangle_t/100.,self.loss_disentangle_a/100.,self.loss_disambiguate1/100.,self.loss_disambiguate2/100.) + print ("self.loss_T/100., self.lossR/100., self.loss_gamma/100., self.loss_Q/100., self.loss_disentangle_t/100., self.loss_disambiguate1/100., self.loss_disambiguate2/100.") + print (self.loss_T/100., self.lossR/100.,self.loss_gamma/100., self.loss_Q/100., self.loss_disentangle_t/100., self.loss_disambiguate1/100., self.loss_disambiguate2/100.) if(self._high_int_dim==False): print ("self.loss_interpret/100.") @@ -284,8 +275,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_interpret=0 self.loss_disentangle_t=0 - self.loss_disentangle_a=0 - self.loss_disambiguate1=0 self.loss_disambiguate2=0 @@ -343,9 +332,8 @@ def qValues(self, state_val): ------- The q values for the provided belief state """ - copy_state=copy.deepcopy(state_val) #Required because of the "hack" below + copy_state=copy.deepcopy(state_val) #Required! - #return self.full_Q.predict([np.expand_dims(state,axis=0) for state in state_val]+[np.zeros((self._batch_size,self.learn_and_plan.internal_dim))])[0] return self.full_Q.predict([np.expand_dims(state,axis=0) for state in copy_state])[0] def qValues_planning(self, state_val, R, gamma, T, Q, d=5): @@ -353,8 +341,15 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): Arguments --------- - state_val : one pseudo state - d : planning depth + state_val : array of objects (or list of objects) + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 
+ R : R_model + gamma : discount_model + T : transition_model + Q : Q_model + d : int + planning depth Returns ------- @@ -537,6 +532,8 @@ def _compile(self): loss=exp_dec_error) def _resetQHat(self): + """ Set the target Q-network weights equal to the main Q-network weights + """ for i,(param,next_param) in enumerate(zip(self.params, self.params_target)): K.set_value(next_param,K.get_value(param)) diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index f033b09e..cfc248bf 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -125,14 +125,6 @@ def encoder_model(self): out=input outs_conv.append(out) - - if (self._high_int_dim==True): - if ( isinstance(self._n_actions,int)): - print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") - else: - input = Input(shape=(len(self._n_actions),)) - inputs.append(input) - outs_conv.append(input) if(self._high_int_dim==False): if len(outs_conv)>1: diff --git a/deer/learning_algos/q_net_keras.py b/deer/learning_algos/q_net_keras.py index 0f08c8cf..f376546e 100644 --- a/deer/learning_algos/q_net_keras.py +++ b/deer/learning_algos/q_net_keras.py @@ -180,8 +180,9 @@ def chooseBestAction(self, state, *args, **kwargs): return np.argmax(q_vals),np.max(q_vals) def _compile(self): - """ compile self.q_vals + """ Compile self.q_vals """ + if (self._update_rule=="sgd"): optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) elif (self._update_rule=="rmsprop"): @@ -192,6 +193,9 @@ def _compile(self): self.q_vals.compile(optimizer=optimizer, loss='mse') def _resetQHat(self): + """ Set the target Q-network weights equal to the main Q-network weights + """ + for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): K.set_value(next_param,K.get_value(param)) From dbf43d91f8dcda3734e85a0946476966d34ca159 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 1 Aug 2018 17:26:09 -0400 Subject: [PATCH 75/96] adding maze --- examples/maze/run_test4.py | 379 +++++++++++++++++++++++++++++++++++++ examples/maze/test_env4.py | 303 +++++++++++++++++++++++++++++ 2 files changed, 682 insertions(+) create mode 100644 examples/maze/run_test4.py create mode 100644 examples/maze/test_env4.py diff --git a/examples/maze/run_test4.py b/examples/maze/run_test4.py new file mode 100644 index 00000000..d07ff764 --- /dev/null +++ b/examples/maze/run_test4.py @@ -0,0 +1,379 @@ +"""Maze launcher + +Author: Vincent Francois-Lavet +""" + +import sys +import logging +import numpy as np +from joblib import hash, dump +import os + +from deer.default_parser import process_args +from deer.agent import NeuralAgent +from deer.q_networks.q_net_keras_lp import MyQNetwork +from test_env4 import MyEnv as test_env +import deer.experiment.base_controllers as bc + +from deer.policies import EpsilonGreedyPolicy + + +class Defaults: + # ---------------------- + # Experiment Parameters + # ---------------------- + STEPS_PER_EPOCH = 2000 + EPOCHS = 250 + STEPS_PER_TEST = 200 + PERIOD_BTW_SUMMARY_PERFS = 1 + + # ---------------------- + # Environment Parameters + # ---------------------- + FRAME_SKIP = 2 + + # ---------------------- + # DQN Agent parameters: + # ---------------------- + UPDATE_RULE = 'rmsprop' + LEARNING_RATE = 0.0005 + LEARNING_RATE_DECAY = 0.995 + DISCOUNT = 0.9 + DISCOUNT_INC = 1 + DISCOUNT_MAX = 0.99 + RMS_DECAY = 0.9 + RMS_EPSILON = 0.0001 + MOMENTUM = 0 + CLIP_DELTA = 1.0 + EPSILON_START = 1.0 + EPSILON_MIN = 1.0 
+ EPSILON_DECAY = 10000 + UPDATE_FREQUENCY = 1 + REPLAY_MEMORY_SIZE = 1000000 + BATCH_SIZE = 32 + FREEZE_INTERVAL = 1000 + DETERMINISTIC = False + +HIGHER_DIM_OBS = True +HIGH_INT_DIM = True +N_SAMPLES=200000 +samples_transfer=100 + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + # --- Parse parameters --- + parameters = process_args(sys.argv[1:], Defaults) + if parameters.deterministic: + rng = np.random.RandomState(123456) + else: + rng = np.random.RandomState() + + # --- Instantiate environment --- + env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS) + + # --- Instantiate qnetwork --- + qnetwork = MyQNetwork( + env, + parameters.rms_decay, + parameters.rms_epsilon, + parameters.momentum, + parameters.clip_delta, + parameters.freeze_interval, + parameters.batch_size, + parameters.update_rule, + rng, + double_Q=True, + high_int_dim=HIGH_INT_DIM, + internal_dim=3) + + train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1) + + # --- Instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + train_policy=train_policy, + test_policy=test_policy) + + # --- Create unique filename for FindBestController --- + h = hash(vars(parameters), hash_name="sha1") + fname = "test_" + h + print("The parameters hash is: {}".format(h)) + print("The parameters are: {}".format(parameters)) + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. + agent.attach(bc.LearningRateController( + initial_learning_rate=parameters.learning_rate, + learning_rate_decay=parameters.learning_rate_decay, + periodicity=1)) + + # Same for the discount factor. + agent.attach(bc.DiscountFactorController( + initial_discount_factor=parameters.discount, + discount_factor_growth=parameters.discount_inc, + discount_factor_max=parameters.discount_max, + periodicity=1)) + + # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy + # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more + # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every + # episode or epoch (or never, hence the resetEvery='none'). 
+ agent.attach(bc.EpsilonController( + initial_e=parameters.epsilon_start, + e_decays=parameters.epsilon_decay, + e_min=parameters.epsilon_min, + evaluate_on='action', + periodicity=1, + reset_every='none')) + + agent.run(1, N_SAMPLES) + print agent._dataset._rewards._data[0:500] + print agent._dataset._terminals._data[0:500] + print("end gathering data") + old_rewards=agent._dataset._rewards._data + old_terminals=agent._dataset._terminals._data + old_actions=agent._dataset._actions._data + old_observations=agent._dataset._observations[0]._data + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one + # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the + # "true generalization score", or "test score". + # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is + # important that the validationID is the same than the id argument of the InterleavedTestEpochController. + # The FindBestController will dump on disk the validation scores for each and every network, as well as the + # structure of the neural network having the best validation score. These dumps can then used to plot the evolution + # of the validation and test scores (see below) or simply recover the resulting neural network for your + # application. + #agent.attach(bc.FindBestController( + # validationID=test_env.VALIDATION_MODE, + # testID=None, + # unique_fname=fname)) + + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a + # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want + # these validation epoch to interfere with the training of the agent, which is well established by the + # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole + # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the + # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards + # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every + # [parameters.period_btw_summary_perfs] *validation* epochs. 
+ agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + +# agent.attach(bc.InterleavedTestEpochController( +# id=test_env.VALIDATION_MODE+1, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], +# periodicity=2, +# show_score=True, +# summarize_every=1)) +# +# agent.attach(bc.InterleavedTestEpochController( +# id=test_env.VALIDATION_MODE+2, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], +# periodicity=2, +# show_score=True, +# summarize_every=1)) +# +# agent.attach(bc.InterleavedTestEpochController( +# id=test_env.VALIDATION_MODE+3, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], +# periodicity=2, +# show_score=True, +# summarize_every=1)) + + # --- Run the experiment --- + try: + os.mkdir("params") + except Exception: + pass + dump(vars(parameters), "params/" + fname + ".jldump") + agent.gathering_data=False + agent.run(parameters.epochs, parameters.steps_per_epoch) + + + ### + # TRANSFER + ### + optimized_params=qnetwork.getAllParams() + print "optimized_params" + print optimized_params + + # --- Instantiate qnetwork --- +# qnetwork = MyQNetwork( +# env, +# parameters.rms_decay, +# parameters.rms_epsilon, +# parameters.momentum, +# parameters.clip_delta, +# parameters.freeze_interval, +# parameters.batch_size, +# parameters.update_rule, +# rng, +# double_Q=True, +# high_int_dim=HIGH_INT_DIM, +# internal_dim=3) +# qnetwork.setAllParams(optimized_params) + + rand_ind=np.random.random_integers(0,20000,samples_transfer) + original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] + transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] + + print "original[0][0:10], transfer[0][0:10]" + print original[0][0:10], transfer[0][0:10] + + # Transfer between the two repr + #qnetwork.transfer(original, transfer, 5000000/samples_transfer) + + + # --- Re instantiate environment with reverse=True --- + env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) + + # --- Re instantiate agent --- + agent = NeuralAgent( + env, + qnetwork, + parameters.replay_memory_size, + max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), + parameters.batch_size, + rng, + test_policy=test_policy) + + # --- Bind controllers to the agent --- + # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and + # learning rate as well as the training epoch number. + agent.attach(bc.VerboseController( + evaluate_on='epoch', + periodicity=1)) + + # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we + # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. + agent.attach(bc.LearningRateController( + initial_learning_rate=parameters.learning_rate, + learning_rate_decay=parameters.learning_rate_decay, + periodicity=1)) + + # Same for the discount factor. 
+ agent.attach(bc.DiscountFactorController( + initial_discount_factor=parameters.discount, + discount_factor_growth=parameters.discount_inc, + discount_factor_max=parameters.discount_max, + periodicity=1)) + + # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy + # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more + # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every + # episode or epoch (or never, hence the resetEvery='none'). + agent.attach(bc.EpsilonController( + initial_e=parameters.epsilon_start, + e_decays=parameters.epsilon_decay, + e_min=parameters.epsilon_min, + evaluate_on='action', + periodicity=1, + reset_every='none')) + + agent.run(1, N_SAMPLES) + print agent._dataset._rewards._data[0:500] + print agent._dataset._terminals._data[0:500] + print("end gathering data") + # Setting the dataset to be the same than the old one (but modif for the observations) + agent._dataset._rewards._data=old_rewards + agent._dataset._terminals._data=old_terminals + agent._dataset._actions._data=old_actions + agent._dataset._observations[0]._data=-old_observations + + # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. + # Plus, we also want to display after each training episode (!= than after every training) the average bellman + # residual and the average of the V values obtained during the last episode, hence the two last arguments. + agent.attach(bc.TrainerController( + evaluate_on='action', + periodicity=parameters.update_frequency, + show_episode_avg_V_value=True, + show_avg_Bellman_residual=True)) + + # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a + # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want + # these validation epoch to interfere with the training of the agent, which is well established by the + # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole + # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the + # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards + # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every + # [parameters.period_btw_summary_perfs] *validation* epochs. 
+ agent.attach(bc.InterleavedTestEpochController( + id=test_env.VALIDATION_MODE, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4], + periodicity=2, + show_score=True, + summarize_every=1)) + + +# agent.attach(bc.InterleavedTestEpochController( +# id=test_env.VALIDATION_MODE+1, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], +# periodicity=2, +# show_score=True, +# summarize_every=1)) +# +# agent.attach(bc.InterleavedTestEpochController( +# id=test_env.VALIDATION_MODE+2, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], +# periodicity=2, +# show_score=True, +# summarize_every=1)) +# +# agent.attach(bc.InterleavedTestEpochController( +# id=test_env.VALIDATION_MODE+3, +# epoch_length=parameters.steps_per_test, +# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], +# periodicity=2, +# show_score=True, +# summarize_every=1)) + + agent.gathering_data=False + agent.run(parameters.epochs, parameters.steps_per_epoch) + + + + # --- Show results --- + basename = "scores/" + fname + scores = joblib.load(basename + "_scores.jldump") + plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') + plt.legend() + plt.xlabel("Number of epochs") + plt.ylabel("Score") + plt.savefig(basename + "_scores.pdf") + plt.show() diff --git a/examples/maze/test_env4.py b/examples/maze/test_env4.py new file mode 100644 index 00000000..477715a4 --- /dev/null +++ b/examples/maze/test_env4.py @@ -0,0 +1,303 @@ +""" Interface with the test environment + +Author: Vincent Francois-Lavet + +def encoder_model(self): + +def transition_model(self): + x = Dense(10, activation='tanh')(x) #5,15 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(30, activation='tanh')(x) # ,30 + x = Dense(10, activation='tanh')(x) # ,30 + +""" +import numpy as np +import cv2 + +from deer.base_classes import Environment + +#import matplotlib +#matplotlib.use('qt5agg') +#from mpl_toolkits.axes_grid1 import host_subplot +#import mpl_toolkits.axisartist as AA +#import matplotlib.pyplot as plt +import copy +import a_star_path_finding as pf + +class MyEnv(Environment): + VALIDATION_MODE = 0 + + def __init__(self, rng, **kwargs): + + self._random_state = rng + self._mode = -1 + self._mode_score = 0.0 + self._mode_episode_count = 0 + self._episode_steps = 0 + self._actions = [0,1,2,3] + self._size_maze = 8 + self._higher_dim_obs=kwargs.get('higher_dim_obs',False) + self._reverse=kwargs.get('reverse',False) + + self._n_walls = int((self._size_maze-2)**2/3.)#int((self._size_maze)**2/3.) 
+ self._n_rewards = 3 + self.create_map() + self.intern_dim=3 + + def create_map(self): + valid_map=False + while valid_map==False: + # Agent + self._pos_agent=[1,1] + + # Walls + self._pos_walls=[] + for i in range(self._size_maze): + self._pos_walls.append([i,0]) + self._pos_walls.append([i,self._size_maze-1]) + for j in range(self._size_maze-2): + self._pos_walls.append([0,j+1]) + self._pos_walls.append([self._size_maze-1,j+1]) + + n=0 + while n < self._n_walls: + potential_wall=[self._random_state.randint(1,self._size_maze-2),self._random_state.randint(1,self._size_maze-2)] + if(potential_wall not in self._pos_walls and potential_wall!=self._pos_agent): + self._pos_walls.append(potential_wall) + n+=1 + + # Rewards + #self._pos_rewards=[[self._size_maze-2,self._size_maze-2]] + self._pos_rewards=[] + n=0 + while n < self._n_rewards: + potential_reward=[self._random_state.randint(1,self._size_maze-1),self._random_state.randint(1,self._size_maze-1)] + if(potential_reward not in self._pos_rewards and potential_reward not in self._pos_walls and potential_reward!=self._pos_agent): + self._pos_rewards.append(potential_reward) + n+=1 + + valid_map=self.is_valid_map(self._pos_agent,self._pos_walls,self._pos_rewards) + + + def is_valid_map(self,pos_agent,pos_walls,pos_rewards): + a = pf.AStar() + pos_walls + walls = [tuple(w) for w in pos_walls] + start=tuple(pos_agent) + for r in pos_rewards: + end=tuple(r) + a.init_grid(self._size_maze, self._size_maze, walls, start, end) + maze=a + optimal_path=maze.solve() + if(optimal_path==None): + return False + + return True + + def reset(self, mode): + self._episode_steps = 0 + self._mode=mode + self.create_map() + + if mode == MyEnv.VALIDATION_MODE: + if self._mode != MyEnv.VALIDATION_MODE: + self._mode = MyEnv.VALIDATION_MODE + self._mode_score = 0.0 + self._mode_episode_count = 0 + + else: + self._mode_episode_count += 1 + + print "reset mode" + print mode + #print "self._pos_agent,self._pos_walls,self._pos_rewards" + #print self._pos_agent,self._pos_walls,self._pos_rewards + + return [1 * [self._size_maze * [self._size_maze * [0]]]] + + + def act(self, action): + self._episode_steps += 1 + action = self._actions[action] + + self.reward = -0.1 + + if(action==0): + if([self._pos_agent[0]+1,self._pos_agent[1]] not in self._pos_walls): + self._pos_agent[0]=self._pos_agent[0]+1 + #else: + # self.reward=-0.1 + elif(action==1): + if([self._pos_agent[0],self._pos_agent[1]+1] not in self._pos_walls): + self._pos_agent[1]=self._pos_agent[1]+1 + #else: + # self.reward=-0.1 + elif(action==2): + if([self._pos_agent[0]-1,self._pos_agent[1]] not in self._pos_walls): + self._pos_agent[0]=self._pos_agent[0]-1 + #else: + # self.reward=-0.1 + elif(action==3): + if([self._pos_agent[0],self._pos_agent[1]-1] not in self._pos_walls): + self._pos_agent[1]=self._pos_agent[1]-1 + #else: + # self.reward=-0.1 + + if (self._pos_agent in self._pos_rewards): + self.reward = 1 + self._pos_rewards.remove(self._pos_agent) + + self._mode_score += self.reward + return self.reward + + + def summarizePerformance(self, test_data_set, learning_algo): + print "test_data_set.observations.shape" + print test_data_set.observations()[0][0:1] + + print "self._mode_score" + print self._mode_score + + + def inputDimensions(self): + if(self._higher_dim_obs==True): + return [(1,self._size_maze*6,self._size_maze*6)] + else: + return [(1,self._size_maze,self._size_maze)] + + def observationType(self, subject): + return np.float32 + + def nActions(self): + return len(self._actions) + + def 
observe(self): + self._map=np.zeros((self._size_maze,self._size_maze)) + for coord_wall in self._pos_walls: + self._map[coord_wall[0],coord_wall[1]]=1 + for coord_reward in self._pos_rewards: + self._map[coord_reward[0],coord_reward[1]]=2 + self._map[self._pos_agent[0],self._pos_agent[1]]=0.5 + + if(self._higher_dim_obs==True): + indices_reward=np.argwhere(self._map == 2) + indices_agent=np.argwhere(self._map == 0.5) + self._map=self._map/1. + self._map=np.repeat(np.repeat(self._map, 6, axis=0),6, axis=1) + # agent repr + agent_obs=np.zeros((6,6)) + agent_obs[0,2]=0.8 + agent_obs[1,0:5]=0.9 + agent_obs[2,1:4]=0.9 + agent_obs[3,1:4]=0.9 + agent_obs[4,1]=0.9 + agent_obs[4,3]=0.9 + agent_obs[5,0:2]=0.9 + agent_obs[5,3:5]=0.9 + + # reward repr + reward_obs=np.zeros((6,6)) + reward_obs[:,1]=0.7 + reward_obs[0,1:4]=0.6 + reward_obs[1,3]=0.7 + reward_obs[2,1:4]=0.6 + reward_obs[4,2]=0.7 + reward_obs[5,2:4]=0.7 + + for i in indices_reward: + #print self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6] + self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs + + for i in indices_agent: + self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs + self._map=(self._map*2)-1 #scaling + #print "self._map higher_dim_obs" + #print self._map + #plt.imshow(self._map, cmap='gray_r') + #plt.show() + else: + self._map=self._map/2. + self._map[self._map == 0.5] = 0.99 # agent + self._map[self._map == 1.] = 0.5 # reward + + if(self._reverse==True): + self._map=-self._map #1-self._map + + #print "self._map" + #print self._map + return [self._map] + + def inTerminalState(self): + if ( self._pos_rewards==[] or (self._mode>=0 and self._episode_steps >= 50) ): + return True + else: + return False + + + +if __name__ == "__main__": + import hashlib + + rng = np.random.RandomState(123456) + env = MyEnv(rng, higher_dim_obs=False) + + maps=[] + for i in range(10000): + env.create_map() + + one_laby=env.observe()[0] + + # Hashing the labyrinths to be able to find duplicates in O(1) + #print str(one_laby) + #second_laby=copy.deepcopy(one_laby) + one_laby=int(hashlib.sha1(str(one_laby)).hexdigest(), 16) % (10 ** 8) + #print one_laby + #print int(hashlib.sha1(str(second_laby)).hexdigest(), 16) % (10 ** 8) + + # TESTING ADDING DUPLICATION + #if i%1000==0: + # maps.append(one_laby) + if i%1000==0: + env.reset(0) + if i%1000==500: + env.reset(1) + + maps.append(copy.deepcopy(one_laby)) + + duplicate_laby=0 + for i in range(10000): + env.create_map() + one_laby=env.observe()[0] + + # Hashing the labyrinths to be able to find duplicates in O(1) + #print str(one_laby) + #second_laby=copy.deepcopy(one_laby) + one_laby=int(hashlib.sha1(str(one_laby)).hexdigest(), 16) % (10 ** 8) + #print one_laby + #print int(hashlib.sha1(str(second_laby)).hexdigest(), 16) % (10 ** 8) + + # TESTING ADDING DUPLICATION + #if i%1000==0: + # maps.append(one_laby) + # TESTING WITH RESETS + if i%1000==0: + env.reset(0) + if i%1000==500: + env.reset(1) + + #print maps,one_laby + #print maps.count(one_laby) + duplicate=min(maps.count(one_laby),1) + #duplicate=0 + #for a in maps: + # if(a==one_laby): + # duplicate=1 + # break + duplicate_laby+=duplicate + + if i%1000==0: + print "duplicate_laby" + print duplicate_laby + + + From e5629451bf6479dfc1e404af6e38c1944e86d159 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 1 Aug 2018 17:28:04 -0400 Subject: [PATCH 76/96] cleaning maze + compatibility python 3 --- examples/maze/run_test4.py | 44 ++++++++++++++++---------------- examples/maze/test_env4.py | 52 ++++++-------------------------------- 2 files 
changed, 30 insertions(+), 66 deletions(-) diff --git a/examples/maze/run_test4.py b/examples/maze/run_test4.py index d07ff764..2cc364b1 100644 --- a/examples/maze/run_test4.py +++ b/examples/maze/run_test4.py @@ -11,7 +11,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork +from deer.learning_algos.CRAR_keras import CRAR from test_env4 import MyEnv as test_env import deer.experiment.base_controllers as bc @@ -44,7 +44,7 @@ class Defaults: RMS_DECAY = 0.9 RMS_EPSILON = 0.0001 MOMENTUM = 0 - CLIP_DELTA = 1.0 + CLIP_NORM = 1.0 EPSILON_START = 1.0 EPSILON_MIN = 1.0 EPSILON_DECAY = 10000 @@ -73,13 +73,13 @@ class Defaults: # --- Instantiate environment --- env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS) - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( + # --- Instantiate learning_algo --- + learning_algo = CRAR( env, parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, - parameters.clip_delta, + parameters.clip_norm, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, @@ -88,13 +88,13 @@ class Defaults: high_int_dim=HIGH_INT_DIM, internal_dim=3) - train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1) + train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) + test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1) # --- Instantiate agent --- agent = NeuralAgent( env, - qnetwork, + learning_algo, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, @@ -142,8 +142,8 @@ class Defaults: reset_every='none')) agent.run(1, N_SAMPLES) - print agent._dataset._rewards._data[0:500] - print agent._dataset._terminals._data[0:500] + #print (agent._dataset._rewards._data[0:500]) + #print (agent._dataset._terminals._data[0:500]) print("end gathering data") old_rewards=agent._dataset._rewards._data old_terminals=agent._dataset._terminals._data @@ -226,12 +226,12 @@ class Defaults: ### # TRANSFER ### - optimized_params=qnetwork.getAllParams() - print "optimized_params" - print optimized_params + optimized_params=learning_algo.getAllParams() + print ("optimized_params") + print (optimized_params) - # --- Instantiate qnetwork --- -# qnetwork = MyQNetwork( + # --- Instantiate learning_algo --- +# learning_algo = CRAR( # env, # parameters.rms_decay, # parameters.rms_epsilon, @@ -244,17 +244,17 @@ class Defaults: # double_Q=True, # high_int_dim=HIGH_INT_DIM, # internal_dim=3) -# qnetwork.setAllParams(optimized_params) +# learning_algo.setAllParams(optimized_params) rand_ind=np.random.random_integers(0,20000,samples_transfer) original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - print "original[0][0:10], transfer[0][0:10]" - print original[0][0:10], transfer[0][0:10] + print ("original[0][0:10], transfer[0][0:10]") + print (original[0][0:10], transfer[0][0:10]) # Transfer between the two repr - #qnetwork.transfer(original, transfer, 5000000/samples_transfer) + #learning_algo.transfer(original, transfer, 5000000/samples_transfer) # --- Re instantiate environment with reverse=True --- @@ -263,7 +263,7 @@ class Defaults: # --- Re instantiate agent 
--- agent = NeuralAgent( env, - qnetwork, + learning_algo, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, @@ -304,8 +304,8 @@ class Defaults: reset_every='none')) agent.run(1, N_SAMPLES) - print agent._dataset._rewards._data[0:500] - print agent._dataset._terminals._data[0:500] + #print (agent._dataset._rewards._data[0:500]) + #print (agent._dataset._terminals._data[0:500]) print("end gathering data") # Setting the dataset to be the same than the old one (but modif for the observations) agent._dataset._rewards._data=old_rewards diff --git a/examples/maze/test_env4.py b/examples/maze/test_env4.py index 477715a4..a36dd4d4 100644 --- a/examples/maze/test_env4.py +++ b/examples/maze/test_env4.py @@ -1,15 +1,6 @@ """ Interface with the test environment Author: Vincent Francois-Lavet - -def encoder_model(self): - -def transition_model(self): - x = Dense(10, activation='tanh')(x) #5,15 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(10, activation='tanh')(x) # ,30 - """ import numpy as np import cv2 @@ -108,8 +99,7 @@ def reset(self, mode): else: self._mode_episode_count += 1 - print "reset mode" - print mode + #print ("reset mode:"+str(mode)+".") #print "self._pos_agent,self._pos_walls,self._pos_rewards" #print self._pos_agent,self._pos_walls,self._pos_rewards @@ -125,23 +115,15 @@ def act(self, action): if(action==0): if([self._pos_agent[0]+1,self._pos_agent[1]] not in self._pos_walls): self._pos_agent[0]=self._pos_agent[0]+1 - #else: - # self.reward=-0.1 elif(action==1): if([self._pos_agent[0],self._pos_agent[1]+1] not in self._pos_walls): self._pos_agent[1]=self._pos_agent[1]+1 - #else: - # self.reward=-0.1 elif(action==2): if([self._pos_agent[0]-1,self._pos_agent[1]] not in self._pos_walls): self._pos_agent[0]=self._pos_agent[0]-1 - #else: - # self.reward=-0.1 elif(action==3): if([self._pos_agent[0],self._pos_agent[1]-1] not in self._pos_walls): self._pos_agent[1]=self._pos_agent[1]-1 - #else: - # self.reward=-0.1 if (self._pos_agent in self._pos_rewards): self.reward = 1 @@ -152,11 +134,10 @@ def act(self, action): def summarizePerformance(self, test_data_set, learning_algo): - print "test_data_set.observations.shape" - print test_data_set.observations()[0][0:1] + print ("test_data_set.observations.shape") + print (test_data_set.observations()[0][0:1]) - print "self._mode_score" - print self._mode_score + print ("self._mode_score:"+str(self._mode_score)+".") def inputDimensions(self): @@ -248,15 +229,9 @@ def inTerminalState(self): one_laby=env.observe()[0] # Hashing the labyrinths to be able to find duplicates in O(1) - #print str(one_laby) - #second_laby=copy.deepcopy(one_laby) - one_laby=int(hashlib.sha1(str(one_laby)).hexdigest(), 16) % (10 ** 8) - #print one_laby - #print int(hashlib.sha1(str(second_laby)).hexdigest(), 16) % (10 ** 8) + one_laby=int(hashlib.sha1(str(one_laby).encode('utf-8')).hexdigest(), 16) % (10 ** 8) # TESTING ADDING DUPLICATION - #if i%1000==0: - # maps.append(one_laby) if i%1000==0: env.reset(0) if i%1000==500: @@ -270,34 +245,23 @@ def inTerminalState(self): one_laby=env.observe()[0] # Hashing the labyrinths to be able to find duplicates in O(1) - #print str(one_laby) - #second_laby=copy.deepcopy(one_laby) - one_laby=int(hashlib.sha1(str(one_laby)).hexdigest(), 16) % (10 ** 8) - #print one_laby - #print int(hashlib.sha1(str(second_laby)).hexdigest(), 16) % (10 ** 8) + 
one_laby=int(hashlib.sha1(str(one_laby).encode('utf-8')).hexdigest(), 16) % (10 ** 8) # TESTING ADDING DUPLICATION #if i%1000==0: # maps.append(one_laby) + # TESTING WITH RESETS if i%1000==0: env.reset(0) if i%1000==500: env.reset(1) - #print maps,one_laby - #print maps.count(one_laby) duplicate=min(maps.count(one_laby),1) - #duplicate=0 - #for a in maps: - # if(a==one_laby): - # duplicate=1 - # break duplicate_laby+=duplicate if i%1000==0: - print "duplicate_laby" - print duplicate_laby + print ("Number of duplicate labyrinths:"+str(duplicate_laby)+".") From 4768d774257350209010e71102ab49a6f499ec90 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Wed, 1 Aug 2018 17:51:27 -0400 Subject: [PATCH 77/96] additional cleaning --- deer/learning_algos/CRAR_keras.py | 12 ++-- deer/learning_algos/NN_CRAR_keras.py | 96 ---------------------------- 2 files changed, 7 insertions(+), 101 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 7a47c809..84731a42 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -1,5 +1,5 @@ """ -Code for the CRAR agent using Keras +Code for the CRAR learning algorithm using Keras """ @@ -322,15 +322,17 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals def qValues(self, state_val): - """ Get the q values for one belief state (without planning) + """ Get the q values for one pseudo state (without planning) Arguments --------- - state_val : one pseudo state + state_val : array of objects (or list of objects) + Each object is a numpy array that relates to one of the observations + with size [1 * history size * size of punctual observation (which is 2D,1D or scalar)]). Returns ------- - The q values for the provided belief state + The q values for the provided pseudo state """ copy_state=copy.deepcopy(state_val) #Required! @@ -343,7 +345,7 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): --------- state_val : array of objects (or list of objects) Each object is a numpy array that relates to one of the observations - with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + with size [1 * history size * size of punctual observation (which is 2D,1D or scalar)]). 
R : R_model gamma : discount_model T : transition_model diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index cfc248bf..dc8695a0 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -322,102 +322,6 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): return model - -# def diff_s_s_(self,encoder_model): -# """ -# Used to force some state representation to be sufficiently different -# -# Parameters -# ----------- -# s -# a -# random z -# -# Returns -# ------- -# model with output Tx (= model estimate of x') -# -# """ -# inputs=[] -# -# for j in range(2): -# for i, dim in enumerate(self._input_dimensions): -# if len(dim) == 3: -# input = Input(shape=(dim[-3],dim[-2],dim[-1])) -# inputs.append(input) -# -# elif len(dim) == 2: -# input = Input(shape=(dim[-3],dim[-2])) -# inputs.append(input) -# -# else: -# input = Input(shape=(dim[-3],)) -# inputs.append(input) -# -# half = len(inputs)//2 -# enc_x = encoder_model(inputs[:half]) #s --> x #FIXME -# enc_x_ = encoder_model(inputs[half:]) #s --> x -# -# if (self._high_int_dim==True): -# enc_x=Flatten()(enc_x) -# enc_x_=Flatten()(enc_x_) -# x = Subtract()([enc_x,enc_x_]) -# -# #x = Dot(axes=-1, normalize=False)([x,x]) -# -# model = Model(inputs=inputs, outputs=x ) -# -# return model - - def diff_sa_sa(self,encoder_model,transition_model): - """ - - Parameters - ----------- - s - a - rand_a - - Returns - ------- - model with output Tx (= model estimate of x') - - """ - inputs=[] - - for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3 or len(dim) == 4: - input = Input(shape=(dim[-3],dim[-2],dim[-1])) - inputs.append(input) - - elif len(dim) == 2: - input = Input(shape=(dim[-3],dim[-2])) - inputs.append(input) - - else: - input = Input(shape=(dim[-3],)) - inputs.append(input) - - input = Input(shape=(self._n_actions,)) - inputs.append(input) - input = Input(shape=(self._n_actions,)) - inputs.append(input) - - enc_x = encoder_model(inputs[:-2]) #s --> x - Tx= transition_model([enc_x,inputs[-2]]) - rand_Tx= transition_model([enc_x,inputs[-1]]) - - if (self._high_int_dim==True): - Tx=Flatten()(Tx) - rand_Tx=Flatten()(rand_Tx) - x = Subtract()([Tx,rand_Tx]) - else: - x = Subtract()([Tx,rand_Tx]) - - model = Model(inputs=inputs, outputs=x ) - - return model - def R_model(self): """ Build a network consistent with each type of inputs From 23f560ccb488fa46e2c6b5dd2c4d825a3754f111 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 2 Aug 2018 15:36:04 -0400 Subject: [PATCH 78/96] cleaning and adding comments --- deer/learning_algos/CRAR_keras.py | 49 +++--- deer/learning_algos/NN_CRAR_keras.py | 233 ++++++++++++++++----------- 2 files changed, 167 insertions(+), 115 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 84731a42..2df27b5f 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -25,6 +25,9 @@ def exp_dec_error(y_true, y_pred): return K.exp( - 5.*K.sqrt( K.clip(K.sum(K.square(y_pred), axis=-1, keepdims=True),0.000001,10) ) ) # tend to increase y_pred def cosine_proximity2(y_true, y_pred): + """ This loss is similar to the native cosine_proximity loss from Keras + but it differs by the fact that only the two first components of the two vectors are used + """ y_true = K.l2_normalize(y_true[:,0:2], axis=-1) y_pred = K.l2_normalize(y_pred[:,0:2], axis=-1) return -K.sum(y_true * y_pred, axis=-1) @@ -91,18 +94,18 @@ def __init__(self, environment, 
rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no self.encoder = self.learn_and_plan.encoder_model() self.encoder_diff = self.learn_and_plan.encoder_diff_model(self.encoder) - self.R = self.learn_and_plan.R_model() + self.R = self.learn_and_plan.float_model() self.Q = self.learn_and_plan.Q_model() - self.gamma = self.learn_and_plan.R_model() + self.gamma = self.learn_and_plan.float_model() self.transition = self.learn_and_plan.transition_model() self.full_Q=self.learn_and_plan.full_Q_model(self.encoder,self.Q,0,self._df) # used to fit rewards - self.full_R = self.learn_and_plan.full_R_model(self.encoder,self.R) + self.full_R = self.learn_and_plan.full_float_model(self.encoder,self.R) # used to fit gamma - self.full_gamma = self.learn_and_plan.full_R_model(self.encoder,self.gamma) + self.full_gamma = self.learn_and_plan.full_float_model(self.encoder,self.gamma) # used to fit transitions self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition) @@ -125,8 +128,8 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) self.encoder_target = self.learn_and_plan_target.encoder_model() self.Q_target = self.learn_and_plan_target.Q_model() - self.R_target = self.learn_and_plan_target.R_model() - self.gamma_target = self.learn_and_plan_target.R_model() + self.R_target = self.learn_and_plan_target.float_model() + self.gamma_target = self.learn_and_plan_target.float_model() self.transition_target = self.learn_and_plan_target.transition_model() self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder,self.Q) # FIXME @@ -322,7 +325,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals def qValues(self, state_val): - """ Get the q values for one pseudo state (without planning) + """ Get the q values for one pseudo-state (without planning) Arguments --------- @@ -339,15 +342,15 @@ def qValues(self, state_val): return self.full_Q.predict([np.expand_dims(state,axis=0) for state in copy_state])[0] def qValues_planning(self, state_val, R, gamma, T, Q, d=5): - """ Get the q values for one belief state with a planning depth d - + """ Get the average Q-values up to planning depth d for one pseudo-state. + Arguments --------- state_val : array of objects (or list of objects) Each object is a numpy array that relates to one of the observations with size [1 * history size * size of punctual observation (which is 2D,1D or scalar)]). - R : R_model - gamma : discount_model + R : float_model for the reward + gamma : float_model for the discount T : transition_model Q : Q_model d : int @@ -355,7 +358,7 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): Returns ------- - The q values with planning depth d for the provided belief state + The average q values with planning depth up to d for the provided pseudo-state """ encoded_x = self.encoder.predict(state_val) @@ -405,22 +408,32 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): return QD_plan def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): - """ + """ Get the q values for one pseudo-state with a planning depth d + + Arguments + --------- + state_abstr_val : internal state(s). 
+ R : float_model for the reward + gamma : float_model for the discount + T : transition_model + Q : Q_model + d : int + planning depth + + Returns + ------- + The Q-values with planning depth d for the provided encoded state(s) """ #if(branching_factor==None or branching_factor>self._n_actions): # branching_factor=self._n_actions - #print "qValues_planning_abstr d" - #print d n=len(state_abstr_val) - identity_matrix = np.diag(np.ones(self._n_actions)) + identity_matrix = np.identity(self._n_actions) this_branching_factor=branching_factor.pop(0) if (n==1): # We require that the first branching factor is self._n_actions so that QD_plan has the right dimension this_branching_factor=self._n_actions - #else: - # this_branching_factor=branching_factor if (d==0): if(this_branching_factor8 and dim[-1]>8): self._pooling_encoder=6 - #x = Conv2D(4, (3, 3), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - #x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) x = Conv2D(8, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(3, 3), strides=None, padding='same')(x) - #x = Conv2D(4, (2, 2), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) - #x = Conv2D(16, (4, 4), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) else: self._pooling_encoder=1 - #x = Conv2D(8, (1, 1), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(self._pooling_encoder, self._pooling_encoder), strides=None, padding='same')(x) if(self._high_int_dim==True): x = Conv2D(self.n_channels_internal_dim, (1, 1), padding='same')(x) @@ -126,6 +120,9 @@ def encoder_model(self): outs_conv.append(out) + if(self._high_int_dim==True): + model = Model(inputs=inputs, outputs=outs_conv) + if(self._high_int_dim==False): if len(outs_conv)>1: x = merge(outs_conv, mode='concat') @@ -140,20 +137,28 @@ def encoder_model(self): x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' - model = Model(inputs=inputs, outputs=x) + model = Model(inputs=inputs, outputs=x) return model def encoder_diff_model(self,encoder_model): - """ - + """ Instantiate a Keras model that provides the difference between two encoded pseudo-states + + The model takes the two following inputs: + s1 : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + s2 : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + Parameters ----------- - s + encoder_model: instantiation of a Keras model for the encoder Returns ------- - model with output x (= encoding of s) + model with output the difference between the encoding of s1 and the encoding of s2 """ inputs=[] @@ -185,16 +190,19 @@ def encoder_diff_model(self,encoder_model): return model def transition_model(self): - """ + """ Instantiate a Keras model for the transition between two encoded pseudo-states. 
+ The model takes as inputs: + x : internal state + a : int + the action considered + Parameters ----------- - x - a Returns ------- - model with output Tx (= model estimate of x') + model that outputs the transition of (x,a) """ if(self._high_int_dim==True): @@ -202,29 +210,28 @@ def transition_model(self): inputs = [ Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)), Input( shape=(self._n_actions,) ) ] # data_format='channels_last' layers_action=inputs[1] - layers_action=RepeatVector(-(-dim[-2] // self._pooling_encoder)*-(-dim[-1] // self._pooling_encoder))(layers_action)#K.repeat_elements(layers_action,rep=dim[-2]*dim[-1],axis=1) + layers_action=RepeatVector(-(-dim[-2] // self._pooling_encoder)*-(-dim[-1] // self._pooling_encoder))(layers_action) layers_action=Reshape((self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) layers_action=Permute((2,3,1), input_shape=(self.n_channels_internal_dim+self._n_actions,-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder)))(layers_action) #data_format='channels_last' x = Concatenate(axis=-1)([layers_action,inputs[0]]) - x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) # Try to keep locality as much as possible --> FIXME + x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) x = Conv2D(32, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(64, (3, 3), padding='same', activation='tanh')(x) x = Conv2D(32, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(16, (1, 1), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(self.n_channels_internal_dim, (1, 1), padding='same')(x) x = Add()([inputs[0],x]) else: inputs = [ Input( shape=(self.internal_dim,) ), Input( shape=(self._n_actions,) ) ] # x - x = Concatenate()(inputs)#,axis=-1) - x = Dense(10, activation='tanh')(x) #5,15 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(30, activation='tanh')(x) # ,30 - x = Dense(10, activation='tanh')(x) # ,30 - x = Dense(self.internal_dim)(x)#, activity_regularizer=regularizers.l2(0.00001))(x) #, activation='relu' + x = Concatenate()(inputs) + x = Dense(10, activation='tanh')(x) + x = Dense(30, activation='tanh')(x) + x = Dense(30, activation='tanh')(x) + x = Dense(10, activation='tanh')(x) + x = Dense(self.internal_dim)(x) x = Add()([inputs[0],x]) model = Model(inputs=inputs, outputs=x) @@ -232,14 +239,28 @@ def transition_model(self): return model def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): - """ - Used to fit the transitions + """ For plan_depth=0, instantiate a Keras model that provides the difference between T(E(s1),a) and E(s2). + Note that it gives 0 if the transition leading to s2 is terminal (we don't need to fit the transition if + it is terminal). + + For plan_depth=0, the model takes the four following inputs: + s1 : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + s2 : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 
+ a : list of ints with length (plan_depth+1) + the action(s) considered at s1 + terminal : boolean + Whether the transition leading to s2 is terminal Parameters ----------- - s - a - s' + encoder_model: instantiation of a Keras model for the encoder (E) + transition_model: instantiation of a Keras model for the transition (T) + plan_depth: if>1, it provides the possibility to consider a sequence of transitions between s1 and s2 + (input a is then a list of actions) Returns ------- @@ -281,18 +302,26 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): return model def force_features(self,encoder_model,transition_model,plan_depth=0): - """ - Used to force some transitions'directions + """ Instantiate a Keras model that provides the vector of the transition at E(s1). It is calculated as the different between E(s1) and E(T(s1)). + Used to force the directions of the transitions. + + The model takes the four following inputs: + s1 : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + a : list of ints with length (plan_depth+1) + the action(s) considered at s1 Parameters ----------- - s - a - s' - + encoder_model: instantiation of a Keras model for the encoder (E) + transition_model: instantiation of a Keras model for the transition (T) + plan_depth: if>1, it provides the possibility to consider a sequence of transitions between s1 and s2 + (input a is then a list of actions) + Returns ------- - model with output Tx (= model estimate of x') + model with output E(s1)-T(E(s1)) """ inputs=[] @@ -322,18 +351,21 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): return model - def R_model(self): - """ - Build a network consistent with each type of inputs - + def float_model(self): + """ Instantiate a Keras model for fitting a float from x. + + The model takes the following inputs: + x : internal state + a : int + the action considered at x + Parameters ----------- - x - a - + Returns ------- - r + model that outputs a float + """ if(self._high_int_dim==True): @@ -351,7 +383,6 @@ def R_model(self): x = Conv2D(32, (3, 3), padding='same', activation='tanh')(x) x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) - #x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='same')(x) x = Conv2D(4, (1, 1), padding='same', activation='tanh')(x) # we stack a deep fully-connected network on top @@ -371,18 +402,27 @@ def R_model(self): return model - def full_R_model(self,encoder_model,R_model,plan_depth=0,transition_model=None): - """ - Maps internal state to immediate rewards - + def full_float_model(self,encoder_model,float_model,plan_depth=0,transition_model=None): + """ Instantiate a Keras model for fitting a float from s. + + The model takes the four following inputs: + s : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 
+ a : list of ints with length (plan_depth+1) + the action(s) considered at s + Parameters ----------- - s - a - + encoder_model: instantiation of a Keras model for the encoder (E) + float_model: instantiation of a Keras model for fitting a float from x + plan_depth: if>1, it provides the possibility to consider a sequence of transitions following s + (input a is then a list of actions) + transition_model: instantiation of a Keras model for the transition (T) + Returns ------- - r + model with output the reward r """ inputs=[] @@ -410,13 +450,25 @@ def full_R_model(self,encoder_model,R_model,plan_depth=0,transition_model=None): input = Input(shape=(self._n_actions,)) inputs.append(input) - out = R_model([Tx]+inputs[-1:]) + out = float_model([Tx]+inputs[-1:]) model = Model(inputs=inputs, outputs=out) return model def Q_model(self): + """ Instantiate a a Keras model for the Q-network from x. + + The model takes the following inputs: + x : internal state + + Parameters + ----------- + + Returns + ------- + model that outputs the Q-values for each action + """ if(self._high_int_dim==True): inputs=[] outs_conv=[] @@ -425,7 +477,6 @@ def Q_model(self): if len(dim) == 3 or len(dim) == 4: input = Input(shape=(-(-dim[-2] // self._pooling_encoder),-(-dim[-1] // self._pooling_encoder),self.n_channels_internal_dim)) #data_format is already 'channels_last' inputs.append(input) - #reshaped=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) x = input #data_format is already 'channels_last' x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) @@ -439,14 +490,6 @@ def Q_model(self): outs_conv.append(out) - if (self._action_as_input==True): - if ( isinstance(self._n_actions,int)): - print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") - else: - input = Input(shape=(len(self._n_actions),)) - inputs.append(input) - outs_conv.append(input) - if len(outs_conv)>1: x = merge(outs_conv, mode='concat') else: @@ -463,13 +506,7 @@ def Q_model(self): x = Dense(50, activation='tanh')(x) x = Dense(20, activation='tanh')(x) - #if (self._action_as_input==False): - # if ( isinstance(self._n_actions,int)): out = Dense(self._n_actions)(x) - # else: - # out = Dense(len(self._n_actions))(x) - #else: - # out = Dense(1)(x) model = Model(inputs=inputs, outputs=out) @@ -477,17 +514,28 @@ def Q_model(self): def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=None, R_model=None, discount_model=None): - """ - Build a network consistent with each type of inputs - + """ Instantiate a a Keras model for the Q-network from s. + + The model takes the following inputs: + s : list of objects + Each object is a numpy array that relates to one of the observations + with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). + a : list of ints with length plan_depth; if plan_depth=0, there isn't any input for a. + the action(s) considered at s + Parameters ----------- - s - noise in abstract state space - + encoder_model: instantiation of a Keras model for the encoder (E) + Q_model: instantiation of a Keras model for the Q-network from x. 
+ plan_depth: if>1, it provides the possibility to consider a sequence of transitions following s + (input a is then a list of actions) + transition_model: instantiation of a Keras model for the transition (T) + R_model: instantiation of a Keras model for the reward + discount_model: instantiation of a Keras model for the discount + Returns ------- - model with output Tx (= model estimate of x') + model with output the Q-values """ inputs=[] @@ -522,15 +570,6 @@ def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=No disc_plan=Multiply()([disc_plan,discount]) #disc_model([out]+inputs[-1:]) out=transition_model([out]+inputs[-1:]) - - #if(self._high_int_dim==True): - # input = Input(shape=(dim[-2],dim[-1],dim[-3])) - # inputs.append(input) - #else: - # input = Input(shape=(self.internal_dim,)) - # inputs.append(input) - # - #x=Add()([out,inputs[-1]]) # adding noise in the abstract state space if(plan_depth==0): Q_estim=Q_model(out) From 232c0ed57998f2786107987150adae171b37fb73 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 2 Aug 2018 17:17:40 -0400 Subject: [PATCH 79/96] Cleaning and updating comments in CRAR_keras --- deer/learning_algos/CRAR_keras.py | 32 ++++++++++--------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 2df27b5f..57c3dba3 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -408,7 +408,8 @@ def qValues_planning(self, state_val, R, gamma, T, Q, d=5): return QD_plan def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_factor=None): - """ Get the q values for one pseudo-state with a planning depth d + """ Get the q values for pseudo-state(s) with a planning depth d. + This function is called recursively by decreasing the depth d at every step. Arguments --------- @@ -432,7 +433,8 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f this_branching_factor=branching_factor.pop(0) if (n==1): - # We require that the first branching factor is self._n_actions so that QD_plan has the right dimension + # We require that the first branching factor is self._n_actions so that this function return values + # with the right dimension (=self._n_actions). 
this_branching_factor=self._n_actions if (d==0): @@ -443,6 +445,8 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f else: if(this_branching_factor==self._n_actions): # All actions are considered in the tree + # NB: For this case, we do not use argpartition because we want to keep the actions in the natural order + # That way, this function returns the Q-values for all actions with planning depth d in the right order repeat_identity=np.repeat(identity_matrix,len(state_abstr_val),axis=0) if(state_abstr_val.ndim==2): tile3_encoded_x=np.tile(state_abstr_val,(self._n_actions,1)) @@ -451,30 +455,13 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f else: print ("error") else: - # A subset of the actions are considered in the tree + # A subset of the actions are considered estim_Q_values=Q.predict([state_abstr_val]) - #print estim_Q_values ind = np.argpartition(estim_Q_values, -this_branching_factor)[:,-this_branching_factor:] - #print ind - #print identity_matrix[ind] - #repeat_identity=np.repeat(identity_matrix[ind],len(state_abstr_val),axis=0) repeat_identity=identity_matrix[ind].reshape(n*this_branching_factor,self._n_actions) - #print repeat_identity - #if(state_abstr_val.ndim==2): - # tile3_encoded_x=np.tile(state_abstr_val,(this_branching_factor,1)) - #elif(state_abstr_val.ndim==4): - # tile3_encoded_x=np.tile(state_abstr_val,(this_branching_factor,1,1,1)) - #else: - # print ("error") tile3_encoded_x=np.repeat(state_abstr_val,this_branching_factor,axis=0) - #print "tile3_encoded_x" - #print tile3_encoded_x - #print tile3_encoded_x - #print repeat_identity r_vals_d0=np.array(R.predict([tile3_encoded_x,repeat_identity])) - #print "r_vals_d0" - #print r_vals_d0 r_vals_d0=r_vals_d0.flatten() gamma_vals_d0=np.array(gamma.predict([tile3_encoded_x,repeat_identity])) @@ -484,11 +471,12 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f return r_vals_d0+gamma_vals_d0*np.amax(self.qValues_planning_abstr(next_x_predicted,R,gamma,T,Q,d=d-1,branching_factor=branching_factor).reshape(len(state_abstr_val)*this_branching_factor,branching_factor[0]),axis=1).flatten() def chooseBestAction(self, state, mode, *args, **kwargs): - """ Get the best action for a belief state + """ Get the best action for a pseudo-state Arguments --------- - state : one belief state + state : one pseudo-state + mode : identifier of the mode (-1 is reserved for the training mode) Returns ------- From d1ec1cf33213a6bfe0afec8cf98565145f4aa9de Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 2 Aug 2018 17:29:18 -0400 Subject: [PATCH 80/96] cleaning CRAR_keras --- deer/learning_algos/CRAR_keras.py | 54 ++++++++++++++++--------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 57c3dba3..74a9ad11 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -110,21 +110,23 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no # used to fit transitions self.diff_Tx_x_ = self.learn_and_plan.diff_Tx_x_(self.encoder,self.transition) + # constraint on consecutive t + self.diff_s_s_ = self.learn_and_plan.encoder_diff_model(self.encoder) + # used to force features variations if(self._high_int_dim==False): self.force_features=self.learn_and_plan.force_features(self.encoder,self.transition) - # constraint on consecutive t - self.diff_s_s_ = 
self.learn_and_plan.encoder_diff_model(self.encoder) - + # Grab all the parameters in self.params layers=self.encoder.layers+self.Q.layers+self.R.layers+self.gamma.layers+self.transition.layers - # Grab all the parameters together. self.params = [ param for layer in layers for param in layer.trainable_weights ] + # Compile all models self._compile() + # Instantiate the same neural network as a target network. self.learn_and_plan_target = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, high_int_dim=self._high_int_dim, internal_dim=self._internal_dim) self.encoder_target = self.learn_and_plan_target.encoder_model() self.Q_target = self.learn_and_plan_target.Q_model() @@ -135,8 +137,8 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder,self.Q) # FIXME self.full_Q_target.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q + # Grab all the parameters of the target network together. layers=self.encoder_target.layers+self.Q_target.layers+self.R_target.layers+self.gamma_target.layers+self.transition_target.layers - # Grab all the parameters together. self.params_target = [ param for layer in layers for param in layer.trainable_weights ] @@ -222,6 +224,23 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals # Fit transition self.loss_T+=self.diff_Tx_x_.train_on_batch(states_val+next_states_val+[onehot_actions]+[(1-terminals_val)], np.zeros_like(Es)) + # Fit rewards + self.lossR+=self.full_R.train_on_batch(states_val+[onehot_actions], rewards_val) + + # Fit gammas + self.loss_gamma+=self.full_gamma.train_on_batch(states_val+[onehot_actions], (1-terminals_val[:])*self._df) + + # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 + # reduce the squared value of the abstract features + self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) + + # Increase the entropy in the abstract features of two states + # This works only when states_val is made up of only one observation --> FIXME + rolled=np.roll(states_val[0],1,axis=0) + self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) + + self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1))) + # Interpretable AI if(self._high_int_dim==False): target_modif_features=np.zeros((self._n_actions,self._internal_dim)) @@ -244,23 +263,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_interpret+=self.force_features.train_on_batch(states_val_tiled+[onehot_actions_tiled], target_modif_features) - # Fit rewards - self.lossR+=self.full_R.train_on_batch(states_val+[onehot_actions], rewards_val) - - # Fit gammas - self.loss_gamma+=self.full_gamma.train_on_batch(states_val+[onehot_actions], (1-terminals_val[:])*self._df) - - # Loss to ensure entropy but limited volume in abstract state space, avg=0 and sigma=1 - # reduce the squared value of the abstract features - self.loss_disambiguate1+=self.encoder.train_on_batch(states_val,np.zeros_like(Es)) #np.zeros((self._batch_size,self.learn_and_plan.internal_dim))) - - # Increase the entropy in the abstract features of two states - # This is done only when states_val 
is made up of only one observation --> FIXME - rolled=np.roll(states_val[0],1,axis=0) - self.loss_disambiguate2+=self.encoder_diff.train_on_batch([states_val[0],rolled],np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - - self.loss_disentangle_t+=self.diff_s_s_.train_on_batch(states_val+next_states_val, np.reshape(np.zeros_like(Es),(self._batch_size,-1))) - if(self.update_counter%500==0): @@ -522,10 +524,6 @@ def _compile(self): self.full_R.compile(optimizer=optimizer3, loss='mse') # Fit rewards self.full_gamma.compile(optimizer=optimizer3, loss='mse') # Fit discount - if(self._high_int_dim==False): - self.force_features.compile(optimizer=optimizer7, - loss=cosine_proximity2) - self.encoder.compile(optimizer=optimizer4, loss=mean_squared_error_p) self.encoder_diff.compile(optimizer=optimizer5, @@ -534,6 +532,10 @@ def _compile(self): self.diff_s_s_.compile(optimizer=optimizer6, loss=exp_dec_error) + if(self._high_int_dim==False): + self.force_features.compile(optimizer=optimizer7, + loss=cosine_proximity2) + def _resetQHat(self): """ Set the target Q-network weights equal to the main Q-network weights """ From 6fc8fe1caa1233945f9d691e67fd1a5e62b683db Mon Sep 17 00:00:00 2001 From: Vincent F Date: Wed, 15 Aug 2018 01:59:17 +0000 Subject: [PATCH 81/96] improve doc agent --- deer/agent.py | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/deer/agent.py b/deer/agent.py index 7169a05b..90fb2a5a 100644 --- a/deer/agent.py +++ b/deer/agent.py @@ -338,22 +338,16 @@ def _runEpisode(self, maxSteps): def _step(self): """ - This method is called at each time step. If the agent is currently in testing mode, and if its *test* replay - memory has enough samples, it will select the best action it can. If there are not enough samples, FIXME. - In the case the agent is not in testing mode, if its replay memory has enough samples, it will select the best - action it can with probability 1-CurrentEpsilon and a random action otherwise. If there are not enough samples, - it will always select a random action. - Parameters - ----------- - state : ndarray - An ndarray(size=number_of_inputs, dtype='object), where states[input] is a 1+D matrix of dimensions - input.historySize x "shape of a given ponctual observation for this input". + This method is called at each time step and performs one action in the environment. + Returns ------- - action : int - The id of the action selected by the agent. V : float Estimated value function of current state. + action : int + The id of the action selected by the agent. + reward : float + Reward obtained for the transition """ action, V = self._chooseAction() @@ -408,17 +402,18 @@ class AgentWarning(RuntimeWarning): class DataSet(object): """A replay memory consisting of circular buffers for observations, actions, rewards and terminals.""" - def __init__(self, env, random_state=None, max_size=1000, use_priority=False, only_full_history=True): + def __init__(self, env, random_state=None, max_size=1000000, use_priority=False, only_full_history=True): """Initializer. Parameters ----------- inputDims : list of tuples - For each subject i, inputDims[i] is a tuple where the first value is the memory size for this - subject and the rest describes the shape of each single observation on this subject (number, vector or - matrix). See base_classes.Environment.inputDimensions() documentation for more info about this format. 
+ Each tuple relates to one of the observations where the first value is the history size considered for this + observation and the rest describes the shape of each punctual observation (e.g., scalar, vector or matrix). + See base_classes.Environment.inputDimensions() documentation for more info. random_state : Numpy random number generator If None, a new one is created with default numpy seed. - max_size : The replay memory maximum size. + max_size : float + The replay memory maximum size. Default : 1000000 """ self._batch_dimensions = env.inputDimensions() @@ -470,8 +465,6 @@ def terminals(self): def observations(self): """Get all observations currently in the replay memory, ordered by time where they were observed. - - observations[s][i] corresponds to the observation made on subject s before the agent took actions()[i]. """ ret = np.zeros_like(self._observations) @@ -730,11 +723,11 @@ def _randomPrioritizedBatch(self, batch_size): return indices_replay_mem, indices_tree def addSample(self, obs, action, reward, is_terminal, priority): - """Store a (observation[for all subjects], action, reward, is_terminal) in the dataset. + """Store the punctual observations, action, reward, is_terminal and priority in the dataset. Parameters ----------- obs : ndarray - An ndarray(dtype='object') where obs[s] corresponds to the observation made on subject s before the + An ndarray(dtype='object') where obs[s] corresponds to the punctual observation s before the agent took action [action]. action : int The action taken after having observed [obs]. From 11083961da7206f6a9c24550fc2ef54c6eced0aa Mon Sep 17 00:00:00 2001 From: Vincent F Date: Wed, 15 Aug 2018 02:02:34 +0000 Subject: [PATCH 82/96] improve doc environment --- deer/base_classes/environment.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/deer/base_classes/environment.py b/deer/base_classes/environment.py index 6e802ec1..edda14d6 100644 --- a/deer/base_classes/environment.py +++ b/deer/base_classes/environment.py @@ -55,9 +55,9 @@ def act(self, action): def inputDimensions(self): """Gets the shape of the input space for this environment. - This returns a list whose length is the number of observations in the environment. Each element of the - list is a tuple: the first integer is always the history size considered for this observation and the rest describes - the shape of the observation at a given time step: + This returns a list whose length is the number of observations in the environment. Each element of the list is a tuple: + the first integer is always the history size considered for this observation and the rest describes the shape of the + observation at a given time step. For instance: - () or (1,) means each observation at a given time step is a single scalar and the history size is 1 (= only current observation) - (N,) means each observation at a given time step is a single scalar and the history size is N @@ -92,14 +92,12 @@ def inTerminalState(self): return False def observe(self): - """Gets a list of punctual observations on all subjects composing this environment. + """Gets a list of punctual observations composing this environment. - This returns a list where element i is a punctual observation on subject i. You will notice that the history - of observations on this subject is not returned; only the very last observation. Each element is thus either - a number, vector or matrix and not a succession of numbers, vectors and matrices. 
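Taken together, the docstrings rewritten here pin down the observation format: one tuple per observation, with the history size first and the shape of a single punctual observation after it. A small hypothetical environment, not one of the shipped examples, makes the convention concrete:

def inputDimensions():
    # Observation 0: a scalar with a history of the last 4 time steps.
    # Observation 1: a single 84x84 frame, with no history beyond the current one.
    return [(4,), (1, 84, 84)]

# The pseudo-state handed to the learning algorithm is then a list with one
# ndarray per observation, of shapes (4,) and (1, 84, 84) respectively.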
+ This returns a list where element i is a punctual observation. Note that the history of observations is not + returned and only the current observation is. - See the documentation of batchDimensions() for more information about the shape of the observations according - to their mathematical representation (number, vector or matrix). + See the documentation of inputDimensions() for more information about the shape of the observations. """ raise NotImplementedError() From 9a20d5b99ba4304f5aaec7e13888f0390b515003 Mon Sep 17 00:00:00 2001 From: Vincent F Date: Wed, 15 Aug 2018 02:05:51 +0000 Subject: [PATCH 83/96] fixes ALE --- examples/ALE/ALE_env_gym.py | 32 +++++++++++++++----------------- examples/ALE/run_ALE.py | 8 ++++---- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/examples/ALE/ALE_env_gym.py b/examples/ALE/ALE_env_gym.py index 39b012b7..23cfdaf1 100644 --- a/examples/ALE/ALE_env_gym.py +++ b/examples/ALE/ALE_env_gym.py @@ -9,14 +9,12 @@ import gym from deer.base_classes import Environment -import matplotlib -matplotlib.use('qt5agg') -from mpl_toolkits.axes_grid1 import host_subplot -import mpl_toolkits.axisartist as AA -import matplotlib.pyplot as plt -from PIL import Image - -import gym +#import matplotlib +#matplotlib.use('qt5agg') +#from mpl_toolkits.axes_grid1 import host_subplot +#import mpl_toolkits.axisartist as AA +#import matplotlib.pyplot as plt +#from PIL import Image class MyEnv(Environment): VALIDATION_MODE = 0 @@ -67,14 +65,14 @@ def reset(self, mode): self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) self.state=np.zeros((84,84), dtype=np.uint8) #FIXME - return [4 * [84 * [84 * [0]]]] + return [1*[4 * [84 * [84 * [0]]]]] def act(self, action): #print "action" #print action - self.state=np.zeros((84,84), dtype=np.uint8) + self.state=np.zeros((4,84,84), dtype=np.float) reward=0 for t in range(4): observation, r, self.terminal, info = self.env.step(action) @@ -84,23 +82,23 @@ def act(self, action): if self.inTerminalState(): break - self._screen=np.average(observation,axis=-1) # Gray levels - self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 - #plt.imshow(self._screen, cmap='gray') - #plt.show() - self.state=self._reduced_screen + self._screen=np.average(observation,axis=-1) # Gray levels + self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 + #plt.imshow(self._screen, cmap='gray') + #plt.show() + self.state[t,:,:]=self._reduced_screen self._mode_score += reward return np.sign(reward) - def summarizePerformance(self, test_data_set, learning_algo): + def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) def inputDimensions(self): - return [(4, 84, 84)] #FIXME + return [(1, 4, 84, 84)] def observationType(self, subject): return np.float #np.uint8 diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index 391c835c..dfc518af 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -34,8 +34,8 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.001 - LEARNING_RATE_DECAY = 0.99 + LEARNING_RATE = 0.0002 + LEARNING_RATE_DECAY = 1.#0.99 DISCOUNT = 0.95 DISCOUNT_INC = 0.99 DISCOUNT_MAX = 0.99 @@ -47,9 +47,9 @@ class 
Defaults: EPSILON_MIN = .1 EPSILON_DECAY = 100000 UPDATE_FREQUENCY = 1 - REPLAY_MEMORY_SIZE = 1000000 + REPLAY_MEMORY_SIZE = 250000 #Each element is 4 frames --> 10^6 frames BATCH_SIZE = 32 - FREEZE_INTERVAL = 10000 + FREEZE_INTERVAL = 2500 DETERMINISTIC = True HIGH_INT_DIM = True From 3387546c8b639005e4a3eec2d2560b395c2ef6ba Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 6 Sep 2018 14:46:32 -0400 Subject: [PATCH 84/96] modifs CRAR agents (minor fixes, adaptations to ALE and cleaning) --- deer/learning_algos/CRAR_keras.py | 35 +++++++++-------- deer/learning_algos/NN_CRAR_keras.py | 57 ++++++++++++++++++---------- 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/deer/learning_algos/CRAR_keras.py b/deer/learning_algos/CRAR_keras.py index 74a9ad11..b698d1b1 100644 --- a/deer/learning_algos/CRAR_keras.py +++ b/deer/learning_algos/CRAR_keras.py @@ -60,7 +60,7 @@ class CRAR(LearningAlgo): Activate or not the double_Q learning. More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. neural_network : object, optional - default is deer.learning_algos.NN_keras + Default is deer.learning_algos.NN_keras """ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN, **kwargs): @@ -119,6 +119,7 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no # Grab all the parameters in self.params layers=self.encoder.layers+self.Q.layers+self.R.layers+self.gamma.layers+self.transition.layers + self.params = [ param for layer in layers for param in layer.trainable_weights ] @@ -134,13 +135,14 @@ def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_no self.gamma_target = self.learn_and_plan_target.float_model() self.transition_target = self.learn_and_plan_target.transition_model() - self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder,self.Q) # FIXME + self.full_Q_target = self.learn_and_plan_target.full_Q_model(self.encoder_target,self.Q_target) self.full_Q_target.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.full_Q # Grab all the parameters of the target network together. 
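The comment added here mirrors the earlier "Grab all the parameters in self.params": every CRAR sub-network is a separate Keras model, and the trainable weights of the online and target networks are each flattened into one list. A self-contained sketch of the same gathering pattern with two toy models (layer sizes are illustrative only):

from keras.models import Sequential
from keras.layers import Dense

encoder = Sequential([Dense(8, input_shape=(4,), activation='tanh')])
q_head = Sequential([Dense(2, input_shape=(8,))])

# Concatenate the layers of all sub-models, then flatten their trainable
# weights into a single list, as done for params and params_target above.
layers = encoder.layers + q_head.layers
params = [param for layer in layers for param in layer.trainable_weights]
print(len(params))   # one kernel and one bias per Dense layer -> 4 tensors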
- layers=self.encoder_target.layers+self.Q_target.layers+self.R_target.layers+self.gamma_target.layers+self.transition_target.layers + layers_target=self.encoder_target.layers+self.Q_target.layers+self.R_target.layers+self.gamma_target.layers+self.transition_target.layers + self.params_target = [ param - for layer in layers + for layer in layers_target for param in layer.trainable_weights ] self._resetQHat() @@ -207,10 +209,10 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%500==0): print ("Printing a few elements useful for debugging:") - print ("states_val[0][0]") - print (states_val[0][0]) - print ("next_states_val[0][0]") - print (next_states_val[0][0]) + #print ("states_val[0][0]") + #print (states_val[0][0]) + #print ("next_states_val[0][0]") + #print (next_states_val[0][0]) print ("actions_val[0], rewards_val[0], terminals_val[0]") print (actions_val[0], rewards_val[0], terminals_val[0]) print ("Es[0],ETs[0],Es_[0]") @@ -266,12 +268,12 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals if(self.update_counter%500==0): - print ("self.loss_T/100., self.lossR/100., self.loss_gamma/100., self.loss_Q/100., self.loss_disentangle_t/100., self.loss_disambiguate1/100., self.loss_disambiguate2/100.") - print (self.loss_T/100., self.lossR/100.,self.loss_gamma/100., self.loss_Q/100., self.loss_disentangle_t/100., self.loss_disambiguate1/100., self.loss_disambiguate2/100.) - + print ("self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.") + print (self.loss_T/500., self.lossR/500.,self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.) + if(self._high_int_dim==False): - print ("self.loss_interpret/100.") - print (self.loss_interpret/100.) + print ("self.loss_interpret/500.") + print (self.loss_interpret/500.) self.lossR=0 self.loss_gamma=0 @@ -282,7 +284,6 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals self.loss_disentangle_t=0 self.loss_disambiguate1=0 self.loss_disambiguate2=0 - if self.update_counter % self._freeze_interval == 0: self._resetQHat() @@ -457,9 +458,11 @@ def qValues_planning_abstr(self, state_abstr_val, R, gamma, T, Q, d, branching_f else: print ("error") else: - # A subset of the actions are considered + # A subset of the actions corresponding to the best estimated Q-values are considered et each branch estim_Q_values=Q.predict([state_abstr_val]) ind = np.argpartition(estim_Q_values, -this_branching_factor)[:,-this_branching_factor:] + # Replacing ind if we want random branching + #ind = np.random.randint(0,self._n_actions,size=ind.shape) repeat_identity=identity_matrix[ind].reshape(n*this_branching_factor,self._n_actions) tile3_encoded_x=np.repeat(state_abstr_val,this_branching_factor,axis=0) @@ -565,7 +568,7 @@ def setLearningRate(self, lr): K.set_value(self.encoder.optimizer.lr, self._lr) K.set_value(self.encoder_diff.optimizer.lr, self._lr) - K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1 for distrib of laby + K.set_value(self.diff_s_s_.optimizer.lr, self._lr/5.) # /5. for simple laby or simple catcher; /1. 
for distrib of laby def transfer(self, original, transfer, epochs=1): # First, make sure that the target network and the current network are the same diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index 986e6740..e3f27fe1 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -6,9 +6,9 @@ import numpy as np from keras import backend as K from keras.models import Model -from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge, RepeatVector, AveragePooling2D +from keras.layers import Input, Layer, Dense, Flatten, Activation, Conv2D, MaxPooling2D, UpSampling2D, Reshape, Permute, Add, Subtract, Dot, Multiply, Average, Lambda, Concatenate, BatchNormalization, merge, RepeatVector, AveragePooling2D from keras import regularizers -np.random.seed(102912) +#np.random.seed(111111) class NN(): """ @@ -54,17 +54,22 @@ def encoder_model(self): Keras model with output x (= encoding of s) """ - layers=[] outs_conv=[] inputs=[] for i, dim in enumerate(self._input_dimensions): # - observation[i] is a FRAME if len(dim) == 3 or len(dim) == 4: - input = Input(shape=(dim[-3],dim[-2],dim[-1])) - inputs.append(input) + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + else: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) + inputs.append(input) x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' - if(dim[-2]>8 and dim[-1]>8): + + if(dim[-2]>12 and dim[-1]>12): self._pooling_encoder=6 x = Conv2D(8, (2, 2), padding='same', activation='tanh')(x) x = Conv2D(16, (2, 2), padding='same', activation='tanh')(x) @@ -165,14 +170,16 @@ def encoder_diff_model(self,encoder_model): for j in range(2): for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3 or len(dim) == 4: + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + elif(len(dim) == 3): input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - elif len(dim) == 2: input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) - else: input = Input(shape=(dim[-3],)) inputs.append(input) @@ -270,14 +277,16 @@ def diff_Tx_x_(self,encoder_model,transition_model,plan_depth=0): inputs=[] for j in range(2): for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3 or len(dim) == 4: + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + elif(len(dim) == 3): input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - elif len(dim) == 2: input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) - else: input = Input(shape=(dim[-3],)) inputs.append(input) @@ -326,14 +335,16 @@ def force_features(self,encoder_model,transition_model,plan_depth=0): """ inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3 or len(dim) == 4: + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), 
input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + elif(len(dim) == 3): input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - elif len(dim) == 2: input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) - else: input = Input(shape=(dim[-3],)) inputs.append(input) @@ -428,14 +439,16 @@ def full_float_model(self,encoder_model,float_model,plan_depth=0,transition_mode inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3 or len(dim) == 4: + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + elif(len(dim) == 3): input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - elif len(dim) == 2: input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) - else: input = Input(shape=(dim[-3],)) inputs.append(input) @@ -540,14 +553,16 @@ def full_Q_model(self, encoder_model, Q_model, plan_depth=0, transition_model=No inputs=[] for i, dim in enumerate(self._input_dimensions): - if len(dim) == 3 or len(dim) == 4: + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + elif(len(dim) == 3): input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - elif len(dim) == 2: input = Input(shape=(dim[-3],dim[-2])) inputs.append(input) - else: input = Input(shape=(dim[-3],)) inputs.append(input) From b8567ddf370337037fb49bf06f9f9c0b06f124db Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 6 Sep 2018 14:57:10 -0400 Subject: [PATCH 85/96] modifs ALE --- examples/ALE/ALE_env_gym.py | 12 ++++++++---- examples/ALE/run_ALE.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/ALE/ALE_env_gym.py b/examples/ALE/ALE_env_gym.py index 23cfdaf1..b1ff7a14 100644 --- a/examples/ALE/ALE_env_gym.py +++ b/examples/ALE/ALE_env_gym.py @@ -25,7 +25,11 @@ def __init__(self, rng, **kwargs): Arguments: rng - the numpy random number generator """ - self.env = gym.make('SpaceInvaders-v4')#Breakout-v4')#BeamRider-v4')#Qbert-v4')#Seaquest-v4')#Freeway-v4') + if(bool(kwargs["game"])): + self.env = gym.make(kwargs["game"]) + else: + # Choice between Seaquest-v4, Breakout-v4, SpaceInvaders-v4, BeamRider-v4, Qbert-v4, Freeway-v4', etc. 
+ self.env = gym.make('Seaquest-v4') self._random_state=rng self.env.reset() frame_skip=kwargs.get('frame_skip',1) @@ -101,11 +105,11 @@ def inputDimensions(self): return [(1, 4, 84, 84)] def observationType(self, subject): - return np.float #np.uint8 + return np.float16 def nActions(self): - print "self.env.action_space" - print self.env.action_space + print ("self.env.action_space") + print (self.env.action_space) return self.env.action_space.n def observe(self): diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index dfc518af..c6bb35b9 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -71,7 +71,7 @@ class Defaults: # {"key": "color_averaging", "value": True}, # {"key": "repeat_action_probability", "value": 0.}]) - env = ALE_env(rng, frame_skip=parameters.frame_skip) + env = ALE_env(rng, game=parameters.param1, frame_skip=parameters.frame_skip) # --- Instantiate qnetwork --- qnetwork = CRAR( From 917799c0a1c9e474d4037d5cd3fac2b8cfe8d11b Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 6 Sep 2018 15:02:58 -0400 Subject: [PATCH 86/96] maze --- examples/maze/run_test4.py | 319 +++++++++++++++++++------------------ examples/maze/test_env4.py | 8 +- 2 files changed, 164 insertions(+), 163 deletions(-) diff --git a/examples/maze/run_test4.py b/examples/maze/run_test4.py index 2cc364b1..77ae448f 100644 --- a/examples/maze/run_test4.py +++ b/examples/maze/run_test4.py @@ -37,7 +37,7 @@ class Defaults: # ---------------------- UPDATE_RULE = 'rmsprop' LEARNING_RATE = 0.0005 - LEARNING_RATE_DECAY = 0.995 + LEARNING_RATE_DECAY = 1.#0.995 DISCOUNT = 0.9 DISCOUNT_INC = 1 DISCOUNT_MAX = 0.99 @@ -142,6 +142,7 @@ class Defaults: reset_every='none')) agent.run(1, N_SAMPLES) + #print (agent._dataset._rewards._data[0:500]) #print (agent._dataset._terminals._data[0:500]) print("end gathering data") @@ -182,36 +183,36 @@ class Defaults: # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every # [parameters.period_btw_summary_perfs] *validation* epochs. 
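The controller setup that follows replaces the symbolic validation ids with plain integers (0 to 3) and widens each controllers_to_disable list. One reading of the convention, which the diff itself does not state, is that these integers index the controllers in the order they were attached to the agent; that reading is consistent with the lists used here and with the commented-out copy of the same attach sequence later in this file:

attach_order = [
    "VerboseController",                      # 0
    "LearningRateController",                 # 1
    "DiscountFactorController",               # 2
    "EpsilonController",                      # 3
    "TrainerController",                      # 4
    "InterleavedTestEpochController(id=0)",   # 5
    "InterleavedTestEpochController(id=1)",   # 6
    "InterleavedTestEpochController(id=2)",   # 7
    "InterleavedTestEpochController(id=3)",   # 8
]
# id=0 disables [0, 1, 2, 3, 4, 6, 7, 8], i.e. every other controller but itself
# (index 5); id=1 to id=3 likewise keep only their own index enabled, so the four
# test epochs never run on top of each other or on top of training.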
agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, + id=0, epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], + controllers_to_disable=[0, 1, 2, 3, 4, 6, 7, 8], periodicity=2, show_score=True, summarize_every=1)) -# agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE+1, -# epoch_length=parameters.steps_per_test, -# controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], -# periodicity=2, -# show_score=True, -# summarize_every=1)) -# -# agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE+2, -# epoch_length=parameters.steps_per_test, -# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], -# periodicity=2, -# show_score=True, -# summarize_every=1)) -# -# agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE+3, -# epoch_length=parameters.steps_per_test, -# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], -# periodicity=2, -# show_score=True, -# summarize_every=1)) + agent.attach(bc.InterleavedTestEpochController( + id=1, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], + periodicity=2, + show_score=True, + summarize_every=1)) + + agent.attach(bc.InterleavedTestEpochController( + id=2, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], + periodicity=2, + show_score=True, + summarize_every=1)) + + agent.attach(bc.InterleavedTestEpochController( + id=3, + epoch_length=parameters.steps_per_test, + controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], + periodicity=2, + show_score=True, + summarize_every=1)) # --- Run the experiment --- try: @@ -223,149 +224,149 @@ class Defaults: agent.run(parameters.epochs, parameters.steps_per_epoch) - ### - # TRANSFER - ### - optimized_params=learning_algo.getAllParams() - print ("optimized_params") - print (optimized_params) - - # --- Instantiate learning_algo --- -# learning_algo = CRAR( +# ### +# # TRANSFER +# ### +# optimized_params=learning_algo.getAllParams() +# print ("optimized_params") +# print (optimized_params) +# +# # --- Instantiate learning_algo --- +## learning_algo = CRAR( +## env, +## parameters.rms_decay, +## parameters.rms_epsilon, +## parameters.momentum, +## parameters.clip_delta, +## parameters.freeze_interval, +## parameters.batch_size, +## parameters.update_rule, +## rng, +## double_Q=True, +## high_int_dim=HIGH_INT_DIM, +## internal_dim=3) +## learning_algo.setAllParams(optimized_params) +# +# rand_ind=np.random.random_integers(0,20000,samples_transfer) +# original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] +# transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] +# +# print ("original[0][0:10], transfer[0][0:10]") +# print (original[0][0:10], transfer[0][0:10]) +# +# # Transfer between the two repr +# #learning_algo.transfer(original, transfer, 5000000/samples_transfer) +# +# +# # --- Re instantiate environment with reverse=True --- +# env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) +# +# # --- Re instantiate agent --- +# agent = NeuralAgent( # env, -# parameters.rms_decay, -# parameters.rms_epsilon, -# parameters.momentum, -# parameters.clip_delta, -# parameters.freeze_interval, +# learning_algo, +# parameters.replay_memory_size, +# max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), # 
parameters.batch_size, -# parameters.update_rule, # rng, -# double_Q=True, -# high_int_dim=HIGH_INT_DIM, -# internal_dim=3) -# learning_algo.setAllParams(optimized_params) - - rand_ind=np.random.random_integers(0,20000,samples_transfer) - original=[np.array([[agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - transfer=[np.array([[-agent._dataset._observations[o]._data[rand_ind[n]+l] for l in range(1)] for n in range(samples_transfer)]) for o in range(1)] - - print ("original[0][0:10], transfer[0][0:10]") - print (original[0][0:10], transfer[0][0:10]) - - # Transfer between the two repr - #learning_algo.transfer(original, transfer, 5000000/samples_transfer) - - - # --- Re instantiate environment with reverse=True --- - env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) - - # --- Re instantiate agent --- - agent = NeuralAgent( - env, - learning_algo, - parameters.replay_memory_size, - max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), - parameters.batch_size, - rng, - test_policy=test_policy) - - # --- Bind controllers to the agent --- - # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and - # learning rate as well as the training epoch number. - agent.attach(bc.VerboseController( - evaluate_on='epoch', - periodicity=1)) - - # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we - # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. - agent.attach(bc.LearningRateController( - initial_learning_rate=parameters.learning_rate, - learning_rate_decay=parameters.learning_rate_decay, - periodicity=1)) - - # Same for the discount factor. - agent.attach(bc.DiscountFactorController( - initial_discount_factor=parameters.discount, - discount_factor_growth=parameters.discount_inc, - discount_factor_max=parameters.discount_max, - periodicity=1)) - - # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy - # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more - # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every - # episode or epoch (or never, hence the resetEvery='none'). - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - agent.run(1, N_SAMPLES) - #print (agent._dataset._rewards._data[0:500]) - #print (agent._dataset._terminals._data[0:500]) - print("end gathering data") - # Setting the dataset to be the same than the old one (but modif for the observations) - agent._dataset._rewards._data=old_rewards - agent._dataset._terminals._data=old_terminals - agent._dataset._actions._data=old_actions - agent._dataset._observations[0]._data=-old_observations - - # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. - # Plus, we also want to display after each training episode (!= than after every training) the average bellman - # residual and the average of the V values obtained during the last episode, hence the two last arguments. 
- agent.attach(bc.TrainerController( - evaluate_on='action', - periodicity=parameters.update_frequency, - show_episode_avg_V_value=True, - show_avg_Bellman_residual=True)) - - # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a - # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want - # these validation epoch to interfere with the training of the agent, which is well established by the - # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole - # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the - # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards - # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every - # [parameters.period_btw_summary_perfs] *validation* epochs. - agent.attach(bc.InterleavedTestEpochController( - id=test_env.VALIDATION_MODE, - epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], - periodicity=2, - show_score=True, - summarize_every=1)) - - -# agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE+1, -# epoch_length=parameters.steps_per_test, -# controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], -# periodicity=2, -# show_score=True, -# summarize_every=1)) +# test_policy=test_policy) # -# agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE+2, -# epoch_length=parameters.steps_per_test, -# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], -# periodicity=2, -# show_score=True, -# summarize_every=1)) +# # --- Bind controllers to the agent --- +# # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and +# # learning rate as well as the training epoch number. +# agent.attach(bc.VerboseController( +# evaluate_on='epoch', +# periodicity=1)) +# +# # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we +# # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. +# agent.attach(bc.LearningRateController( +# initial_learning_rate=parameters.learning_rate, +# learning_rate_decay=parameters.learning_rate_decay, +# periodicity=1)) # +# # Same for the discount factor. +# agent.attach(bc.DiscountFactorController( +# initial_discount_factor=parameters.discount, +# discount_factor_growth=parameters.discount_inc, +# discount_factor_max=parameters.discount_max, +# periodicity=1)) +# +# # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy +# # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more +# # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every +# # episode or epoch (or never, hence the resetEvery='none'). 
+# agent.attach(bc.EpsilonController( +# initial_e=parameters.epsilon_start, +# e_decays=parameters.epsilon_decay, +# e_min=parameters.epsilon_min, +# evaluate_on='action', +# periodicity=1, +# reset_every='none')) +# +# agent.run(1, N_SAMPLES) +# #print (agent._dataset._rewards._data[0:500]) +# #print (agent._dataset._terminals._data[0:500]) +# print("end gathering data") +# # Setting the dataset to be the same than the old one (but modif for the observations) +# agent._dataset._rewards._data=old_rewards +# agent._dataset._terminals._data=old_terminals +# agent._dataset._actions._data=old_actions +# agent._dataset._observations[0]._data=-old_observations +# +# # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. +# # Plus, we also want to display after each training episode (!= than after every training) the average bellman +# # residual and the average of the V values obtained during the last episode, hence the two last arguments. +# agent.attach(bc.TrainerController( +# evaluate_on='action', +# periodicity=parameters.update_frequency, +# show_episode_avg_V_value=True, +# show_avg_Bellman_residual=True)) +# +# # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a +# # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want +# # these validation epoch to interfere with the training of the agent, which is well established by the +# # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole +# # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the +# # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards +# # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every +# # [parameters.period_btw_summary_perfs] *validation* epochs. 
# agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE+3, +# id=test_env.VALIDATION_MODE, # epoch_length=parameters.steps_per_test, -# controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], +# controllers_to_disable=[0, 1, 2, 3, 4], # periodicity=2, # show_score=True, # summarize_every=1)) - - agent.gathering_data=False - agent.run(parameters.epochs, parameters.steps_per_epoch) - +# +# +## agent.attach(bc.InterleavedTestEpochController( +## id=test_env.VALIDATION_MODE+1, +## epoch_length=parameters.steps_per_test, +## controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], +## periodicity=2, +## show_score=True, +## summarize_every=1)) +## +## agent.attach(bc.InterleavedTestEpochController( +## id=test_env.VALIDATION_MODE+2, +## epoch_length=parameters.steps_per_test, +## controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], +## periodicity=2, +## show_score=True, +## summarize_every=1)) +## +## agent.attach(bc.InterleavedTestEpochController( +## id=test_env.VALIDATION_MODE+3, +## epoch_length=parameters.steps_per_test, +## controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], +## periodicity=2, +## show_score=True, +## summarize_every=1)) +# +# agent.gathering_data=False +# agent.run(parameters.epochs, parameters.steps_per_epoch) +# # --- Show results --- diff --git a/examples/maze/test_env4.py b/examples/maze/test_env4.py index a36dd4d4..1c68546e 100644 --- a/examples/maze/test_env4.py +++ b/examples/maze/test_env4.py @@ -99,7 +99,7 @@ def reset(self, mode): else: self._mode_episode_count += 1 - #print ("reset mode:"+str(mode)+".") + print ("reset mode:"+str(mode)+".") #print "self._pos_agent,self._pos_walls,self._pos_rewards" #print self._pos_agent,self._pos_walls,self._pos_rewards @@ -133,7 +133,7 @@ def act(self, action): return self.reward - def summarizePerformance(self, test_data_set, learning_algo): + def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): print ("test_data_set.observations.shape") print (test_data_set.observations()[0][0:1]) @@ -192,8 +192,8 @@ def observe(self): for i in indices_agent: self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs self._map=(self._map*2)-1 #scaling - #print "self._map higher_dim_obs" - #print self._map + #print ("self._map higher_dim_obs") + #print (self._map) #plt.imshow(self._map, cmap='gray_r') #plt.show() else: From 36d17b347382db642d94ba0de5fa8ba4a55bbc9e Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 6 Sep 2018 16:01:01 -0400 Subject: [PATCH 87/96] update requirements --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fac4321d..694b63c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ numpy>=1.10 joblib>=0.9 -theano>=0.8 -matplotlib>=1.1.1 \ No newline at end of file +matplotlib>=1.1.1 +keras>=2.1.1 \ No newline at end of file From 541d5dd7af13ade347e1dbd08c3efb07a5b5ec9c Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 6 Sep 2018 16:22:13 -0400 Subject: [PATCH 88/96] minor modifs catcher --- examples/test_CRAR/catcher_env.py | 10 +++++++++- examples/test_CRAR/run_catcher.py | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/test_CRAR/catcher_env.py b/examples/test_CRAR/catcher_env.py index ffb20f22..819d61a2 100644 --- a/examples/test_CRAR/catcher_env.py +++ b/examples/test_CRAR/catcher_env.py @@ -92,11 +92,19 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): """ all_possib_inp=[] + #labels=[] for x_b in 
range(self._nx_block):#[1]:#range(self._nx_block): for y_b in range(self._height): for x_p in range(self._width-self._width_paddle+1): state=self.get_observation(y_b,x_b*((self._width-1)//(self._nx_block-1)),x_p) all_possib_inp.append(state) + + #labels.append(x_b) + + #arr=np.array(all_possib_inp) + #arr=arr.reshape(arr.shape[0],-1) + #np.savetxt('tsne_python/catcherH_X.txt',arr.reshape(arr.shape[0],-1)) + #np.savetxt('tsne_python/cacherH_labels.txt',np.array(labels)) all_possib_inp=np.expand_dims(all_possib_inp,axis=1) all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) @@ -242,7 +250,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): plt.show() for ii in range(-15,345,30): ax.view_init(elev=20., azim=ii) - plt.savefig('fig_w_V_div5_'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') + plt.savefig('fig_w_V_div5_forcelr_forcessdiv2'+str(learning_algo.update_counter)+'_'+str(ii)+'.pdf') # fig_visuV diff --git a/examples/test_CRAR/run_catcher.py b/examples/test_CRAR/run_catcher.py index ae383b3c..f1666ec7 100644 --- a/examples/test_CRAR/run_catcher.py +++ b/examples/test_CRAR/run_catcher.py @@ -22,7 +22,7 @@ class Defaults: # Experiment Parameters # ---------------------- STEPS_PER_EPOCH = 2000 - EPOCHS = 20 + EPOCHS = 50 STEPS_PER_TEST = 500 PERIOD_BTW_SUMMARY_PERFS = 1 @@ -35,8 +35,8 @@ class Defaults: # DQN Agent parameters: # ---------------------- UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 - LEARNING_RATE_DECAY = 1 + LEARNING_RATE = 0.0005 + LEARNING_RATE_DECAY = 0.9 DISCOUNT = 0.9 DISCOUNT_INC = 1 DISCOUNT_MAX = 0.99 @@ -54,7 +54,7 @@ class Defaults: DETERMINISTIC = False -HIGHER_DIM_OBS = False +HIGHER_DIM_OBS = True HIGH_INT_DIM = False if __name__ == "__main__": From 687017d378e8d410bf4e5f178fd4f0b0c030fb88 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Thu, 6 Sep 2018 16:43:53 -0400 Subject: [PATCH 89/96] simple maze --- examples/test_CRAR/run_simple_maze.py | 8 ++++---- examples/test_CRAR/simple_maze_env.py | 26 ++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/examples/test_CRAR/run_simple_maze.py b/examples/test_CRAR/run_simple_maze.py index 88bde72b..ecc12a85 100644 --- a/examples/test_CRAR/run_simple_maze.py +++ b/examples/test_CRAR/run_simple_maze.py @@ -23,7 +23,7 @@ class Defaults: # ---------------------- STEPS_PER_EPOCH = 5000 EPOCHS = 50 - STEPS_PER_TEST = 500 + STEPS_PER_TEST = 1000 PERIOD_BTW_SUMMARY_PERFS = 1 # ---------------------- @@ -54,7 +54,7 @@ class Defaults: DETERMINISTIC = False - +HIGHER_DIM_OBS = True if __name__ == "__main__": logging.basicConfig(level=logging.INFO) @@ -67,7 +67,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = simple_maze_env(rng, higher_dim_obs=False) + env = simple_maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS) # --- Instantiate learning_algo --- learning_algo = CRAR( @@ -134,7 +134,7 @@ class Defaults: periodicity=1, reset_every='none')) - agent.run(10, 100) #(5, 50) + agent.run(10, 500) print("end gathering data") # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 
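Both CRAR example environments in these commits visualise learning the same way: every possible observation is enumerated, pushed through the encoder, and the resulting low-dimensional abstract states are scatter-plotted (the fig_w_V_* figures saved above). A self-contained toy version of that projection step, with a stand-in encoder rather than the repository's model:

import numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense

toy_encoder = Sequential([Flatten(input_shape=(1, 8, 8)),
                          Dense(3, activation='tanh')])

# Enumerate a batch of fake 8x8 "observations", add the history axis, encode.
all_observations = [i / 10. * np.eye(8)[np.newaxis] for i in range(10)]
batch = np.array(all_observations)              # shape (10, 1, 8, 8)
abstract_states = toy_encoder.predict(batch)    # shape (10, 3)
print(abstract_states.shape)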
diff --git a/examples/test_CRAR/simple_maze_env.py b/examples/test_CRAR/simple_maze_env.py index 6d17c936..3ccab93f 100644 --- a/examples/test_CRAR/simple_maze_env.py +++ b/examples/test_CRAR/simple_maze_env.py @@ -105,6 +105,7 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): """ all_possib_inp=[] # Will store all possible inputs (=observation) for the CRAR agent + labels_maze=[] self.create_map() for y_a in range(self._size_maze): for x_a in range(self._size_maze): @@ -116,7 +117,23 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): else: state[x_a,y_a]=0.5 all_possib_inp.append(state) - + + ## labels + #if(y_a 'channels_first' - n=500 + n=1000 historics=[] for i,observ in enumerate(test_data_set.observations()[0][0:n]): historics.append(np.expand_dims(observ,axis=0)) @@ -179,10 +196,11 @@ def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): # Plot the dots at each time step depending on the action taken length_block=[[0,18],[18,19],[19,31]] for i in range(3): + colors=['blue','orange','green'] if(self.intern_dim==2): - line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], s=30, marker='x', edgecolors='k', alpha=0.5) + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100) else: - line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], s=30, marker='x', depthshade=True, edgecolors='k', alpha=0.5) + line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50) if(self.intern_dim==2): axes_lims=[ax.get_xlim(),ax.get_ylim()] From cf32f1f4358eeed99a39579f77e9eaeeea3956c5 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Fri, 7 Sep 2018 13:27:36 -0400 Subject: [PATCH 90/96] fix q_net_keras.py --- deer/learning_algos/q_net_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deer/learning_algos/q_net_keras.py b/deer/learning_algos/q_net_keras.py index f376546e..13d98538 100644 --- a/deer/learning_algos/q_net_keras.py +++ b/deer/learning_algos/q_net_keras.py @@ -126,7 +126,7 @@ def train(self, states_val, actions_val, rewards_val, next_states_val, terminals else: max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) - not_terminals=np.ones_like(terminals_val) - terminals_val + not_terminals=np.invert(terminals_val).astype(float) target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) From 2157d3569cef1d0267b06498e39a6f153aa06e7e Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Sat, 8 Sep 2018 19:45:35 -0400 Subject: [PATCH 91/96] fix travis --- ci_scripts/install.sh | 21 +++++++++++++++++---- ci_scripts/test.sh | 8 -------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index ec4a9c83..30937c3d 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -15,7 +15,7 @@ ls -l echo if [[ ! 
-f miniconda.sh ]] then - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ #Miniconda3-4.5.4-Linux-x86_64.sh \ -O miniconda.sh fi chmod +x miniconda.sh && ./miniconda.sh -b @@ -30,11 +30,23 @@ popd # Configure the conda environment and put it in the path using the # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ - numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ - matplotlib theano=$THEANO_VERSION joblib + numpy +conda install libgcc -y source activate testenv - +pip install --upgrade pip +pip install scipy +pip install keras +pip install tensorflow +pip install matplotlib +pip install joblib +pip install cython + +#if [[ "$PYTHON_VERSION" == "2.7" ]]; then +# pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.1-cp27-none-linux_x86_64.whl # tensorflow +#elif [[ "$PYTHON_VERSION" == "3.5" ]]; then +# pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.1-cp35-cp35m-linux_x86_64.whl +#fi if [[ "$COVERAGE" == "true" ]]; then pip install coverage coveralls @@ -44,5 +56,6 @@ python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "import theano; print('theano %s' % theano.__version__)" +python -c "import tensorflow; print('tensorflow %s' % tensorflow.__version__)" python setup.py develop diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh index 661b631c..0ac37927 100644 --- a/ci_scripts/test.sh +++ b/ci_scripts/test.sh @@ -13,15 +13,7 @@ elif [[ "$EXAMPLE" == "mountain_car" ]]; then python run_mountain_car.py --epochs 5 pip -V pip - pip install --upgrade pip - conda install libgcc -y - if [[ "$PYTHON_VERSION" == "2.7" ]]; then - pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0-cp27-none-linux_x86_64.whl; - elif [[ "$PYTHON_VERSION" == "3.5" ]]; then - pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0-cp35-cp35m-linux_x86_64.whl; - fi -# pip install keras # python run_mountain_car_continuous.py --epochs 5 else From 145e2e93a2d715307b32c8fb3138d6db4a10067e Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 10 Sep 2018 10:26:49 -0400 Subject: [PATCH 92/96] fixes --- deer/learning_algos/NN_CRAR_keras.py | 3 ++- deer/learning_algos/NN_keras.py | 16 +++++++++++----- examples/ALE/run_ALE.py | 10 +++------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/deer/learning_algos/NN_CRAR_keras.py b/deer/learning_algos/NN_CRAR_keras.py index e3f27fe1..5adf9a71 100644 --- a/deer/learning_algos/NN_CRAR_keras.py +++ b/deer/learning_algos/NN_CRAR_keras.py @@ -64,10 +64,11 @@ def encoder_model(self): input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) inputs.append(input) input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + x=Permute((2,3,1), input_shape=(dim[-4]*dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' else: input = Input(shape=(dim[-3],dim[-2],dim[-1])) inputs.append(input) - x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' + x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' if(dim[-2]>12 and dim[-1]>12): self._pooling_encoder=6 diff --git a/deer/learning_algos/NN_keras.py b/deer/learning_algos/NN_keras.py index 56273aff..2edb37cb 100644 --- 
a/deer/learning_algos/NN_keras.py +++ b/deer/learning_algos/NN_keras.py @@ -38,11 +38,17 @@ def _buildDQN(self): for i, dim in enumerate(self._input_dimensions): # - observation[i] is a FRAME - if len(dim) == 3: - input = Input(shape=(dim[0],dim[1],dim[2])) - inputs.append(input) - reshaped=Permute((2,3,1), input_shape=(dim[0],dim[1],dim[2]))(input) #data_format='channels_last' - x = Conv2D(8, (4, 4), activation='relu', padding='valid')(reshaped) #Conv on the frames + if len(dim) == 3 or len(dim) == 4: + if(len(dim) == 4): + input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) + inputs.append(input) + input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) + x=Permute((2,3,1), input_shape=(dim[-4]*dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' + else: + input = Input(shape=(dim[-3],dim[-2],dim[-1])) + inputs.append(input) + x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' + x = Conv2D(8, (4, 4), activation='relu', padding='valid')(x) #Conv on the frames x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames diff --git a/examples/ALE/run_ALE.py b/examples/ALE/run_ALE.py index c6bb35b9..256a1d8a 100644 --- a/examples/ALE/run_ALE.py +++ b/examples/ALE/run_ALE.py @@ -10,7 +10,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent -from deer.learning_algos.CRAR_keras import CRAR +from deer.learning_algos.q_net_keras import MyQNetwork from ALE_env_gym import MyEnv as ALE_env import deer.experiment.base_controllers as bc @@ -52,8 +52,6 @@ class Defaults: FREEZE_INTERVAL = 2500 DETERMINISTIC = True -HIGH_INT_DIM = True - if __name__ == "__main__": logging.basicConfig(level=logging.INFO) @@ -74,7 +72,7 @@ class Defaults: env = ALE_env(rng, game=parameters.param1, frame_skip=parameters.frame_skip) # --- Instantiate qnetwork --- - qnetwork = CRAR( + qnetwork = MyQNetwork( env, parameters.rms_decay, parameters.rms_epsilon, @@ -84,9 +82,7 @@ class Defaults: parameters.batch_size, parameters.update_rule, rng, - double_Q=True, - high_int_dim=HIGH_INT_DIM, - internal_dim=3) + double_Q=True) train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) 
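run_ALE.py switches back to MyQNetwork here, and that learner is the one whose terminal handling was corrected a few commits above by replacing arithmetic on the boolean terminals array with np.invert(...).astype(float). A quick numpy check of what the mask does in the Q-learning target, with toy numbers rather than anything from the repository:

import numpy as np

terminals = np.array([False, True, False])
rewards = np.array([0.0, 1.0, 0.5])
max_next_q = np.array([2.0, 3.0, 1.0])
discount = 0.95

not_terminals = np.invert(terminals).astype(float)     # [1., 0., 1.]
target = rewards + not_terminals * discount * max_next_q
print(target)                                          # [1.9, 1.0, 1.45]
# np.ones_like(terminals) - terminals only behaved this way while terminals was
# numeric; with a boolean array, the explicit invert-and-cast gives the intended mask.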
test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) From 7caf4ab8f891bf467ec842a35855800bfbc072b3 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 10 Sep 2018 16:17:06 -0400 Subject: [PATCH 93/96] fixes --- .../MG_two_storages/MG_two_storages_env.py | 12 +++---- .../MG_two_storages/run_MG_two_storages.py | 4 +-- examples/gym/mountain_car_env.py | 36 ++++++++++++------- examples/gym/run_mountain_car.py | 8 ++++- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/examples/MG_two_storages/MG_two_storages_env.py b/examples/MG_two_storages/MG_two_storages_env.py index 8561871f..0213f79e 100644 --- a/examples/MG_two_storages/MG_two_storages_env.py +++ b/examples/MG_two_storages/MG_two_storages_env.py @@ -33,8 +33,8 @@ def __init__(self, rng, reduce_qty_data=None, length_history=None, start_history reduce_qty_data=int(reduce_qty_data) if reduce_qty_data is not None else int(1) length_history=int(length_history) if length_history is not None else int(12) start_history=int(start_history) if start_history is not None else int(0) - print "reduce_qty_data, length_history, start_history" - print reduce_qty_data, length_history, start_history + print ("reduce_qty_data, length_history, start_history") + print (reduce_qty_data, length_history, start_history) # Defining the type of environment self._dist_equinox=0 self._pred=0 @@ -86,8 +86,8 @@ def __init__(self, rng, reduce_qty_data=None, length_history=None, start_history self.production_valid=self.production_valid_norm*12000./1000.*inc_sizing self.production_test=self.production_test_norm*12000/1000*inc_sizing - print "self.production_train brefore" - print self.production_train + print ("self.production_train brefore") + print (self.production_train) ### ### Artificially reducing the variety of the training and validation time series @@ -110,8 +110,8 @@ def __init__(self, rng, reduce_qty_data=None, length_history=None, start_history self.production_train_norm[season*nd_one_seas:(season+1)*nd_one_seas]=np.tile(self.production_train_norm[int((season+(self._start_history+0.)/self._reduce_qty_data)*nd_one_seas):int((season+(self._start_history+1.)/self._reduce_qty_data)*nd_one_seas)], self._reduce_qty_data) self.production_valid_norm[season*nd_one_seas:(season+1)*nd_one_seas]=np.tile(self.production_valid_norm[int((season+(self._start_history+0.)/self._reduce_qty_data)*nd_one_seas):int((season+(self._start_history+1.)/self._reduce_qty_data)*nd_one_seas)], self._reduce_qty_data) - print "self.production_train after" - print self.production_train + print ("self.production_train after") + print (self.production_train) self.min_production=min(self.production_train) self.max_production=max(self.production_train) diff --git a/examples/MG_two_storages/run_MG_two_storages.py b/examples/MG_two_storages/run_MG_two_storages.py index e6913b7a..904cb0c1 100644 --- a/examples/MG_two_storages/run_MG_two_storages.py +++ b/examples/MG_two_storages/run_MG_two_storages.py @@ -63,7 +63,7 @@ class Defaults: # --- Parse parameters --- parameters = process_args(sys.argv[1:], Defaults) - print parameters.deterministic + if parameters.deterministic: rng = np.random.RandomState(123456) else: @@ -74,8 +74,6 @@ class Defaults: # That way, the number of days in each season is divisible by parameters.param1 and it is thus possible # to reduce the variety of the data within each season in the time series by a factor of parameters.param1 
parameters.steps_per_epoch=parameters.steps_per_epoch-(parameters.steps_per_epoch%(24*4*int(parameters.param1)))-1 - print "parameters.steps_per_epoch" - print parameters.steps_per_epoch # --- Instantiate environment --- env = MG_two_storages_env(rng, parameters.param1, parameters.param2, parameters.param3) diff --git a/examples/gym/mountain_car_env.py b/examples/gym/mountain_car_env.py index 06be69e5..65338a22 100644 --- a/examples/gym/mountain_car_env.py +++ b/examples/gym/mountain_car_env.py @@ -12,21 +12,31 @@ def __init__(self, rng): Arguments: rng - the numpy random number generator """ - self.env = gym.make('MountainCar-v0') + gym.envs.register( + id='MountainCarModified-v0', + entry_point='gym.envs.classic_control:MountainCarEnv', + max_episode_steps=500, # MountainCar-v0 uses 200 + reward_threshold=-110.0, + ) + + self.env = gym.make('MountainCarModified-v0') + self.env.max_episode_steps = 500 self.rng=rng self._last_observation = self.env.reset() self.is_terminal=False - self._input_dim = [(1,), (1,)] # self.env.observation_space.shape is equal to 4 + self._input_dim = [(1,), (1,)] # self.env.observation_space.shape is equal to 2 # and we use only the current observation in the pseudo-state def act(self, action): """ Simulate one time step in the environment. """ reward=0 - for _ in range(5): + nsteps=10 + for _ in range(nsteps): self._last_observation, r, self.is_terminal, info = self.env.step(action) reward+=r if(self.is_terminal==True): + reward+=3*nsteps break if (self.mode==0): # Show the policy only at test time @@ -35,15 +45,14 @@ def act(self, action): except: pass #print("Warning:", sys.exc_info()[0]) - - s=copy.deepcopy(self._last_observation) - + + #s=copy.deepcopy(self._last_observation) ## Possibility to add a reward shaping for faster convergence #s[0]+=math.pi/6 #if(s[0]>0): # reward+=pow(s[0],2)#np.linalg.norm(s[0]) - - return reward + + return reward/nsteps def reset(self, mode=0): """ Reset environment for a new episode. @@ -55,11 +64,12 @@ def reset(self, mode=0): self.mode=mode self._last_observation = self.env.reset() - if (self.mode==-1): # Reset to a random value when in training mode (that allows to increase exploration) - high=self.env.observation_space.high - low=self.env.observation_space.low - self._last_observation=low+self.rng.rand(2)*(high-low) - self.env.state=self._last_observation + # DEEPRECATED + #if (self.mode==-1): # Reset to a random value when in training mode (that allows to increase exploration) + # high=self.env.observation_space.high + # low=self.env.observation_space.low + # self._last_observation=low+self.rng.rand(2)*(high-low) + # self.env.env.state=self._last_observation self.is_terminal=False diff --git a/examples/gym/run_mountain_car.py b/examples/gym/run_mountain_car.py index 9848972d..1f737485 100644 --- a/examples/gym/run_mountain_car.py +++ b/examples/gym/run_mountain_car.py @@ -12,6 +12,7 @@ from deer.agent import NeuralAgent from deer.learning_algos.q_net_keras import MyQNetwork from mountain_car_env import MyEnv as mountain_car_env +from deer.policies import EpsilonGreedyPolicy,LongerExplorationPolicy class Defaults: # ---------------------- @@ -75,6 +76,9 @@ class Defaults: rng, double_Q=True) + train_policy = LongerExplorationPolicy(qnetwork, env.nActions(), rng, 1.0)#EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) + test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 
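The mountain-car wrapper above now repeats each agent-level action for several simulator steps, adds a bonus when the goal is reached during the repeat, and returns the averaged reward. A minimal standalone sketch of that scheme (the wrapper's own act() also handles rendering and train/test modes; nsteps and the bonus follow the diff, the helper name is made up):

def repeat_action(env, action, nsteps=10, bonus=3.0):
    """Apply one agent action for nsteps gym steps and average the reward."""
    total, done = 0.0, False
    for _ in range(nsteps):
        _, r, done, _ = env.step(action)   # classic 4-tuple gym step API, as used in the diff
        total += r
        if done:
            total += bonus * nsteps        # mirrors the early-termination bonus above
            break
    return total / nsteps, done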
+ # --- Instantiate agent --- agent = NeuralAgent( env, @@ -83,7 +87,9 @@ class Defaults: max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, rng, - exp_priority=1.) + exp_priority=1., + train_policy=train_policy, + test_policy=test_policy) # --- Bind controllers to the agent --- # For comments, please refer to run_toy_env.py From e7d070015b1abaf6ea932f57f0e095752a0daeeb Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 10 Sep 2018 16:23:05 -0400 Subject: [PATCH 94/96] remove PLE --- examples/PLE/PLE_env.py | 376 ---------------------------------------- examples/PLE/run_PLE.py | 195 --------------------- 2 files changed, 571 deletions(-) delete mode 100644 examples/PLE/PLE_env.py delete mode 100644 examples/PLE/run_PLE.py diff --git a/examples/PLE/PLE_env.py b/examples/PLE/PLE_env.py deleted file mode 100644 index b4bd8788..00000000 --- a/examples/PLE/PLE_env.py +++ /dev/null @@ -1,376 +0,0 @@ -""" Interface with the PLE environment - -""" - -import numpy as np -import cv2 -from ple import PLE - -from deer.base_classes import Environment - -import matplotlib -matplotlib.use('qt5agg') -from mpl_toolkits.axes_grid1 import host_subplot -import mpl_toolkits.axisartist as AA -import matplotlib.pyplot as plt -import sys -sys.path.insert(0, 'all_frames') -import plot_all_frames - -class MyEnv(Environment): - VALIDATION_MODE = 0 - - def __init__(self, rng, game=None, frame_skip=2, width=64, height=64, - ple_options={"display_screen": True, "force_fps":True, "fps":15}): - - self._mode = -1 - self._mode_score = 0.0 - self._mode_episode_count = 0 - - self._frame_skip = frame_skip if frame_skip >= 1 else 1 - self._random_state = rng - self.width=width - self.height=height - - if game is None: - raise ValueError("Game must be provided") - - self._ple = PLE(game, **ple_options) - self._ple.game.rng = rng - self._ple.init() - - w, h = self._ple.getScreenDims() - self._screen = np.empty((h, w), dtype=np.uint8) - self._reduced_screen = np.empty((32, 32), dtype=np.uint8) - self._actions = self._ple.getActionSet() - - - def reset(self, mode): - if mode == MyEnv.VALIDATION_MODE: - if self._mode != MyEnv.VALIDATION_MODE: - self._mode = MyEnv.VALIDATION_MODE - self._mode_score = 0.0 - self._mode_episode_count = 0 - # fix the seed for every new validation. It potentially removes one source of variance and - # it allows to show some illustration of the learning for the same setting in validation - #self._ple.game.rng = np.random.RandomState(23) # 23:left, center, right, ... 
- else: - self._mode_episode_count += 1 - elif self._mode != -1: # and thus mode == -1 - self._mode = -1 - - - self._ple.reset_game() - #for _ in range(self._ple.rng.randint(15)): - # self._ple.act(self._ple.NOOP) - self._screen = self._ple.getScreenGrayscale() - cv2.resize(self._screen, (32, 32), self._reduced_screen, interpolation=cv2.INTER_NEAREST) - - return [1 * [32 * [32 * [0]]]] - - - def act(self, action): - #print action - #print self._actions - #if self._mode == MyEnv.VALIDATION_MODE: - # action=0 - action = self._actions[action] - - - self.reward = 0 - for _ in range(self._frame_skip): - self.reward += self._ple.act(action) - if self.inTerminalState(): - break - - self._screen = self._ple.getScreenGrayscale() - cv2.resize(self._screen, (32, 32), self._reduced_screen, interpolation=cv2.INTER_NEAREST) - - self._mode_score += self.reward - return np.sign(self.reward) - - def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): - all_possib_inp=np.expand_dims(np.array(plot_all_frames.get_all_possib_inp(self.width,self.height)),axis=1)/256. - #print "all_possib_inp[0]" - print "all_possib_inp.shape" - print all_possib_inp.shape - #print all_possib_inp[0] - #print all_possib_inp[224] - #print all_possib_inp[225] - #print "all_possib_inp[449]" - #print all_possib_inp[449] - #print all_possib_inp[450] - all_possib_abs_states=learning_algo.encoder.predict(all_possib_inp) - print "np.array(all_possib_abs_states).shape" - print np.array(all_possib_abs_states).shape - #print all_possib_abs_states[0,0] - #print "test_data_set.observations.shape" - #print test_data_set.observations()[0][0:1] - n=14 - historics=[] - for i,observ in enumerate(test_data_set.observations()[0][1:n]): - historics.append(np.expand_dims(observ,axis=0)) -# for i,observ in enumerate(test_data_set.observations()[0][0:n+1]): -# if(i0): -# historics[i-1]=np.concatenate([historics[i-1],np.expand_dims(observ,axis=0)], axis=0) - historics=np.array(historics) - #print "historics[0]" - #print historics.shape - #print historics[0] - abs_states=learning_algo.encoder.predict(historics) - print abs_states - actions=test_data_set.actions()[1:n] #instead of 0:n because history of 2 time steps considered - print actions - print test_data_set.rewards()[1:n] - if self.inTerminalState() == False: - self._mode_episode_count += 1 - print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) - - - import matplotlib.pyplot as plt - from mpl_toolkits.mplot3d import Axes3D - import matplotlib.cm as cm - m = cm.ScalarMappable(cmap=cm.jet) - - x = np.array(abs_states)[:,0] - y = np.array(abs_states)[:,1] - z = np.array(abs_states)[:,2] - - #Colors - #onehot_actions = np.zeros((n, 4)) - #onehot_actions[np.arange(n), actions] = 1 - - # Plot the trajectory - fig = plt.figure() - ax = fig.add_subplot(111,projection='3d') - for i in xrange(n-1): - ax.plot(x[i:i+2], y[i:i+2], z[i:i+2], color=plt.cm.cool(255*i/n), alpha=0.5) - - # Plot the fitted one-step trajectory from time t=10 - for i in range(n-1): - predicted1=learning_algo.transition.predict([abs_states[i:i+1],np.array([[1,0,0]])]) - predicted2=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,1,0]])]) - predicted3=learning_algo.transition.predict([abs_states[i:i+1],np.array([[0,0,1]])]) - ax.plot(np.concatenate([x[i:i+1],predicted3[0,:1]]), np.concatenate([y[i:i+1],predicted3[0,1:2]]), np.concatenate([z[i:i+1],predicted3[0,2:3]]), color="0.23", alpha=0.75) #black - 
ax.plot(np.concatenate([x[i:i+1],predicted2[0,:1]]), np.concatenate([y[i:i+1],predicted2[0,1:2]]), np.concatenate([z[i:i+1],predicted2[0,2:3]]), color="0.57", alpha=0.75) #grey - ax.plot(np.concatenate([x[i:i+1],predicted1[0,:1]]), np.concatenate([y[i:i+1],predicted1[0,1:2]]), np.concatenate([z[i:i+1],predicted1[0,2:3]]), color="0.9", alpha=0.75) #white - -# for xx in [-2,-1.,0, 1., 2.]: -# for yy in [-2,-1.,0, 1., 2.]: -# for zz in [-2,-1.,0, 1., 2.]: -# predicted1=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[1,0,0]])]) -# predicted2=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,1,0]])]) -# predicted3=learning_algo.transition2.predict([np.array([[xx,yy,zz]]),np.array([[0,0,1]])]) -# ax.plot(np.concatenate([np.array([xx]),predicted1[0,:1]]), np.concatenate([np.array([yy]),predicted1[0,1:2]]), np.concatenate([np.array([zz]),predicted1[0,2:]]), color="1", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted2[0,:1]]), np.concatenate([np.array([yy]),predicted2[0,1:2]]), np.concatenate([np.array([zz]),predicted2[0,2:]]), color="0.5", alpha=0.5) -# ax.plot(np.concatenate([np.array([xx]),predicted3[0,:1]]), np.concatenate([np.array([yy]),predicted3[0,1:2]]), np.concatenate([np.array([zz]),predicted3[0,2:]]), color="0", alpha=0.5) - #ax.plot(np.concatenate([x[i:i+1],predicted[0,:1]]), np.concatenate([y[i:i+1],predicted[0,1:2]]), np.concatenate([z[i:i+1],predicted[0,2:]]), color="g") - - - # Plot the colorbar for the trajectory - fig.subplots_adjust(right=0.7) - ax1 = fig.add_axes([0.725, 0.15, 0.025, 0.7]) - # Set the colormap and norm to correspond to the data for which the colorbar will be used. - cmap = matplotlib.cm.cool - norm = matplotlib.colors.Normalize(vmin=0, vmax=1) - - # ColorbarBase derives from ScalarMappable and puts a colorbar in a specified axes, so it has - # everything needed for a standalone colorbar. There are many more kwargs, but the - # following gives a basic continuous colorbar with ticks and labels. 
- cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=cmap, - norm=norm, - orientation='vertical') - cb1.set_label('Beginning to end of trajectory') - - - # Plot the dots at each time step depending on the action taken - self._nx_block=3 - length_block=15*8 - for i in range(self._nx_block): - line3 = ax.scatter(all_possib_abs_states[i*length_block:(i+1)*length_block,0], all_possib_abs_states[i*length_block:(i+1)*length_block,1] ,all_possib_abs_states[i*length_block:(i+1)*length_block,2], s=10, marker='x', depthshade=True, edgecolors='k', alpha=0.2) - print np.tile(np.expand_dims(actions,axis=1),(1,3)) - print np.tile(np.expand_dims(0.75-actions/4.,axis=1),(1,3)) - line2 = ax.scatter(x, y ,z , c=np.tile(np.expand_dims(0.9-actions/3.,axis=1),(1,3)), s=50, marker='o', edgecolors='k', depthshade=True, alpha=0.75) - axes_lims=[ax.get_xlim(),ax.get_ylim(),ax.get_zlim()] - zrange=axes_lims[2][1]-axes_lims[2][0] - - # Plot the legend for the dots - from matplotlib.patches import Circle, Rectangle - from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, DrawingArea, HPacker - box1 = TextArea(" State representation (action 0, 1 or 2) : ", textprops=dict(color="k")) #none, left and right - - box2 = DrawingArea(60, 20, 0, 0) - el1 = Circle((10, 10), 5, fc="0.9", alpha=0.75, edgecolor="k") - el2 = Circle((30, 10), 5, fc="0.57", alpha=0.75, edgecolor="k") - el3 = Circle((50, 10), 5, fc="0.23", alpha=0.75, edgecolor="k") - box2.add_artist(el1) - box2.add_artist(el2) - box2.add_artist(el3) - - box = HPacker(children=[box1, box2], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=box, pad=0., - frameon=True, - bbox_to_anchor=(0., 1.07), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - # Plot the legend for transition estimates - box1b = TextArea(" Estimated transitions (action 0, 1 or 2): ", textprops=dict(color="k")) - box2b = DrawingArea(70, 20, 0, 0) - el1b = Rectangle((5, 10), 15,2, fc="0.9", alpha=0.75) - el2b = Rectangle((25, 10), 15,2, fc="0.57", alpha=0.75) - el3b = Rectangle((45, 10), 15,2, fc="0.23", alpha=0.75) - box2b.add_artist(el1b) - box2b.add_artist(el2b) - box2b.add_artist(el3b) - - boxb = HPacker(children=[box1b, box2b], - align="center", - pad=0, sep=5) - - anchored_box = AnchoredOffsetbox(loc=3, - child=boxb, pad=0., - frameon=True, - bbox_to_anchor=(0., 0.98), - bbox_transform=ax.transAxes, - borderpad=0., - ) - ax.add_artist(anchored_box) - - ax.w_xaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - ax.w_yaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - ax.w_zaxis.set_pane_color((0.99, 0.99, 0.99, 0.99)) - #plt.show() - plt.savefig('fig_base'+str(learning_algo.update_counter)+'.pdf') - - -# # Plot the Q_vals -# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) -# #print "actions,C" -# #print actions -# #print c -# #c=np.max(c,axis=1) -# m1=ax.scatter(x, y, z+zrange/20, c=c[:,0], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) -# m2=ax.scatter(x, y, z+3*zrange/40, c=c[:,1], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) -# m3=ax.scatter(x, y, z+zrange/10, c=c[:,2], vmin=-1., vmax=1., cmap=plt.cm.RdYlGn) -# -# #plt.colorbar(m3) -# ax2 = fig.add_axes([0.85, 0.15, 0.025, 0.7]) -# cmap = matplotlib.cm.RdYlGn -# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) -# -# # ColorbarBase derives from ScalarMappable and puts a colorbar -# # in a specified axes, so it has everything needed for a -# # standalone colorbar. 
There are many more kwargs, but the -# # following gives a basic continuous colorbar with ticks -# # and labels. -# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') -# cb1.set_label('Estimated expected return') -# -# plt.savefig('fig_w_V'+str(learning_algo.update_counter)+'.pdf') -# -# -# # fig_visuV -# fig = plt.figure() -# ax = fig.add_subplot(111, projection='3d') -# -# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] -# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] -# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] -# -# c = learning_algo.Q.predict(np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1)) -# c=np.max(c,axis=1) -# #print "c" -# #print c -# -# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) -# #plt.colorbar(m) -# fig.subplots_adjust(right=0.8) -# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) -# cmap = matplotlib.cm.hot -# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) -# -# # ColorbarBase derives from ScalarMappable and puts a colorbar -# # in a specified axes, so it has everything needed for a -# # standalone colorbar. There are many more kwargs, but the -# # following gives a basic continuous colorbar with ticks -# # and labels. -# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') -# cb1.set_label('Estimated expected return') -# -# #plt.show() -# plt.savefig('fig_visuV'+str(learning_algo.update_counter)+'.pdf') -# -# -# # fig_visuR -# fig = plt.figure() -# ax = fig.add_subplot(111, projection='3d') -# -# x = np.array([i for i in range(5) for jk in range(25)])/4.*(axes_lims[0][1]-axes_lims[0][0])+axes_lims[0][0] -# y = np.array([j for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[1][1]-axes_lims[1][0])+axes_lims[1][0] -# z = np.array([k for i in range(5) for j in range(5) for k in range(5)])/4.*(axes_lims[2][1]-axes_lims[2][0])+axes_lims[2][0] -# -# coords=np.concatenate((np.expand_dims(x,axis=1),np.expand_dims(y,axis=1),np.expand_dims(z,axis=1)),axis=1) -# repeat3_coord=np.repeat(coords,3,axis=0) -# identity_matrix = np.diag(np.ones(self.nActions())) -# tile_identity_matrix=np.tile(identity_matrix,(5*5*5,1)) -# -# c = learning_algo.R.predict([repeat3_coord,tile_identity_matrix]) -# c=np.max(np.reshape(c,(125,3)),axis=1) -# #print "c" -# #print c -# #mini=np.min(c) -# #maxi=np.max(c) -# -# m=ax.scatter(x, y, z, c=c, vmin=-1., vmax=1., cmap=plt.hot()) -# #plt.colorbar(m) -# fig.subplots_adjust(right=0.8) -# ax2 = fig.add_axes([0.875, 0.15, 0.025, 0.7]) -# cmap = matplotlib.cm.hot -# norm = matplotlib.colors.Normalize(vmin=-1, vmax=1) -# -# # ColorbarBase derives from ScalarMappable and puts a colorbar -# # in a specified axes, so it has everything needed for a -# # standalone colorbar. There are many more kwargs, but the -# # following gives a basic continuous colorbar with ticks -# # and labels. 
-# cb1 = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap,norm=norm,orientation='vertical') -# cb1.set_label('Estimated expected return') -# -# #plt.show() -# plt.savefig('fig_visuR'+str(learning_algo.update_counter)+'.pdf') - - matplotlib.pyplot.close("all") # avoids memory leaks - - def inputDimensions(self): - return [(1, 32, 32)] - - def observationType(self, subject): - return np.float32 - - def nActions(self): - return len(self._actions) - - def observe(self): - return [np.array(self._reduced_screen)/256.] - - def inTerminalState(self): - #if (self.reward!=0): - # # If a reward has been observed, end the episode - # print "end!!" - # return True - #else: - return self._ple.game_over() - - - -if __name__ == "__main__": - pass \ No newline at end of file diff --git a/examples/PLE/run_PLE.py b/examples/PLE/run_PLE.py deleted file mode 100644 index 1b2d7c51..00000000 --- a/examples/PLE/run_PLE.py +++ /dev/null @@ -1,195 +0,0 @@ -"""ALE launcher. See Wiki for more details about this experiment. - -Authors: Vincent Francois-Lavet, David Taralla -""" - -import sys -import logging -import numpy as np -from joblib import hash, dump -import os - -from deer.default_parser import process_args -from deer.agent import NeuralAgent -from deer.q_networks.q_net_keras_lp import MyQNetwork -from PLE_env import MyEnv as PLE_env -import deer.experiment.base_controllers as bc - -from ple.games.catcher import Catcher - -from deer.policies import EpsilonGreedyPolicy - - -class Defaults: - # ---------------------- - # Experiment Parameters - # ---------------------- - STEPS_PER_EPOCH = 1000 - EPOCHS = 500 - STEPS_PER_TEST = 500 - PERIOD_BTW_SUMMARY_PERFS = 1 - - # ---------------------- - # Environment Parameters - # ---------------------- - FRAME_SKIP = 2 - - # ---------------------- - # DQN Agent parameters: - # ---------------------- - UPDATE_RULE = 'rmsprop' - LEARNING_RATE = 0.0002 - LEARNING_RATE_DECAY = 0.98 - DISCOUNT = 0.9 - DISCOUNT_INC = 1 - DISCOUNT_MAX = 0.99 - RMS_DECAY = 0.9 - RMS_EPSILON = 0.0001 - MOMENTUM = 0 - CLIP_DELTA = 1.0 - EPSILON_START = 1.0 - EPSILON_MIN = 1.0 - EPSILON_DECAY = 10000 - UPDATE_FREQUENCY = 1 - REPLAY_MEMORY_SIZE = 1000000 - BATCH_SIZE = 32 - FREEZE_INTERVAL = 1000 - DETERMINISTIC = False - - - - -if __name__ == "__main__": - game = Catcher(width=64, height=64) - logging.basicConfig(level=logging.INFO) - - # --- Parse parameters --- - parameters = process_args(sys.argv[1:], Defaults) - if parameters.deterministic: - rng = np.random.RandomState(123456) - else: - rng = np.random.RandomState() - - # --- Instantiate environment --- - env = PLE_env(rng, game=game, frame_skip=parameters.frame_skip,width=width, height=height, - ple_options={"display_screen": True, "force_fps":True, "fps":20}) - - # --- Instantiate qnetwork --- - qnetwork = MyQNetwork( - env, - parameters.rms_decay, - parameters.rms_epsilon, - parameters.momentum, - parameters.clip_delta, - parameters.freeze_interval, - parameters.batch_size, - parameters.update_rule, - rng) - - test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) - - # --- Instantiate agent --- - agent = NeuralAgent( - env, - qnetwork, - parameters.replay_memory_size, - max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), - parameters.batch_size, - rng, - test_policy=test_policy) - - # --- Create unique filename for FindBestController --- - h = hash(vars(parameters), hash_name="sha1") - fname = "PLE_" + h - print("The parameters hash is: {}".format(h)) - print("The parameters are: 
{}".format(parameters)) - - # --- Bind controllers to the agent --- - # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and - # learning rate as well as the training epoch number. - agent.attach(bc.VerboseController( - evaluate_on='epoch', - periodicity=1)) - - # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. - # Plus, we also want to display after each training episode (!= than after every training) the average bellman - # residual and the average of the V values obtained during the last episode, hence the two last arguments. - agent.attach(bc.TrainerController( - evaluate_on='action', - periodicity=parameters.update_frequency, - show_episode_avg_V_value=True, - show_avg_Bellman_residual=True)) - - # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we - # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. - agent.attach(bc.LearningRateController( - initial_learning_rate=parameters.learning_rate, - learning_rate_decay=parameters.learning_rate_decay, - periodicity=1)) - - # Same for the discount factor. - agent.attach(bc.DiscountFactorController( - initial_discount_factor=parameters.discount, - discount_factor_growth=parameters.discount_inc, - discount_factor_max=parameters.discount_max, - periodicity=1)) - - # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy - # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more - # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every - # episode or epoch (or never, hence the resetEvery='none'). - agent.attach(bc.EpsilonController( - initial_e=parameters.epsilon_start, - e_decays=parameters.epsilon_decay, - e_min=parameters.epsilon_min, - evaluate_on='action', - periodicity=1, - reset_every='none')) - - # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one - # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the - # "true generalization score", or "test score". - # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is - # important that the validationID is the same than the id argument of the InterleavedTestEpochController. - # The FindBestController will dump on disk the validation scores for each and every network, as well as the - # structure of the neural network having the best validation score. These dumps can then used to plot the evolution - # of the validation and test scores (see below) or simply recover the resulting neural network for your - # application. - agent.attach(bc.FindBestController( - validationID=PLE_env.VALIDATION_MODE, - testID=None, - unique_fname=fname)) - - # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a - # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want - # these validation epoch to interfere with the training of the agent, which is well established by the - # TrainerController, EpsilonController and alike. 
Therefore, we will disable these controllers for the whole - # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the - # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards - # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every - # [parameters.period_btw_summary_perfs] *validation* epochs. - agent.attach(bc.InterleavedTestEpochController( - id=PLE_env.VALIDATION_MODE, - epoch_length=parameters.steps_per_test, - controllers_to_disable=[0, 1, 2, 3, 4], - periodicity=2, - show_score=True, - summarize_every=1)) - - # --- Run the experiment --- - try: - os.mkdir("params") - except Exception: - pass - dump(vars(parameters), "params/" + fname + ".jldump") - agent.run(parameters.epochs, parameters.steps_per_epoch) - - # --- Show results --- - basename = "scores/" + fname - scores = joblib.load(basename + "_scores.jldump") - plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') - plt.legend() - plt.xlabel("Number of epochs") - plt.ylabel("Score") - plt.savefig(basename + "_scores.pdf") - plt.show() From ba9f47301ba638054cf1fd1d75e4fe475cd930eb Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Mon, 10 Sep 2018 16:30:30 -0400 Subject: [PATCH 95/96] update doc --- docs/user/environments.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/user/environments.rst b/docs/user/environments.rst index a4567afa..6d9e5171 100644 --- a/docs/user/environments.rst +++ b/docs/user/environments.rst @@ -27,7 +27,6 @@ Examples are better than precepts and the best is to get started with the follow environments/toy_env_time_series.rst environments/gym.rst - environments/two_storages.rst - environments/PLE.rst + environments/two_storages.rst environments/ALE.rst \ No newline at end of file From 1738bb3958cd090698dd8b4c1736aa3a3263af97 Mon Sep 17 00:00:00 2001 From: Vincent Francois Date: Tue, 11 Sep 2018 14:27:54 -0400 Subject: [PATCH 96/96] rename maze_env --- examples/maze/{test_env4.py => maze_env.py} | 7 ------- examples/maze/{run_test4.py => run_maze.py} | 16 ++++++++-------- 2 files changed, 8 insertions(+), 15 deletions(-) rename examples/maze/{test_env4.py => maze_env.py} (96%) rename examples/maze/{run_test4.py => run_maze.py} (97%) diff --git a/examples/maze/test_env4.py b/examples/maze/maze_env.py similarity index 96% rename from examples/maze/test_env4.py rename to examples/maze/maze_env.py index 1c68546e..41fa48ca 100644 --- a/examples/maze/test_env4.py +++ b/examples/maze/maze_env.py @@ -98,10 +98,6 @@ def reset(self, mode): else: self._mode_episode_count += 1 - - print ("reset mode:"+str(mode)+".") - #print "self._pos_agent,self._pos_walls,self._pos_rewards" - #print self._pos_agent,self._pos_walls,self._pos_rewards return [1 * [self._size_maze * [self._size_maze * [0]]]] @@ -186,7 +182,6 @@ def observe(self): reward_obs[5,2:4]=0.7 for i in indices_reward: - #print self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6] self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs for i in indices_agent: @@ -204,8 +199,6 @@ def observe(self): if(self._reverse==True): self._map=-self._map #1-self._map - #print "self._map" - #print self._map return [self._map] def inTerminalState(self): diff --git a/examples/maze/run_test4.py b/examples/maze/run_maze.py similarity index 97% rename from examples/maze/run_test4.py rename to examples/maze/run_maze.py index 77ae448f..7d403334 100644 --- 
a/examples/maze/run_test4.py +++ b/examples/maze/run_maze.py @@ -12,7 +12,7 @@ from deer.default_parser import process_args from deer.agent import NeuralAgent from deer.learning_algos.CRAR_keras import CRAR -from test_env4 import MyEnv as test_env +from maze_env import MyEnv as maze_env import deer.experiment.base_controllers as bc from deer.policies import EpsilonGreedyPolicy @@ -71,7 +71,7 @@ class Defaults: rng = np.random.RandomState() # --- Instantiate environment --- - env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS) + env = maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS) # --- Instantiate learning_algo --- learning_algo = CRAR( @@ -170,7 +170,7 @@ class Defaults: # of the validation and test scores (see below) or simply recover the resulting neural network for your # application. #agent.attach(bc.FindBestController( - # validationID=test_env.VALIDATION_MODE, + # validationID=maze_env.VALIDATION_MODE, # testID=None, # unique_fname=fname)) @@ -259,7 +259,7 @@ class Defaults: # # # # --- Re instantiate environment with reverse=True --- -# env = test_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) +# env = maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True) # # # --- Re instantiate agent --- # agent = NeuralAgent( @@ -332,7 +332,7 @@ class Defaults: # # obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every # # [parameters.period_btw_summary_perfs] *validation* epochs. # agent.attach(bc.InterleavedTestEpochController( -# id=test_env.VALIDATION_MODE, +# id=maze_env.VALIDATION_MODE, # epoch_length=parameters.steps_per_test, # controllers_to_disable=[0, 1, 2, 3, 4], # periodicity=2, @@ -341,7 +341,7 @@ class Defaults: # # ## agent.attach(bc.InterleavedTestEpochController( -## id=test_env.VALIDATION_MODE+1, +## id=maze_env.VALIDATION_MODE+1, ## epoch_length=parameters.steps_per_test, ## controllers_to_disable=[0, 1, 2, 3, 4, 5, 7,8], ## periodicity=2, @@ -349,7 +349,7 @@ class Defaults: ## summarize_every=1)) ## ## agent.attach(bc.InterleavedTestEpochController( -## id=test_env.VALIDATION_MODE+2, +## id=maze_env.VALIDATION_MODE+2, ## epoch_length=parameters.steps_per_test, ## controllers_to_disable=[0, 1, 2, 3, 4, 5, 6,8], ## periodicity=2, @@ -357,7 +357,7 @@ class Defaults: ## summarize_every=1)) ## ## agent.attach(bc.InterleavedTestEpochController( -## id=test_env.VALIDATION_MODE+3, +## id=maze_env.VALIDATION_MODE+3, ## epoch_length=parameters.steps_per_test, ## controllers_to_disable=[0, 1, 2, 3, 4, 5, 6, 7], ## periodicity=2,
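# --- Editor's annotation (illustration only, not part of the committed patch) ---
# After the rename in this last patch, anything that still imports test_env4 or
# run_test4 must switch to the new module names. A minimal, hedged usage sketch,
# assuming it is run from examples/maze/ and that maze_env implements the same
# Environment interface as the other example environments in this series
# (reset/act/nActions/inTerminalState); the random-action loop is illustrative
# only:

import numpy as np
from maze_env import MyEnv as maze_env

rng = np.random.RandomState(123456)
env = maze_env(rng, higher_dim_obs=False)   # run_maze.py passes HIGHER_DIM_OBS here

env.reset(mode=-1)                          # mode -1 is the training regime in these examples
for _ in range(10):                         # take a few random actions
    reward = env.act(rng.randint(env.nActions()))
    if env.inTerminalState():
        break
# --- end of annotation ---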