Merge branch 'epsilon_fix' into test
Svalorzen committed Sep 17, 2018
2 parents 6938851 + 57aac10 commit d116504
Showing 66 changed files with 608 additions and 583 deletions.
2 changes: 1 addition & 1 deletion examples/MDP/tiger_antelope.cpp
@@ -230,7 +230,7 @@ int main() {
auto solution = solver(model);

printCurrentTimeString();
- std::cout << " - Converged: " << (std::get<0>(solution) < solver.getEpsilon()) << "\n";
+ std::cout << " - Converged: " << (std::get<0>(solution) < solver.getTolerance()) << "\n";

AIToolbox::MDP::Policy policy(world.getS(), world.getA(), std::get<1>(solution));

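The "Converged" check above compares the variation returned by the solver against its tolerance. Below is a minimal, self-contained sketch of that idea, purely illustrative (the names are invented and the Bellman backup is replaced by a stand-in update, so this is not the library's implementation):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Run value-iteration-style sweeps until the largest per-state change in the
// value function falls below `tolerance`, or the horizon runs out. The caller
// can then report convergence as `variation < tolerance`.
double solveWithTolerance(std::vector<double> & values, unsigned horizon, double tolerance) {
    double variation = 0.0;
    for (unsigned t = 0; t < horizon; ++t) {
        variation = 0.0;
        for (std::size_t s = 0; s < values.size(); ++s) {
            const double updated = 0.9 * values[s] + 1.0; // stand-in for the Bellman backup
            variation = std::max(variation, std::abs(updated - values[s]));
            values[s] = updated;
        }
        if (variation < tolerance) break; // converged before exhausting the horizon
    }
    return variation;
}

This is why the examples report convergence as the first element of the solution being smaller than the solver's tolerance.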
17 changes: 8 additions & 9 deletions examples/MDP/tiger_antelope.py
@@ -246,7 +246,7 @@ def printState(coord):
print ".",
print ""

- def solve_mdp(horizon, epsilon, discount=0.9):
+ def solve_mdp(horizon, tolerance, discount=0.9):
"""
Construct the gridworld MDP, and solve it using value iteration. Print the
best found policy for sample states.
@@ -291,14 +291,13 @@ def solve_mdp(horizon, epsilon, discount=0.9):
model.setDiscount(discount)

# Perform value iteration
- print time.strftime("%H:%M:%S"), "- Solving MDP using ValueIteration(horizon={}, epsilon={})".format(
-     horizon, epsilon)
+ print time.strftime("%H:%M:%S"), "- Solving MDP using ValueIteration(horizon={}, tolerance={})".format(
+     horizon, tolerance)


- solver = MDP.ValueIteration(horizon, epsilon)
+ solver = MDP.ValueIteration(horizon, tolerance)
solution = solver(model)

- print time.strftime("%H:%M:%S"), "- Converged:", solution[0] < solver.getEpsilon()
+ print time.strftime("%H:%M:%S"), "- Converged:", solution[0] < solver.getTolerance()
_, value_function, q_function = solution

policy = MDP.Policy(len(S), len(A), value_function)
@@ -334,11 +333,11 @@ def solve_mdp(horizon, epsilon, discount=0.9):
help="Size of the square gridworld.")
parser.add_argument('-ho', '--horizon', default=1000000, type=int,
help="Horizon parameter for value iteration")
- parser.add_argument('-e', '--epsilon', default=0.001, type=float,
-     help="Epsilon parameter for value iteration")
+ parser.add_argument('-t', '--tolerance', default=0.001, type=float,
+     help="Tolerance parameter for value iteration")
parser.add_argument('-d', '--discount', default=0.9, type=float,
help="Discount parameter for value iteration")

args = parser.parse_args()
SQUARE_SIZE = args.square_size
- solve_mdp(horizon=args.horizon, epsilon=args.epsilon)
+ solve_mdp(horizon=args.horizon, tolerance=args.tolerance)
2 changes: 1 addition & 1 deletion examples/POMDP/tiger_door.cpp
@@ -194,7 +194,7 @@ int main() {
// we're just going to do one thing only, and we're done. 2 means we get to
// do a single action, observe the result, and act again. And so on.
unsigned horizon = 15;
- // The 0.0 is the epsilon factor, used with high horizons. It gives a way
+ // The 0.0 is the tolerance factor, used with high horizons. It gives a way
// to stop the computation if the policy has converged to something static.
AIToolbox::POMDP::IncrementalPruning solver(horizon, 0.0);

2 changes: 1 addition & 1 deletion examples/POMDP/tiger_door.py
@@ -181,7 +181,7 @@ def makeTigerProblem():
# we're just going to do one thing only, and we're done. 2 means we get to
# do a single action, observe the result, and act again. And so on.
horizon = 15
- # The 0.0 is the epsilon factor, used with high horizons. It gives a way
+ # The 0.0 is the tolerance factor, used with high horizons. It gives a way
# to stop the computation if the policy has converged to something static.
solver = POMDP.IncrementalPruning(horizon, 0.0)

24 changes: 12 additions & 12 deletions include/AIToolbox/EpsilonPolicyInterface.hpp
@@ -14,9 +14,9 @@ namespace AIToolbox {
* automatic exploratory behaviour (e.g. epsilon-greedy policies).
*
* An epsilon-greedy policy is a policy that takes a greedy action a
- * certain percentage of the time, and otherwise takes a random action.
- * They are useful to force the agent to explore an unknown model, in order
- * to gain new information to refine it and thus gain more reward.
+ * certain percentage of the time (1-epsilon), and otherwise takes a random
+ * action. They are useful to force the agent to explore an unknown model,
+ * in order to gain new information to refine it and thus gain more reward.
*
* Please note that to obtain an epsilon-greedy policy the wrapped
* policy needs to already be greedy with respect to the model.
@@ -41,14 +41,14 @@ namespace AIToolbox {
* @param p The policy that is being extended.
* @param epsilon The parameter that controls the amount of exploration.
*/
- EpsilonPolicyInterface(const Base & p, double epsilon = 0.9);
+ EpsilonPolicyInterface(const Base & p, double epsilon = 0.1);

/**
* @brief This function chooses a random action for state s, following the policy distribution and epsilon.
*
- * This function has a probability of (1 - epsilon) of selecting
- * a random action. Otherwise, it selects an action according
- * to the distribution specified by the wrapped policy.
+ * This function has a probability of `epsilon` of selecting a
+ * random action. Otherwise, it selects an action according to the
+ * distribution specified by the wrapped policy.
*
* @param s The sampled state of the policy.
*
@@ -75,8 +75,8 @@ namespace AIToolbox {
* The epsilon parameter determines the amount of exploration this
* policy will enforce when selecting actions. In particular
* actions are going to selected randomly with probability
- * (1-epsilon), and are going to be selected following the
- * underlying policy with probability epsilon.
+ * `epsilon`, and are going to be selected following the underlying
+ * policy with probability `1-epsilon`.
*
* The epsilon parameter must be >= 0.0 and <= 1.0,
* otherwise the function will throw std::invalid_argument.
@@ -124,16 +124,16 @@ namespace AIToolbox {

template <typename State, typename Sampling, typename Action>
Action EpsilonPolicyInterface<State, Sampling, Action>::sampleAction(const Sampling & s) const {
- if ( probabilityDistribution(this->rand_) > epsilon_ )
+ if ( probabilityDistribution(this->rand_) <= epsilon_ )
return sampleRandomAction();

return policy_.sampleAction(s);
}

template <typename State, typename Sampling, typename Action>
double EpsilonPolicyInterface<State, Sampling, Action>::getActionProbability(const Sampling & s, const Action & a) const {
- //  Probability of taking old decision               Other probability
- return epsilon_ * policy_.getActionProbability(s,a) + ( 1.0 - epsilon_ ) * getRandomActionProbability();
+ //  Probability of taking old decision                       Random action probability
+ return (1.0 - epsilon_) * policy_.getActionProbability(s,a) + epsilon_ * getRandomActionProbability();
}

template <typename State, typename Sampling, typename Action>
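To restate the corrected semantics in one place: the wrapper now takes a random action with probability epsilon and follows the wrapped policy with probability 1-epsilon, and the action probability combines the two branches accordingly. A standalone sketch, assuming the wrapped policy is a deterministic greedy one (the names below are invented and are not the library's API):

#include <cstddef>
#include <random>

// Sample epsilon-greedily over A discrete actions: explore with probability
// epsilon, otherwise take the greedy action.
std::size_t sampleEpsilonGreedy(std::size_t greedyAction, std::size_t A, double epsilon, std::mt19937 & rng) {
    std::uniform_real_distribution<double> coin(0.0, 1.0);
    if (coin(rng) <= epsilon) {
        std::uniform_int_distribution<std::size_t> uniformAction(0, A - 1);
        return uniformAction(rng); // explore
    }
    return greedyAction;           // exploit
}

// Probability of picking action a: every action receives epsilon / A from the
// random branch; the greedy action additionally receives (1 - epsilon).
double epsilonGreedyProbability(std::size_t a, std::size_t greedyAction, std::size_t A, double epsilon) {
    return epsilon / A + (a == greedyAction ? 1.0 - epsilon : 0.0);
}

The same expression, epsilon / A + (a == maxA) * (1 - epsilon), is what the off-policy methods later in this commit use for the assumed epsilon-greedy target policy.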
2 changes: 1 addition & 1 deletion include/AIToolbox/Factored/MDP/Policies/EpsilonPolicy.hpp
@@ -24,7 +24,7 @@ namespace AIToolbox::Factored::MDP {
* @param p The policy that is being extended.
* @param epsilon The parameter that controls the amount of exploration.
*/
- EpsilonPolicy(const Base::Base & p, double epsilon = 0.9);
+ EpsilonPolicy(const Base::Base & p, double epsilon = 0.1);

protected:
/**
2 changes: 1 addition & 1 deletion include/AIToolbox/LP.hpp
@@ -172,7 +172,7 @@ namespace AIToolbox {
*
* No guarantees though!
*
* @return The "epsilon" of precision that we hope the solutions, if found, should have.
* @return The precision that we hope the solutions, if found, should have.
*/
static double getPrecision();

26 changes: 13 additions & 13 deletions include/AIToolbox/MDP/Algorithms/Dyna2.hpp
@@ -36,10 +36,10 @@ namespace AIToolbox::MDP {
* @param m The model to be used to update the QFunction.
* @param alpha The learning rate of the internal SARSAL methods.
* @param lambda The lambda parameter for the eligibility traces.
- * @param epsilon The cutoff point for eligibility traces.
+ * @param tolerance The cutoff point for eligibility traces.
* @param n The number of sampling passes to do on the model upon batchUpdateQ().
*/
- explicit Dyna2(const M & m, double alpha = 0.1, double lambda = 0.9, double epsilon = 0.001, unsigned n = 50);
+ explicit Dyna2(const M & m, double alpha = 0.1, double lambda = 0.9, double tolerance = 0.001, unsigned n = 50);

/**
* @brief This function updates the internal QFunction.
@@ -166,16 +166,16 @@ namespace AIToolbox::MDP {
* This sets the parameter for both the transient and permanent
* SARSAL.
*
- * @param e The new trace cutoff value.
+ * @param t The new trace cutoff value.
*/
- void setEpsilon(double e);
+ void setTolerance(double t);

/**
* @brief This function returns the currently set trace cutoff parameter.
*
* @return The currently set trace cutoff parameter.
*/
- double getEpsilon() const;
+ double getTolerance() const;

/**
* @brief This function returns a reference to the internal permanent QFunction.
@@ -207,10 +207,10 @@ namespace AIToolbox::MDP {
};

template <typename M>
- Dyna2<M>::Dyna2(const M & m, const double alpha, const double lambda, const double epsilon, const unsigned n) :
+ Dyna2<M>::Dyna2(const M & m, const double alpha, const double lambda, const double tolerance, const unsigned n) :
N(n), model_(m),
- permanentLearning_(model_, alpha, lambda, epsilon),
- transientLearning_(model_, alpha, lambda, epsilon),
+ permanentLearning_(model_, alpha, lambda, tolerance),
+ transientLearning_(model_, alpha, lambda, tolerance),
internalPolicy_(new RandomPolicy(model_.getS(), model_.getA()))
{
}
@@ -279,14 +279,14 @@ namespace AIToolbox::MDP {
}

template <typename M>
- void Dyna2<M>::setEpsilon(double e) {
-     transientLearning_.setEpsilon(e);
-     permanentLearning_.setEpsilon(e);
+ void Dyna2<M>::setTolerance(const double t) {
+     transientLearning_.setTolerance(t);
+     permanentLearning_.setTolerance(t);
}

template <typename M>
- double Dyna2<M>::getEpsilon() const {
-     return permanentLearning_.getEpsilon();
+ double Dyna2<M>::getTolerance() const {
+     return permanentLearning_.getTolerance();
}

template <typename M>
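In Dyna2 the renamed tolerance is the eligibility-trace cutoff forwarded to the two internal SARSAL instances. A rough, self-contained sketch of what such a cutoff does (illustrative only, not the library's data structures):

#include <cstddef>
#include <unordered_map>

// Decay all eligibility traces by gamma * lambda, dropping entries whose trace
// has fallen below `tolerance` so later updates only touch recently visited
// state-action pairs.
void decayTraces(std::unordered_map<std::size_t, double> & traces,
                 double gamma, double lambda, double tolerance) {
    for (auto it = traces.begin(); it != traces.end(); ) {
        it->second *= gamma * lambda;
        if (it->second < tolerance) it = traces.erase(it);
        else                        ++it;
    }
}

A larger cutoff keeps the trace set smaller and each update cheaper, at the price of discarding long-range credit assignment sooner.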
2 changes: 1 addition & 1 deletion include/AIToolbox/MDP/Algorithms/ImportanceSampling.hpp
@@ -21,7 +21,7 @@ namespace AIToolbox::MDP {
* This function returns the ratio between the assumed epsilon-greedy policy and the behaviour policy.
*/
double getTraceDiscount(const size_t s, const size_t a, const size_t, const double, const size_t maxA) const {
- const auto prob = (1.0 - exploration_) / A + (a == maxA) * exploration_;
+ const auto prob = epsilon_ / A + (a == maxA) * (1.0 - epsilon_);
return prob / behaviour_.getActionProbability(s, a);
}
};
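As a worked illustration of the ratio above (the numbers are invented for the example): with epsilon = 0.1 and A = 4, the assumed epsilon-greedy target policy gives each non-greedy action probability 0.1 / 4 = 0.025 and the greedy action 0.025 + 0.9 = 0.925. If the behaviour policy had chosen the greedy action with probability 0.5, the returned ratio is 0.925 / 0.5 = 1.85; a non-greedy action chosen with probability 0.2 would instead give 0.025 / 0.2 = 0.125.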
18 changes: 9 additions & 9 deletions include/AIToolbox/MDP/Algorithms/PolicyIteration.hpp
@@ -28,9 +28,9 @@ namespace AIToolbox::MDP {
* @brief Basic constructor.
*
* @param horizon The horizon parameter to use during the PolicyEvaluation phase.
- * @param epsilon The epsilon parameter to use during the PolicyEvaluation phase.
+ * @param tolerance The tolerance parameter to use during the PolicyEvaluation phase.
*/
- PolicyIteration(unsigned horizon, double epsilon = 0.001);
+ PolicyIteration(unsigned horizon, double tolerance = 0.001);

/**
* @brief This function applies policy iteration on an MDP to solve it.
@@ -44,21 +44,21 @@
QFunction operator()(const M & m);

/**
- * @brief This function sets the epsilon parameter.
+ * @brief This function sets the tolerance parameter.
*
- * The epsilon parameter must be >= 0 or the function will throw.
+ * The tolerance parameter must be >= 0 or the function will throw.
*/
- void setEpsilon(double e);
+ void setTolerance(double t);

/**
* @brief This function sets the horizon parameter.
*/
void setHorizon(unsigned h);

/**
- * @brief This function returns the currently set epsilon parameter.
+ * @brief This function returns the currently set tolerance parameter.
*/
- double getEpsilon() const;
+ double getTolerance() const;

/**
* @brief This function returns the currently set horizon parameter.
@@ -67,15 +67,15 @@

private:
unsigned horizon_;
- double epsilon_;
+ double tolerance_;
};

template <typename M, typename>
QFunction PolicyIteration::operator()(const M & m) {
const auto S = m.getS();
const auto A = m.getA();

- PolicyEvaluation<M> eval(m, horizon_, epsilon_);
+ PolicyEvaluation<M> eval(m, horizon_, tolerance_);

auto qfun = makeQFunction(m.getS(), m.getA());
QGreedyPolicy p(qfun);
24 changes: 17 additions & 7 deletions include/AIToolbox/MDP/Algorithms/QL.hpp
@@ -10,16 +10,26 @@ namespace AIToolbox::MDP {
* \sa QLEvaluation
*
* This method behaves as an inefficient QLearning if you set the lambda
- * parameter to zero (effectively cutting all traces), and the exploration
+ * parameter to zero (effectively cutting all traces), and the epsilon
* parameter to zero (forcing a perfectly greedy target policy).
*/
class QL : public OffPolicyControl<QL> {
public:
using Parent = OffPolicyControl<QL>;

- QL(const PolicyInterface & behaviour, const double lambda, const double exploration = 0.9,
-    const double discount = 1.0, const double alpha = 0.1, const double epsilon = 0.001) :
-     Parent(behaviour, exploration, discount, alpha, epsilon)
+ /**
+  * @brief Basic constructor.
+  *
+  * @param behaviour Behaviour policy
+  * @param lambda Lambda trace parameter.
+  * @param epsilon The epsilon of the implied target greedy epsilon policy.
+  * @param discount Discount for the problem.
+  * @param alpha Learning rate parameter.
+  * @param tolerance Trace cutoff parameter.
+  */
+ QL(const PolicyInterface & behaviour, const double lambda, const double epsilon = 0.1,
+    const double discount = 1.0, const double alpha = 0.1, const double tolerance = 0.001) :
+     Parent(behaviour, epsilon, discount, alpha, tolerance)
{
setLambda(lambda);
}
@@ -80,11 +90,11 @@ namespace AIToolbox::MDP {
* @param lambda Lambda trace parameter.
* @param discount Discount for the problem.
* @param alpha Learning rate parameter.
- * @param epsilon Trace cutoff parameter.
+ * @param tolerance Trace cutoff parameter.
*/
QLEvaluation(const PolicyInterface & target, const PolicyInterface & behaviour,
- const double lambda, const double discount, const double alpha, const double epsilon) :
-     Parent(target, behaviour, discount, alpha, epsilon)
+ const double lambda, const double discount, const double alpha, const double tolerance) :
+     Parent(target, behaviour, discount, alpha, tolerance)
{
setLambda(lambda);
}
24 changes: 17 additions & 7 deletions include/AIToolbox/MDP/Algorithms/RetraceL.hpp
@@ -13,9 +13,19 @@ namespace AIToolbox::MDP {
public:
using Parent = OffPolicyControl<RetraceL>;

- RetraceL(const PolicyInterface & behaviour, const double lambda, const double exploration = 0.9,
-     const double discount = 1.0, const double alpha = 0.1, const double epsilon = 0.001) :
-     Parent(behaviour, exploration, discount, alpha, epsilon)
+ /**
+  * @brief Basic constructor.
+  *
+  * @param behaviour Behaviour policy
+  * @param lambda Lambda trace parameter.
+  * @param epsilon The epsilon of the implied target greedy epsilon policy.
+  * @param discount Discount for the problem.
+  * @param alpha Learning rate parameter.
+  * @param tolerance Trace cutoff parameter.
+  */
+ RetraceL(const PolicyInterface & behaviour, const double lambda, const double epsilon = 0.1,
+     const double discount = 1.0, const double alpha = 0.1, const double tolerance = 0.001) :
+     Parent(behaviour, epsilon, discount, alpha, tolerance)
{
setLambda(lambda);
}
@@ -24,7 +34,7 @@ namespace AIToolbox::MDP {
* @brief This function returns the trace discount for the learning.
*/
double getTraceDiscount(const size_t s, const size_t a, const size_t, const double, const size_t maxA) const {
- const auto prob = (1.0 - exploration_) / A + (a == maxA) * exploration_;
+ const auto prob = epsilon_ / A + (a == maxA) * (1.0 - epsilon_);
return lambda_ * std::min(1.0, prob / behaviour_.getActionProbability(s, a));
}

@@ -75,11 +85,11 @@ namespace AIToolbox::MDP {
* @param lambda Lambda trace parameter.
* @param discount Discount for the problem.
* @param alpha Learning rate parameter.
- * @param epsilon Trace cutoff parameter.
+ * @param tolerance Trace cutoff parameter.
*/
RetraceLEvaluation(const PolicyInterface & target, const PolicyInterface & behaviour,
- const double lambda, const double discount, const double alpha, const double epsilon) :
-     Parent(target, behaviour, discount, alpha, epsilon)
+ const double lambda, const double discount, const double alpha, const double tolerance) :
+     Parent(target, behaviour, discount, alpha, tolerance)
{
setLambda(lambda);
}