Merge branch 'epsilon_fix' into test
Svalorzen committed Sep 17, 2018
2 parents 6938851 + 57aac10 commit d116504
Showing 66 changed files with 608 additions and 583 deletions.
2 changes: 1 addition & 1 deletion examples/MDP/tiger_antelope.cpp
@@ -230,7 +230,7 @@ int main() {
auto solution = solver(model);

printCurrentTimeString();
- std::cout << " - Converged: " << (std::get<0>(solution) < solver.getEpsilon()) << "\n";
+ std::cout << " - Converged: " << (std::get<0>(solution) < solver.getTolerance()) << "\n";

AIToolbox::MDP::Policy policy(world.getS(), world.getA(), std::get<1>(solution));

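The "Converged" check above compares the variation returned by the solver against its tolerance. Below is a minimal, self-contained sketch of that idea, purely illustrative (the names are invented and the Bellman backup is replaced by a stand-in update, so this is not the library's implementation):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Run value-iteration-style sweeps until the largest per-state change in the
// value function falls below `tolerance`, or the horizon runs out. The caller
// can then report convergence as `variation < tolerance`.
double solveWithTolerance(std::vector<double> & values, unsigned horizon, double tolerance) {
    double variation = 0.0;
    for (unsigned t = 0; t < horizon; ++t) {
        variation = 0.0;
        for (std::size_t s = 0; s < values.size(); ++s) {
            const double updated = 0.9 * values[s] + 1.0; // stand-in for the Bellman backup
            variation = std::max(variation, std::abs(updated - values[s]));
            values[s] = updated;
        }
        if (variation < tolerance) break; // converged before exhausting the horizon
    }
    return variation;
}

This is why the examples report convergence as the first element of the solution being smaller than the solver's tolerance.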
17 changes: 8 additions & 9 deletions examples/MDP/tiger_antelope.py
@@ -246,7 +246,7 @@ def printState(coord):
print ".",
print ""

- def solve_mdp(horizon, epsilon, discount=0.9):
+ def solve_mdp(horizon, tolerance, discount=0.9):
"""
Construct the gridworld MDP, and solve it using value iteration. Print the
best found policy for sample states.
@@ -291,14 +291,13 @@ def solve_mdp(horizon, epsilon, discount=0.9):
model.setDiscount(discount)

# Perform value iteration
- print time.strftime("%H:%M:%S"), "- Solving MDP using ValueIteration(horizon={}, epsilon={})".format(
-     horizon, epsilon)
+ print time.strftime("%H:%M:%S"), "- Solving MDP using ValueIteration(horizon={}, tolerance={})".format(
+     horizon, tolerance)


- solver = MDP.ValueIteration(horizon, epsilon)
+ solver = MDP.ValueIteration(horizon, tolerance)
solution = solver(model)

- print time.strftime("%H:%M:%S"), "- Converged:", solution[0] < solver.getEpsilon()
+ print time.strftime("%H:%M:%S"), "- Converged:", solution[0] < solver.getTolerance()
_, value_function, q_function = solution

policy = MDP.Policy(len(S), len(A), value_function)
@@ -334,11 +333,11 @@ def solve_mdp(horizon, epsilon, discount=0.9):
help="Size of the square gridworld.")
parser.add_argument('-ho', '--horizon', default=1000000, type=int,
help="Horizon parameter for value iteration")
- parser.add_argument('-e', '--epsilon', default=0.001, type=float,
-     help="Epsilon parameter for value iteration")
+ parser.add_argument('-t', '--tolerance', default=0.001, type=float,
+     help="Tolerance parameter for value iteration")
parser.add_argument('-d', '--discount', default=0.9, type=float,
help="Discount parameter for value iteration")

args = parser.parse_args()
SQUARE_SIZE = args.square_size
- solve_mdp(horizon=args.horizon, epsilon=args.epsilon)
+ solve_mdp(horizon=args.horizon, tolerance=args.tolerance)
2 changes: 1 addition & 1 deletion examples/POMDP/tiger_door.cpp
@@ -194,7 +194,7 @@ int main() {
// we're just going to do one thing only, and we're done. 2 means we get to
// do a single action, observe the result, and act again. And so on.
unsigned horizon = 15;
- // The 0.0 is the epsilon factor, used with high horizons. It gives a way
+ // The 0.0 is the tolerance factor, used with high horizons. It gives a way
// to stop the computation if the policy has converged to something static.
AIToolbox::POMDP::IncrementalPruning solver(horizon, 0.0);

2 changes: 1 addition & 1 deletion examples/POMDP/tiger_door.py
@@ -181,7 +181,7 @@ def makeTigerProblem():
# we're just going to do one thing only, and we're done. 2 means we get to
# do a single action, observe the result, and act again. And so on.
horizon = 15
- # The 0.0 is the epsilon factor, used with high horizons. It gives a way
+ # The 0.0 is the tolerance factor, used with high horizons. It gives a way
# to stop the computation if the policy has converged to something static.
solver = POMDP.IncrementalPruning(horizon, 0.0)

24 changes: 12 additions & 12 deletions include/AIToolbox/EpsilonPolicyInterface.hpp
@@ -14,9 +14,9 @@ namespace AIToolbox {
* automatic exploratory behaviour (e.g. epsilon-greedy policies).
*
* An epsilon-greedy policy is a policy that takes a greedy action a
- * certain percentage of the time, and otherwise takes a random action.
- * They are useful to force the agent to explore an unknown model, in order
- * to gain new information to refine it and thus gain more reward.
+ * certain percentage of the time (1-epsilon), and otherwise takes a random
+ * action. They are useful to force the agent to explore an unknown model,
+ * in order to gain new information to refine it and thus gain more reward.
*
* Please note that to obtain an epsilon-greedy policy the wrapped
* policy needs to already be greedy with respect to the model.
@@ -41,14 +41,14 @@ namespace AIToolbox {
* @param p The policy that is being extended.
* @param epsilon The parameter that controls the amount of exploration.
*/
- EpsilonPolicyInterface(const Base & p, double epsilon = 0.9);
+ EpsilonPolicyInterface(const Base & p, double epsilon = 0.1);

/**
* @brief This function chooses a random action for state s, following the policy distribution and epsilon.
*
- * This function has a probability of (1 - epsilon) of selecting
- * a random action. Otherwise, it selects an action according
- * to the distribution specified by the wrapped policy.
+ * This function has a probability of `epsilon` of selecting a
+ * random action. Otherwise, it selects an action according to the
+ * distribution specified by the wrapped policy.
*
* @param s The sampled state of the policy.
*
@@ -75,8 +75,8 @@ namespace AIToolbox {
* The epsilon parameter determines the amount of exploration this
* policy will enforce when selecting actions. In particular
* actions are going to selected randomly with probability
- * (1-epsilon), and are going to be selected following the
- * underlying policy with probability epsilon.
+ * `epsilon`, and are going to be selected following the underlying
+ * policy with probability `1-epsilon`.
*
* The epsilon parameter must be >= 0.0 and <= 1.0,
* otherwise the function will throw std::invalid_argument.
@@ -124,16 +124,16 @@ namespace AIToolbox {

template <typename State, typename Sampling, typename Action>
Action EpsilonPolicyInterface<State, Sampling, Action>::sampleAction(const Sampling & s) const {
- if ( probabilityDistribution(this->rand_) > epsilon_ )
+ if ( probabilityDistribution(this->rand_) <= epsilon_ )
return sampleRandomAction();

return policy_.sampleAction(s);
}

template <typename State, typename Sampling, typename Action>
double EpsilonPolicyInterface<State, Sampling, Action>::getActionProbability(const Sampling & s, const Action & a) const {
- //  Probability of taking old decision               Other probability
- return epsilon_ * policy_.getActionProbability(s,a) + ( 1.0 - epsilon_ ) * getRandomActionProbability();
+ //  Probability of taking old decision                       Random action probability
+ return (1.0 - epsilon_) * policy_.getActionProbability(s,a) + epsilon_ * getRandomActionProbability();
}

template <typename State, typename Sampling, typename Action>
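To restate the corrected semantics in one place: the wrapper now takes a random action with probability epsilon and follows the wrapped policy with probability 1-epsilon, and the action probability combines the two branches accordingly. A standalone sketch, assuming the wrapped policy is a deterministic greedy one (the names below are invented and are not the library's API):

#include <cstddef>
#include <random>

// Sample epsilon-greedily over A discrete actions: explore with probability
// epsilon, otherwise take the greedy action.
std::size_t sampleEpsilonGreedy(std::size_t greedyAction, std::size_t A, double epsilon, std::mt19937 & rng) {
    std::uniform_real_distribution<double> coin(0.0, 1.0);
    if (coin(rng) <= epsilon) {
        std::uniform_int_distribution<std::size_t> uniformAction(0, A - 1);
        return uniformAction(rng); // explore
    }
    return greedyAction;           // exploit
}

// Probability of picking action a: every action receives epsilon / A from the
// random branch; the greedy action additionally receives (1 - epsilon).
double epsilonGreedyProbability(std::size_t a, std::size_t greedyAction, std::size_t A, double epsilon) {
    return epsilon / A + (a == greedyAction ? 1.0 - epsilon : 0.0);
}

The same expression, epsilon / A + (a == maxA) * (1 - epsilon), is what the off-policy methods later in this commit use for the assumed epsilon-greedy target policy.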
2 changes: 1 addition & 1 deletion include/AIToolbox/Factored/MDP/Policies/EpsilonPolicy.hpp
@@ -24,7 +24,7 @@ namespace AIToolbox::Factored::MDP {
* @param p The policy that is being extended.
* @param epsilon The parameter that controls the amount of exploration.
*/
- EpsilonPolicy(const Base::Base & p, double epsilon = 0.9);
+ EpsilonPolicy(const Base::Base & p, double epsilon = 0.1);

protected:
/**
2 changes: 1 addition & 1 deletion include/AIToolbox/LP.hpp
@@ -172,7 +172,7 @@ namespace AIToolbox {
*
* No guarantees though!
*
* @return The "epsilon" of precision that we hope the solutions, if found, should have.
* @return The precision that we hope the solutions, if found, should have.
*/
static double getPrecision();

26 changes: 13 additions & 13 deletions include/AIToolbox/MDP/Algorithms/Dyna2.hpp
@@ -36,10 +36,10 @@ namespace AIToolbox::MDP {
* @param m The model to be used to update the QFunction.
* @param alpha The learning rate of the internal SARSAL methods.
* @param lambda The lambda parameter for the eligibility traces.
- * @param epsilon The cutoff point for eligibility traces.
+ * @param tolerance The cutoff point for eligibility traces.
* @param n The number of sampling passes to do on the model upon batchUpdateQ().
*/
- explicit Dyna2(const M & m, double alpha = 0.1, double lambda = 0.9, double epsilon = 0.001, unsigned n = 50);
+ explicit Dyna2(const M & m, double alpha = 0.1, double lambda = 0.9, double tolerance = 0.001, unsigned n = 50);

/**
* @brief This function updates the internal QFunction.
@@ -166,16 +166,16 @@ namespace AIToolbox::MDP {
* This sets the parameter for both the transient and permanent
* SARSAL.
*
- * @param e The new trace cutoff value.
+ * @param t The new trace cutoff value.
*/
- void setEpsilon(double e);
+ void setTolerance(double t);

/**
* @brief This function returns the currently set trace cutoff parameter.
*
* @return The currently set trace cutoff parameter.
*/
- double getEpsilon() const;
+ double getTolerance() const;

/**
* @brief This function returns a reference to the internal permanent QFunction.
@@ -207,10 +207,10 @@ namespace AIToolbox::MDP {
};

template <typename M>
- Dyna2<M>::Dyna2(const M & m, const double alpha, const double lambda, const double epsilon, const unsigned n) :
+ Dyna2<M>::Dyna2(const M & m, const double alpha, const double lambda, const double tolerance, const unsigned n) :
N(n), model_(m),
- permanentLearning_(model_, alpha, lambda, epsilon),
- transientLearning_(model_, alpha, lambda, epsilon),
+ permanentLearning_(model_, alpha, lambda, tolerance),
+ transientLearning_(model_, alpha, lambda, tolerance),
internalPolicy_(new RandomPolicy(model_.getS(), model_.getA()))
{
}
@@ -279,14 +279,14 @@ namespace AIToolbox::MDP {
}

template <typename M>
- void Dyna2<M>::setEpsilon(double e) {
-     transientLearning_.setEpsilon(e);
-     permanentLearning_.setEpsilon(e);
+ void Dyna2<M>::setTolerance(const double t) {
+     transientLearning_.setTolerance(t);
+     permanentLearning_.setTolerance(t);
}

template <typename M>
- double Dyna2<M>::getEpsilon() const {
-     return permanentLearning_.getEpsilon();
+ double Dyna2<M>::getTolerance() const {
+     return permanentLearning_.getTolerance();
}

template <typename M>
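In Dyna2 the renamed tolerance is the eligibility-trace cutoff forwarded to the two internal SARSAL instances. A rough, self-contained sketch of what such a cutoff does (illustrative only, not the library's data structures):

#include <cstddef>
#include <unordered_map>

// Decay all eligibility traces by gamma * lambda, dropping entries whose trace
// has fallen below `tolerance` so later updates only touch recently visited
// state-action pairs.
void decayTraces(std::unordered_map<std::size_t, double> & traces,
                 double gamma, double lambda, double tolerance) {
    for (auto it = traces.begin(); it != traces.end(); ) {
        it->second *= gamma * lambda;
        if (it->second < tolerance) it = traces.erase(it);
        else                        ++it;
    }
}

A larger cutoff keeps the trace set smaller and each update cheaper, at the price of discarding long-range credit assignment sooner.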
2 changes: 1 addition & 1 deletion include/AIToolbox/MDP/Algorithms/ImportanceSampling.hpp
@@ -21,7 +21,7 @@ namespace AIToolbox::MDP {
* This function returns the ratio between the assumed epsilon-greedy policy and the behaviour policy.
*/
double getTraceDiscount(const size_t s, const size_t a, const size_t, const double, const size_t maxA) const {
- const auto prob = (1.0 - exploration_) / A + (a == maxA) * exploration_;
+ const auto prob = epsilon_ / A + (a == maxA) * (1.0 - epsilon_);
return prob / behaviour_.getActionProbability(s, a);
}
};
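As a worked illustration of the ratio above (the numbers are invented for the example): with epsilon = 0.1 and A = 4, the assumed epsilon-greedy target policy gives each non-greedy action probability 0.1 / 4 = 0.025 and the greedy action 0.025 + 0.9 = 0.925. If the behaviour policy had chosen the greedy action with probability 0.5, the returned ratio is 0.925 / 0.5 = 1.85; a non-greedy action chosen with probability 0.2 would instead give 0.025 / 0.2 = 0.125.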
18 changes: 9 additions & 9 deletions include/AIToolbox/MDP/Algorithms/PolicyIteration.hpp
@@ -28,9 +28,9 @@ namespace AIToolbox::MDP {
* @brief Basic constructor.
*
* @param horizon The horizon parameter to use during the PolicyEvaluation phase.
- * @param epsilon The epsilon parameter to use during the PolicyEvaluation phase.
+ * @param tolerance The tolerance parameter to use during the PolicyEvaluation phase.
*/
- PolicyIteration(unsigned horizon, double epsilon = 0.001);
+ PolicyIteration(unsigned horizon, double tolerance = 0.001);

/**
* @brief This function applies policy iteration on an MDP to solve it.
@@ -44,21 +44,21 @@
QFunction operator()(const M & m);

/**
- * @brief This function sets the epsilon parameter.
+ * @brief This function sets the tolerance parameter.
*
- * The epsilon parameter must be >= 0 or the function will throw.
+ * The tolerance parameter must be >= 0 or the function will throw.
*/
- void setEpsilon(double e);
+ void setTolerance(double t);

/**
* @brief This function sets the horizon parameter.
*/
void setHorizon(unsigned h);

/**
- * @brief This function returns the currently set epsilon parameter.
+ * @brief This function returns the currently set tolerance parameter.
*/
- double getEpsilon() const;
+ double getTolerance() const;

/**
* @brief This function returns the currently set horizon parameter.
@@ -67,15 +67,15 @@

private:
unsigned horizon_;
- double epsilon_;
+ double tolerance_;
};

template <typename M, typename>
QFunction PolicyIteration::operator()(const M & m) {
const auto S = m.getS();
const auto A = m.getA();

- PolicyEvaluation<M> eval(m, horizon_, epsilon_);
+ PolicyEvaluation<M> eval(m, horizon_, tolerance_);

auto qfun = makeQFunction(m.getS(), m.getA());
QGreedyPolicy p(qfun);
24 changes: 17 additions & 7 deletions include/AIToolbox/MDP/Algorithms/QL.hpp
@@ -10,16 +10,26 @@ namespace AIToolbox::MDP {
* \sa QLEvaluation
*
* This method behaves as an inefficient QLearning if you set the lambda
- * parameter to zero (effectively cutting all traces), and the exploration
+ * parameter to zero (effectively cutting all traces), and the epsilon
* parameter to zero (forcing a perfectly greedy target policy).
*/
class QL : public OffPolicyControl<QL> {
public:
using Parent = OffPolicyControl<QL>;

- QL(const PolicyInterface & behaviour, const double lambda, const double exploration = 0.9,
-    const double discount = 1.0, const double alpha = 0.1, const double epsilon = 0.001) :
-     Parent(behaviour, exploration, discount, alpha, epsilon)
+ /**
+  * @brief Basic constructor.
+  *
+  * @param behaviour Behaviour policy
+  * @param lambda Lambda trace parameter.
+  * @param epsilon The epsilon of the implied target greedy epsilon policy.
+  * @param discount Discount for the problem.
+  * @param alpha Learning rate parameter.
+  * @param tolerance Trace cutoff parameter.
+  */
+ QL(const PolicyInterface & behaviour, const double lambda, const double epsilon = 0.1,
+    const double discount = 1.0, const double alpha = 0.1, const double tolerance = 0.001) :
+     Parent(behaviour, epsilon, discount, alpha, tolerance)
{
setLambda(lambda);
}
@@ -80,11 +90,11 @@ namespace AIToolbox::MDP {
* @param lambda Lambda trace parameter.
* @param discount Discount for the problem.
* @param alpha Learning rate parameter.
- * @param epsilon Trace cutoff parameter.
+ * @param tolerance Trace cutoff parameter.
*/
QLEvaluation(const PolicyInterface & target, const PolicyInterface & behaviour,
- const double lambda, const double discount, const double alpha, const double epsilon) :
-     Parent(target, behaviour, discount, alpha, epsilon)
+ const double lambda, const double discount, const double alpha, const double tolerance) :
+     Parent(target, behaviour, discount, alpha, tolerance)
{
setLambda(lambda);
}
24 changes: 17 additions & 7 deletions include/AIToolbox/MDP/Algorithms/RetraceL.hpp
@@ -13,9 +13,19 @@ namespace AIToolbox::MDP {
public:
using Parent = OffPolicyControl<RetraceL>;

- RetraceL(const PolicyInterface & behaviour, const double lambda, const double exploration = 0.9,
-     const double discount = 1.0, const double alpha = 0.1, const double epsilon = 0.001) :
-     Parent(behaviour, exploration, discount, alpha, epsilon)
+ /**
+  * @brief Basic constructor.
+  *
+  * @param behaviour Behaviour policy
+  * @param lambda Lambda trace parameter.
+  * @param epsilon The epsilon of the implied target greedy epsilon policy.
+  * @param discount Discount for the problem.
+  * @param alpha Learning rate parameter.
+  * @param tolerance Trace cutoff parameter.
+  */
+ RetraceL(const PolicyInterface & behaviour, const double lambda, const double epsilon = 0.1,
+     const double discount = 1.0, const double alpha = 0.1, const double tolerance = 0.001) :
+     Parent(behaviour, epsilon, discount, alpha, tolerance)
{
setLambda(lambda);
}
@@ -24,7 +34,7 @@ namespace AIToolbox::MDP {
* @brief This function returns the trace discount for the learning.
*/
double getTraceDiscount(const size_t s, const size_t a, const size_t, const double, const size_t maxA) const {
- const auto prob = (1.0 - exploration_) / A + (a == maxA) * exploration_;
+ const auto prob = epsilon_ / A + (a == maxA) * (1.0 - epsilon_);
return lambda_ * std::min(1.0, prob / behaviour_.getActionProbability(s, a));
}

@@ -75,11 +85,11 @@ namespace AIToolbox::MDP {
* @param lambda Lambda trace parameter.
* @param discount Discount for the problem.
* @param alpha Learning rate parameter.
- * @param epsilon Trace cutoff parameter.
+ * @param tolerance Trace cutoff parameter.
*/
RetraceLEvaluation(const PolicyInterface & target, const PolicyInterface & behaviour,
- const double lambda, const double discount, const double alpha, const double epsilon) :
-     Parent(target, behaviour, discount, alpha, epsilon)
+ const double lambda, const double discount, const double alpha, const double tolerance) :
+     Parent(target, behaviour, discount, alpha, tolerance)
{
setLambda(lambda);
}