This repository has been archived by the owner on Jul 26, 2022. It is now read-only.
/
sampler.h
136 lines (120 loc) · 4.47 KB
/
sampler.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
* Copyright (c) 2017-present, Facebook, Inc.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once

#include <stdexcept>
#include <string>

#include <autogradpp/autograd.h>

#include "trainer.h"
namespace cpid {
/**
* A sampler takes the output of the model, and outputs an action accordingly.
* The exact shape of the action is dependent on the rest of the training loop.
* For convenience, the base sampling function is the identity.
*/
class BaseSampler {
 public:
  BaseSampler() = default;
  virtual ~BaseSampler() = default;

  /// Default sampling is the identity: the model output is returned
  /// unchanged as the action. Subclasses override this to implement an
  /// actual sampling strategy.
  virtual ag::Variant sample(ag::Variant in) {
    return in;
  }

  /// Returns the probability of `action` under the distribution described by
  /// `in`. The base class does not support this and always throws
  /// std::runtime_error; subclasses that define a distribution override it.
  virtual ag::Variant computeProba(
      const ag::Variant& in,
      const ag::Variant& action) {
    throw std::runtime_error("Proba computation not implemented...");
  }
};
/**
* This sampler expects as input an unordered_map<string, Variant>, which
* contains an entry policyKey, which is a tensor of size [b, n]. It outputs the
* same map, with a new key actionKey, a tensor of size [b] where each entry is
* in [0,n-1], and is the result of multinomial sampling over pi. It also adds a
* key pActionKey which corresponds to the probability of the sampled action.
*/
class MultinomialSampler : public BaseSampler {
 public:
  /// The keys name the entries of the input/output map (see class comment):
  /// `policyKey` is read as the policy tensor [b, n]; `actionKey` and
  /// `pActionKey` are written with the sampled action and its probability.
  MultinomialSampler(
      const std::string& policyKey = kPiKey,
      const std::string& actionKey = kActionKey,
      const std::string& pActionKey = kPActionKey);
  /// Multinomial sampling over the policy (per the class comment);
  /// implementation is not in this header.
  ag::Variant sample(ag::Variant in) override;
  /// Probability of `action` under the input policy.
  ag::Variant computeProba(const ag::Variant& in, const ag::Variant& action)
      override;

 protected:
  // Map keys fixed at construction and used by sample()/computeProba().
  std::string policyKey_, actionKey_, pActionKey_;
};
/**
 * This sampler expects as input an unordered_map<string, Variant>, containing
 * an entry policyKey, which is a tensor of size [b, n]. It outputs the same
 * map, with a new key actionKey, a tensor of size [b] where each entry is in
 * [0,n-1], and corresponds to the action with the highest score.
 */
class DiscreteMaxSampler : public BaseSampler {
 public:
  /// `policyKey` names the input map entry holding the scores [b, n];
  /// `actionKey` names the output entry receiving the chosen action.
  DiscreteMaxSampler(
      const std::string& policyKey = kPiKey,
      const std::string& actionKey = kActionKey);
  /// Picks the highest-scoring action (see class comment); implementation is
  /// not in this header.
  ag::Variant sample(ag::Variant in) override;

 protected:
  // Map keys fixed at construction and used by sample().
  std::string policyKey_, actionKey_;
};
/**
* This sampler expects as input an unordered_map<string, Variant>, containing
* an entry policyKey, which is a tensor of size [b, n]. It outputs the same
* map, with a new key kActionKey, a tensor of size [b] where each entry
* action[i] is sampled from a normal distribution centered in policy[i]. It
* also expects the stdKey to be set, it will be used as the standard deviation
* of the normal. It can be either a float/double, in which case the deviation
* will be the same for the batch, or it can be the same shape as the policy,
* for a finer control. It also adds a key pActionKey which corresponds to the
* probability of the sampled action.
*/
class ContinuousGaussianSampler : public BaseSampler {
 public:
  /// `policyKey` is read as the means of the Gaussians; `stdKey` as the
  /// standard deviation (scalar or same shape as the policy, per the class
  /// comment); `actionKey` and `pActionKey` are written with the sampled
  /// action and its probability.
  ContinuousGaussianSampler(
      const std::string& policyKey = kPiKey,
      const std::string& stdKey = kSigmaKey,
      const std::string& actionKey = kActionKey,
      const std::string& pActionKey = kPActionKey);
  /// Gaussian sampling centered on the policy (see class comment);
  /// implementation is not in this header.
  ag::Variant sample(ag::Variant in) override;
  /// Probability of `action` under the Gaussian defined by the input.
  ag::Variant computeProba(const ag::Variant& in, const ag::Variant& action)
      override;

 protected:
  // Map keys fixed at construction and used by sample()/computeProba().
  std::string policyKey_, stdKey_;
  std::string actionKey_, pActionKey_;
};
/**
* This sampler expects as input an unordered_map<string, Variant> containing an
* entry policyKey, which is a tensor of size [b, n]. It outputs the same map,
* with a new key kActionKey, a clone of the policy.
*/
class ContinuousDeterministicSampler : public BaseSampler {
 public:
  /// `policyKey` names the input map entry holding the policy [b, n];
  /// `actionKey` names the output entry receiving a clone of the policy.
  ContinuousDeterministicSampler(
      const std::string& policyKey = kPiKey,
      const std::string& actionKey = kActionKey);
  /// Deterministic "sampling": clones the policy into the action key (see
  /// class comment); implementation is not in this header.
  ag::Variant sample(ag::Variant in) override;

 protected:
  // Map keys fixed at construction and used by sample().
  std::string policyKey_;
  std::string actionKey_;
};
/**
* This sampler expects as input an unordered_map<string, Variant> containing an
* entry QKey, which is a tensor of size [b, n]. It outputs the same map, with a
* new key actionKey, which contains the best action with proba 1-eps, and a
* random action with proba eps.
*/
class EpsGreedySampler : public BaseSampler {
 public:
  /// `eps` is the exploration probability: with probability eps a random
  /// action is chosen instead of the best one (see class comment). `QKey`
  /// names the input entry holding the Q-values [b, n]; `actionKey` names
  /// the output entry receiving the chosen action.
  EpsGreedySampler(
      double eps = 0.07,
      const std::string& QKey = kQKey,
      const std::string& actionKey = kActionKey);
  /// Epsilon-greedy action selection; implementation is not in this header.
  ag::Variant sample(ag::Variant in) override;

  // NOTE(review): these members are public, unlike the protected members of
  // the other samplers in this file — confirm no external code relies on
  // direct access before tightening visibility.
  double eps_;
  std::string QKey_, actionKey_;
};
} // namespace cpid