Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rec: Purge map of failed auths periodically by keeping a last changed timestamp. #8525

Merged
merged 6 commits into from Nov 19, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion pdns/pdns_recursor.cc
Expand Up @@ -2844,7 +2844,9 @@ static void doStats(void)

g_log<<Logger::Notice<<"stats: throttle map: "
<< broadcastAccFunction<uint64_t>(pleaseGetThrottleSize) <<", ns speeds: "
<< broadcastAccFunction<uint64_t>(pleaseGetNsSpeedsSize)<<endl;
<< broadcastAccFunction<uint64_t>(pleaseGetNsSpeedsSize)<<", failed ns: "
<< broadcastAccFunction<uint64_t>(pleaseGetFailedServersSize)<<", ednsmap: "
<<broadcastAccFunction<uint64_t>(pleaseGetEDNSStatusesSize)<<endl;
g_log<<Logger::Notice<<"stats: outpacket/query ratio "<<(int)(SyncRes::s_outqueries*100.0/SyncRes::s_queries)<<"%";
g_log<<Logger::Notice<<", "<<(int)(SyncRes::s_throttledqueries*100.0/(SyncRes::s_outqueries+SyncRes::s_throttledqueries))<<"% throttled, "
<<SyncRes::s_nodelegated<<" no-delegation drops"<<endl;
Expand Down Expand Up @@ -2908,6 +2910,10 @@ static void houseKeeping(void *)
if(!((cleanCounter++)%40)) { // this is a full scan!
time_t limit=now.tv_sec-300;
SyncRes::pruneNSSpeeds(limit);
limit = now.tv_sec - SyncRes::s_serverdownthrottletime * 10;
SyncRes::pruneFailedServers(limit);
limit = now.tv_sec - 2*3600;
SyncRes::pruneEDNSStatuses(limit);
omoerbeek marked this conversation as resolved.
Show resolved Hide resolved
}
last_prune=time(0);
}
Expand Down
43 changes: 42 additions & 1 deletion pdns/rec_channel_rec.cc
Expand Up @@ -231,6 +231,11 @@ static uint64_t* pleaseDumpThrottleMap(int fd)
return new uint64_t(SyncRes::doDumpThrottleMap(fd));
}

static uint64_t* pleaseDumpFailedServers(int fd)
{
return new uint64_t(SyncRes::doDumpFailedServers(fd));
}

template<typename T>
static string doDumpNSSpeeds(T begin, T end)
{
Expand Down Expand Up @@ -367,6 +372,28 @@ static string doDumpThrottleMap(T begin, T end)
return "dumped "+std::to_string(total)+" records\n";
}

template<typename T>
static string doDumpFailedServers(T begin, T end)
{
T i=begin;
string fname;

if(i!=end)
fname=*i;

int fd=open(fname.c_str(), O_CREAT | O_EXCL | O_WRONLY, 0660);
if(fd < 0)
return "Error opening dump file for writing: "+stringerror()+"\n";
uint64_t total = 0;
try {
total = broadcastAccFunction<uint64_t>(boost::bind(pleaseDumpFailedServers, fd));
}
catch(...){}

close(fd);
return "dumped "+std::to_string(total)+" records\n";
}

uint64_t* pleaseWipeCache(const DNSName& canon, bool subtree)
{
return new uint64_t(t_RC->doWipeCache(canon, subtree));
Expand Down Expand Up @@ -885,6 +912,16 @@ static uint64_t getNsSpeedsSize()
return broadcastAccFunction<uint64_t>(pleaseGetNsSpeedsSize);
}

uint64_t* pleaseGetFailedServersSize()
{
return new uint64_t(SyncRes::getFailedServersSize());
}

uint64_t* pleaseGetEDNSStatusesSize()
{
return new uint64_t(SyncRes::getEDNSStatusesSize());
}

uint64_t* pleaseGetConcurrentQueries()
{
return new uint64_t(getMT() ? getMT()->numProcesses() : 0);
Expand Down Expand Up @@ -1612,7 +1649,8 @@ string RecursorControlParser::getAnswer(const string& question, RecursorControlP
"dump-edns [status] <filename> dump EDNS status to the named file\n"
"dump-nsspeeds <filename> dump nsspeeds statistics to the named file\n"
"dump-rpz <zone name> <filename> dump the content of a RPZ zone to the named file\n"
"dump-throttlemap <filename> dump the contents of the throttle to the named file\n"
"dump-throttlemap <filename> dump the contents of the throttle map to the named file\n"
"dump-failedservers <filename> dump the failed servers to the named file\n"
"get [key1] [key2] .. get specific statistics\n"
"get-all get all statistics\n"
"get-dont-throttle-names get the list of names that are not allowed to be throttled\n"
Expand Down Expand Up @@ -1684,6 +1722,9 @@ string RecursorControlParser::getAnswer(const string& question, RecursorControlP
if(cmd=="dump-nsspeeds")
return doDumpNSSpeeds(begin, end);

if(cmd=="dump-failedservers")
return doDumpFailedServers(begin, end);

if(cmd=="dump-rpz") {
return doDumpRPZ(begin, end);
}
Expand Down
16 changes: 15 additions & 1 deletion pdns/recursordist/docs/manpages/rec_control.1.rst
Expand Up @@ -146,7 +146,21 @@ dump-throttlemap *FILENAME*
:program:`pdns_recursor` often runs in a chroot. You can
retrieve the file using::

rec_control dump-rpz ZONE_NAME /tmp/file
rec_control dump-throttlemap /tmp/file
mv /proc/$(pidof pdns_recursor)/root/tmp/file /tmp/filename

dump-failedservers *FILENAME*
Dump the contents of the failed server map to the *FILENAME* mentioned.
This file should not exist already, PowerDNS will refuse to
overwrite it otherwise. While dumping, the recursor will not answer
questions.

.. note::

:program:`pdns_recursor` often runs in a chroot. You can
retrieve the file using::

rec_control dump-failedservers /tmp/file
mv /proc/$(pidof pdns_recursor)/root/tmp/file /tmp/filename

get *STATISTIC* [*STATISTIC*]...
Expand Down
53 changes: 38 additions & 15 deletions pdns/syncres.cc
Expand Up @@ -472,6 +472,27 @@ uint64_t SyncRes::doDumpThrottleMap(int fd)
return count;
}

uint64_t SyncRes::doDumpFailedServers(int fd)
{
auto fp = std::unique_ptr<FILE, int(*)(FILE*)>(fdopen(dup(fd), "w"), fclose);
if(!fp)
return 0;
fprintf(fp.get(), "; failed servers dump follows\n");
fprintf(fp.get(), "; remote IP\tcount\ttimestamp\n");
uint64_t count=0;

for(const auto& i : t_sstorage.fails.getMap())
{
count++;
char tmp[26];
ctime_r(&i.second.last, tmp);
fprintf(fp.get(), "%s\t%lld\t%s", i.first.toString().c_str(),
static_cast<long long>(i.second.value), tmp);
}

return count;
}

/* so here is the story. First we complete the full resolution process for a domain name. And only THEN do we decide
to also do DNSSEC validation, which leads to new queries. To make this simple, we *always* ask for DNSSEC records
so that if there are RRSIGs for a name, we'll have them.
Expand Down Expand Up @@ -515,16 +536,15 @@ int SyncRes::asyncresolveWrapper(const ComboAddress& ip, bool ednsMANDATORY, con
If '3', send bare queries
*/

SyncRes::EDNSStatus* ednsstatus;
ednsstatus = &t_sstorage.ednsstatus[ip]; // does this include port? YES
SyncRes::EDNSStatus* ednsstatus = &t_sstorage.ednsstatus[ip]; // does this include port? YES

if(ednsstatus->modeSetAt && ednsstatus->modeSetAt + 3600 < d_now.tv_sec) {
*ednsstatus=SyncRes::EDNSStatus();
if (ednsstatus->modeSetAt && ednsstatus->modeSetAt + 3600 < d_now.tv_sec) {
*ednsstatus = SyncRes::EDNSStatus();
// cerr<<"Resetting EDNS Status for "<<ip.toString()<<endl);
}

SyncRes::EDNSStatus::EDNSMode& mode=ednsstatus->mode;
SyncRes::EDNSStatus::EDNSMode oldmode = mode;
SyncRes::EDNSStatus::EDNSMode *mode = &ednsstatus->mode;
SyncRes::EDNSStatus::EDNSMode oldmode = *mode;
int EDNSLevel = 0;
auto luaconfsLocal = g_luaconfs.getLocal();
ResolveContext ctx;
Expand All @@ -539,11 +559,11 @@ int SyncRes::asyncresolveWrapper(const ComboAddress& ip, bool ednsMANDATORY, con
for(int tries = 0; tries < 3; ++tries) {
// cerr<<"Remote '"<<ip.toString()<<"' currently in mode "<<mode<<endl;

if(mode==EDNSStatus::NOEDNS) {
if (*mode == EDNSStatus::NOEDNS) {
g_stats.noEdnsOutQueries++;
EDNSLevel = 0; // level != mode
}
else if(ednsMANDATORY || mode==EDNSStatus::UNKNOWN || mode==EDNSStatus::EDNSOK || mode==EDNSStatus::EDNSIGNORANT)
else if (ednsMANDATORY || *mode == EDNSStatus::UNKNOWN || *mode == EDNSStatus::EDNSOK || *mode == EDNSStatus::EDNSIGNORANT)
EDNSLevel = 1;

DNSName sendQname(domain);
Expand All @@ -556,32 +576,35 @@ int SyncRes::asyncresolveWrapper(const ComboAddress& ip, bool ednsMANDATORY, con
else {
ret=asyncresolve(ip, sendQname, type, doTCP, sendRDQuery, EDNSLevel, now, srcmask, ctx, d_outgoingProtobufServers, d_frameStreamServers, luaconfsLocal->outgoingProtobufExportConfig.exportTypes, res, chained);
}
// ednsstatus might be cleared, so do a new lookup
ednsstatus = &t_sstorage.ednsstatus[ip]; // does this include port? YES
mode = &ednsstatus->mode;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should update oldmode as well here? The only difference it would make is that if the status has already been updated in the meantime, we will set modeSetAt again. It's a matter of seconds, though, so it probably doesn't matter.

if(ret < 0) {
return ret; // transport error, nothing to learn here
}

if(ret == 0) { // timeout, not doing anything with it now
return ret;
}
else if(mode==EDNSStatus::UNKNOWN || mode==EDNSStatus::EDNSOK || mode == EDNSStatus::EDNSIGNORANT ) {
else if (*mode == EDNSStatus::UNKNOWN || *mode == EDNSStatus::EDNSOK || *mode == EDNSStatus::EDNSIGNORANT ) {
if(res->d_validpacket && !res->d_haveEDNS && res->d_rcode == RCode::FormErr) {
// cerr<<"Downgrading to NOEDNS because of "<<RCode::to_s(res->d_rcode)<<" for query to "<<ip.toString()<<" for '"<<domain<<"'"<<endl;
mode = EDNSStatus::NOEDNS;
*mode = EDNSStatus::NOEDNS;
continue;
}
else if(!res->d_haveEDNS) {
if(mode != EDNSStatus::EDNSIGNORANT) {
mode = EDNSStatus::EDNSIGNORANT;
if (*mode != EDNSStatus::EDNSIGNORANT) {
*mode = EDNSStatus::EDNSIGNORANT;
// cerr<<"We find that "<<ip.toString()<<" is an EDNS-ignorer for '"<<domain<<"', moving to mode 2"<<endl;
}
}
else {
mode = EDNSStatus::EDNSOK;
*mode = EDNSStatus::EDNSOK;
// cerr<<"We find that "<<ip.toString()<<" is EDNS OK!"<<endl;
}

}
if(oldmode != mode || !ednsstatus->modeSetAt)
if (oldmode != *mode || !ednsstatus->modeSetAt)
ednsstatus->modeSetAt=d_now.tv_sec;
// cerr<<"Result: ret="<<ret<<", EDNS-level: "<<EDNSLevel<<", haveEDNS: "<<res->d_haveEDNS<<", new mode: "<<mode<<endl;
return ret;
Expand Down Expand Up @@ -3150,7 +3173,7 @@ bool SyncRes::doResolveAtThisIP(const std::string& prefix, const DNSName& qname,
t_sstorage.nsSpeeds[nsName.empty()? DNSName(remoteIP.toStringWithPort()) : nsName].submit(remoteIP, 1000000, &d_now); // 1 sec

// code below makes sure we don't filter COM or the root
if (s_serverdownmaxfails > 0 && (auth != g_rootdnsname) && t_sstorage.fails.incr(remoteIP) >= s_serverdownmaxfails) {
if (s_serverdownmaxfails > 0 && (auth != g_rootdnsname) && t_sstorage.fails.incr(remoteIP, d_now) >= s_serverdownmaxfails) {
LOG(prefix<<qname<<": Max fails reached resolving on "<< remoteIP.toString() <<". Going full throttle for "<< s_serverdownthrottletime <<" seconds" <<endl);
// mark server as down
t_sstorage.throttle.throttle(d_now.tv_sec, boost::make_tuple(remoteIP, "", 0), s_serverdownthrottletime, 10000);
Expand Down
88 changes: 59 additions & 29 deletions pdns/syncres.hh
Expand Up @@ -214,60 +214,74 @@ private:
template<class Thing> class Counters : public boost::noncopyable
{
public:
Counters()
{
typedef unsigned long counter_t;
struct value_t {
counter_t value;
time_t last;
};
typedef std::map<Thing, value_t> cont_t;

cont_t getMap() const {
return d_cont;
}
unsigned long value(const Thing& t) const
counter_t value(const Thing& t) const
{
typename cont_t::const_iterator i=d_cont.find(t);
typename cont_t::const_iterator i = d_cont.find(t);

if(i==d_cont.end()) {
if (i == d_cont.end()) {
return 0;
}
return (unsigned long)i->second;
return i->second.value;
}
unsigned long incr(const Thing& t)

counter_t incr(const Thing& t, const struct timeval & now)
{
typename cont_t::iterator i=d_cont.find(t);
typename cont_t::iterator i = d_cont.find(t);

if(i==d_cont.end()) {
d_cont[t]=1;
if (i == d_cont.end()) {
d_cont[t].value = 1;
d_cont[t].last = now.tv_sec;
omoerbeek marked this conversation as resolved.
Show resolved Hide resolved
return 1;
}
else {
if (i->second < std::numeric_limits<unsigned long>::max())
i->second++;
return (unsigned long)i->second;
}
if (i->second.value < std::numeric_limits<counter_t>::max()) {
i->second.value++;
}
i->second.last = now.tv_sec;
return i->second.value;
}
}
unsigned long decr(const Thing& t)
{
typename cont_t::iterator i=d_cont.find(t);

if(i!=d_cont.end() && --i->second == 0) {
d_cont.erase(i);
return 0;
} else
return (unsigned long)i->second;
}
void clear(const Thing& t)
{
typename cont_t::iterator i=d_cont.find(t);
typename cont_t::iterator i = d_cont.find(t);

if(i!=d_cont.end()) {
if (i != d_cont.end()) {
d_cont.erase(i);
}
}

void clear()
{
d_cont.clear();
}

size_t size() const
{
return d_cont.size();
}

void prune(time_t cutoff) {
for (auto it = d_cont.begin(); it != d_cont.end(); ) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any idea of how many entries we usually have in there? If it might be a lot perhaps we need to use a multi_index keyed on the last field to avoid having to scan the whole map.

if (it->second.last <= cutoff) {
it = d_cont.erase(it);
} else {
++it;
}
}
}

private:
typedef map<Thing,unsigned long> cont_t;
cont_t d_cont;
};

Expand All @@ -280,9 +294,8 @@ public:

struct EDNSStatus
{
EDNSStatus() : mode(UNKNOWN), modeSetAt(0) {}
enum EDNSMode { UNKNOWN=0, EDNSOK=1, EDNSIGNORANT=2, NOEDNS=3 } mode;
time_t modeSetAt;
time_t modeSetAt{0};
enum EDNSMode { UNKNOWN=0, EDNSOK=1, EDNSIGNORANT=2, NOEDNS=3 } mode{UNKNOWN};
};

enum class HardenNXD { No, DNSSEC, Yes };
Expand Down Expand Up @@ -406,6 +419,7 @@ public:
static uint64_t doEDNSDump(int fd);
static uint64_t doDumpNSSpeeds(int fd);
static uint64_t doDumpThrottleMap(int fd);
static uint64_t doDumpFailedServers(int fd);
static int getRootNS(struct timeval now, asyncresolve_t asyncCallback);
static void clearDelegationOnly()
{
Expand Down Expand Up @@ -498,6 +512,16 @@ public:
{
t_sstorage.ednsstatus.clear();
}
static void pruneEDNSStatuses(time_t cutoff)
{
for (auto it = t_sstorage.ednsstatus.begin(); it != t_sstorage.ednsstatus.end(); ) {
if (it->second.modeSetAt <= cutoff) {
it = t_sstorage.ednsstatus.erase(it);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question about the number of entries?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On one of our small recursors (sub 1kqps) that has only been up for a week we have 200,000 edns status entries - if that data point helps.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With cleanup, I suppose it will follow the same trend as the nsspeed size, which in that instance is an order of magnitude smaller.

} else {
++it;
}
}
}
static uint64_t getThrottledServersSize()
{
return t_sstorage.throttle.size();
Expand Down Expand Up @@ -526,6 +550,10 @@ public:
{
t_sstorage.fails.clear();
}
static void pruneFailedServers(time_t cutoff)
{
t_sstorage.fails.prune(cutoff);
}
static unsigned long getServerFailsCount(const ComboAddress& server)
{
return t_sstorage.fails.value(server);
Expand Down Expand Up @@ -1063,6 +1091,8 @@ template<class T> T broadcastAccFunction(const boost::function<T*()>& func);

std::shared_ptr<SyncRes::domainmap_t> parseAuthAndForwards();
uint64_t* pleaseGetNsSpeedsSize();
uint64_t* pleaseGetFailedServersSize();
uint64_t* pleaseGetEDNSStatusesSize();
uint64_t* pleaseGetCacheSize();
uint64_t* pleaseGetNegCacheSize();
uint64_t* pleaseGetCacheHits();
Expand Down