diff --git a/host_checker.go b/host_checker.go index 42228504d7e..0b15bbb4601 100644 --- a/host_checker.go +++ b/host_checker.go @@ -112,8 +112,21 @@ func (h *HostUptimeChecker) HostReporter() { case okHost := <-h.okChan: // Clear host from unhealthylist if it exists if h.unHealthyList[okHost.CheckURL] { - h.upCallback(okHost) - delete(h.unHealthyList, okHost.CheckURL) + newVal := 1 + if count, found := h.sampleCache.Get(okHost.CheckURL); found { + newVal = count.(int) - 1 + } + + if newVal <= 0 { + // Reset the count + h.sampleCache.Delete(okHost.CheckURL) + log.Warning("[HOST CHECKER] [HOST UP]: ", okHost.CheckURL) + h.upCallback(okHost) + delete(h.unHealthyList, okHost.CheckURL) + } else { + log.Warning("[HOST CHECKER] [HOST UP BUT NOT REACHED LIMIT]: ", okHost.CheckURL) + h.sampleCache.Set(okHost.CheckURL, newVal, cache.DefaultExpiration) + } } go h.pingCallback(okHost) @@ -123,16 +136,15 @@ func (h *HostUptimeChecker) HostReporter() { newVal = count.(int) + 1 } - h.sampleCache.Set(failedHost.CheckURL, newVal, cache.DefaultExpiration) - if newVal >= h.sampleTriggerLimit { - log.Debug("[HOST CHECKER] [HOST WARNING]: ", failedHost.CheckURL) - // Reset the count - h.sampleCache.Set(failedHost.CheckURL, 1, cache.DefaultExpiration) + log.Warning("[HOST CHECKER] [HOST DOWN]: ", failedHost.CheckURL) // track it h.unHealthyList[failedHost.CheckURL] = true // Call the custom callback hook go h.failureCallback(failedHost) + } else { + log.Warning("[HOST CHECKER] [HOST DOWN BUT NOT REACHED LIMIT]: ", failedHost.CheckURL) + h.sampleCache.Set(failedHost.CheckURL, newVal, cache.DefaultExpiration) } go h.pingCallback(failedHost) @@ -198,7 +210,7 @@ func (h *HostUptimeChecker) CheckHost(toCheck HostData) { } func (h *HostUptimeChecker) Init(workers, triggerLimit, timeout int, hostList map[string]HostData, failureCallback func(HostHealthReport), upCallback func(HostHealthReport), pingCallback func(HostHealthReport)) { - h.sampleCache = cache.New(30*time.Second, 5*time.Second) + h.sampleCache = cache.New(30*time.Second, 30*time.Second) h.stopPollingChan = make(chan bool) h.errorChan = make(chan HostHealthReport) h.okChan = make(chan HostHealthReport) diff --git a/host_checker_manager.go b/host_checker_manager.go index db495231a01..bb46dab068d 100644 --- a/host_checker_manager.go +++ b/host_checker_manager.go @@ -211,7 +211,7 @@ func (hc *HostCheckerManager) OnHostDown(report HostHealthReport) { log.WithFields(logrus.Fields{ "prefix": "host-check-mgr", }).Debug("Update key: ", hc.getHostKey(report)) - hc.store.SetKey(hc.getHostKey(report), "1", int64(hc.checker.checkTimeout+1)) + hc.store.SetKey(hc.getHostKey(report), "1", int64(hc.checker.checkTimeout*hc.checker.sampleTriggerLimit)) spec := getApiSpec(report.MetaData[UnHealthyHostMetaDataAPIKey]) if spec == nil { diff --git a/host_checker_test.go b/host_checker_test.go index 8e2a70e54d8..b720993413b 100644 --- a/host_checker_test.go +++ b/host_checker_test.go @@ -149,7 +149,7 @@ func TestHostChecker(t *testing.T) { } redisStore := GlobalHostChecker.store.(storage.RedisCluster) - if ttl, _ := redisStore.GetKeyTTL(PoolerHostSentinelKeyPrefix + testHttpFailure); int(ttl) != GlobalHostChecker.checker.checkTimeout+1 { + if ttl, _ := redisStore.GetKeyTTL(PoolerHostSentinelKeyPrefix + testHttpFailure); int(ttl) != GlobalHostChecker.checker.checkTimeout*GlobalHostChecker.checker.sampleTriggerLimit { t.Error("HostDown expiration key should be checkTimeout + 1", ttl) } GlobalHostChecker.checkerMu.Unlock()