Skip to content

Commit

Permalink
Fixed some bugs, made Sky Puppy more crash-safe, and improved logging
Browse files Browse the repository at this point in the history
  • Loading branch information
Phara0h committed Dec 11, 2020
1 parent 5e2b149 commit 628dc13
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 83 deletions.
2 changes: 1 addition & 1 deletion src/alerts.js
Expand Up @@ -209,7 +209,7 @@ class Alerts {
await fasquest.request(JSON.parse(JSON.stringify(request)));
} catch (e) {
log.error(
`ERROR: Alerter [${alert.alerter}] of type [${alert.type}] could not be reached. Errored with message ${e.err.message}`
`ERROR: Alerter [${alert.alerter}] of type [${alert.type}] could not be reached. Errored with message ${e.err ? e.err.message : e.message}`
);
}
} else {
Expand Down
185 changes: 103 additions & 82 deletions src/health-check.js
Expand Up @@ -173,7 +173,7 @@ class HealthCheck {
this.services[name] = nService;

this.services[name]._sTimeoutHandler = setTimeout(() => {
this._runCheck(this.services[name]);
this._run(this.services[name]);
}, (nService.config.start_delay || 0) * 1000);
}
}
Expand All @@ -197,106 +197,127 @@ class HealthCheck {
return message || '';
}

async _runCheck(service) {
if (service && service.enabled) {
const startTime = process.hrtime.bigint();
// const oldStatus = service.status.up;
async _runChecker(service, startTime) {
try {
var res = await service.checker.check();

try {
var res = await service.checker.check();
service.status.time =
Number(process.hrtime.bigint() - startTime) / 1000000;
service.status.code = res.code;
service.status.message = this._mapMessages(
res.code,
res.message,
service
);
service.status.up = 1;

if (service.config.expected_status != service.status.code) {
service.status.up = 0;
service.status.count.unhealthy_status++;
log.info(service.name, ' Unhealthy status: ' + service.status.code);
}

if (service.status.time > service.config.expected_response_time) {
service.status.up = 0;
service.status.count.unhealthy_response_time++;
log.info(
service.name,
' Unhealthy response time: ' + service.status.time.toFixed(2) + 'ms'
);
}

if (service.status.up > 0) {
service.status.count.healthy++;
} else {
service.status.count.unhealthy++;
}
} catch (e) {
if (e.message.indexOf('ETIMEDOUT') > -1) {
service.status.time =
Number(process.hrtime.bigint() - startTime) / 1000000;
service.status.code = res.code;

service.status.count.unhealthy++;
service.status.up = 0;
service.status.code = 0;

service.status.message = this._mapMessages(
res.code,
res.message,
service.status.code,
'Timedout',
service
);
service.status.up = 1;

if (service.config.expected_status != service.status.code) {
service.status.up = 0;
service.status.count.unhealthy_status++;
log.info(service.name, ' Unhealthy status: ' + service.status.code);
}
log.info(service.name, ' Unhealthy ETIMEDOUT!');
} else {
service.status.time =
Number(process.hrtime.bigint() - startTime) / 1000000;
service.status.count.down++;
service.status.up = -1;
service.status.code = -1;
service.status.message = this._mapMessages(
service.status.code,
e.message,
service
);
log.info(service.name, ' Down! ', e.message);
}

if (service.status.time > service.config.expected_response_time) {
service.status.up = 0;
service.status.count.unhealthy_response_time++;
log.info(
service.name,
' Unhealthy response time: ' + service.status.time.toFixed(2) + 'ms'
);
}
log.debug(service.name, e.message);
}

if (service.status.up > 0) {
service.status.count.healthy++;
} else {
service.status.count.unhealthy++;
}
} catch (e) {
if (e.message.indexOf('ETIMEDOUT') > -1) {
service.status.up = 0;
service.status.count.unhealthy++;
log.info(service.name, ' Unhealthy ETIMEDOUT!');
} else {
service.status.time =
Number(process.hrtime.bigint() - startTime) / 1000000;
service.status.count.down++;
service.status.up = -1;
service.status.code = 0;
}
if (service.status.last_status == null) {
service.status.last_status = service.status.up;
}

log.debug(service.name, e.message);
if (service.status.up > 0) {
if (!service.status.last_healthy) {
service.status.last_healthy = process.hrtime.bigint();
}

if (service.status.last_status == null) {
service.status.last_status = service.status.up;
if (service.status.last_status < 1 && service.status.last_healthy) {
service.status.last_unhealthy_total_duration = (
Number(process.hrtime.bigint() - service.status.last_unhealthy) /
1000000000
).toFixed(3);
log.info(
service.name,
`healthy again after ${service.status.last_unhealthy_total_duration} second of down time!`
);
service.status.last_healthy = process.hrtime.bigint();
}
} else if (!service.status.last_unhealthy || (service.status.last_status > 0 && service.status.last_unhealthy)) {
service.status.last_unhealthy = process.hrtime.bigint();
}
}
async _run(service) {
if (service && service.enabled) {
const startTime = process.hrtime.bigint();

if (service.status.up > 0) {
if (!service.status.last_healthy) {
service.status.last_healthy = process.hrtime.bigint();
}
if (service.status.last_status < 1 && service.status.last_healthy) {
service.status.last_unhealthy_total_duration = (
Number(process.hrtime.bigint() - service.status.last_unhealthy) /
1000000000
).toFixed(3);
log.info(
service.name,
`healthy again after ${service.status.last_unhealthy_total_duration} second of down time!`
);
service.status.last_healthy = process.hrtime.bigint();
}
} else {
if (!service.status.last_unhealthy) {
service.status.last_unhealthy = process.hrtime.bigint();
}
if (service.status.last_status > 0 && service.status.last_unhealthy) {
service.status.last_unhealthy = process.hrtime.bigint();
}
try {
await this._runChecker(service, startTime);
this.stats.updateService(service.name, service.status);
await this.alerts.alert(service);
} catch (e) {
log.error(e.message);
}

this.stats.updateService(service.name, service.status);
try {
service.status.last_status = service.status.up;
const tout =
service.config.interval -
Number(process.hrtime.bigint() - startTime) / 1000000;

await this.alerts.alert(service);
service.status.last_status = service.status.up;
const tout =
service.config.interval -
Number(process.hrtime.bigint() - startTime) / 1000000;
if (tout <= 0) {
log.debug(service.name + ' tout: ' + (tout > 0 ? tout : 0));
}

if (tout <= 0) {
log.debug(service.name + ' tout: ' + (tout > 0 ? tout : 0));
this.services[service.name]._sTimeoutHandler = setTimeout(
async () => {
this._run(service);
},
tout > 0 ? tout : 0
);
} catch (e) {
log.fatal('Could not run service: ' + (service ? service.name : 'Unknown' + ' e:' + e.message));
}

this.services[service.name]._sTimeoutHandler = setTimeout(
async () => {
this._runCheck(service);
},
tout > 0 ? tout : 0
);
}
}
}
Expand Down

0 comments on commit 628dc13

Please sign in to comment.