Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

app: instrument not ready reasons #1302

Merged
merged 4 commits into from
Oct 17, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion app/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ import (
"github.com/obolnetwork/charon/eth2util"
)

const (
// readyzReady indicates the readyz is returning 200s and the node is operational.
corverroos marked this conversation as resolved.
Show resolved Hide resolved
readyzReady = 1
// readyzBeaconNodeDown indicates the readyz is returning 500s since the Beacon Node API is down.
readyzBeaconNodeDown = 2
// readyzBeaconNodeSyncing indicates the readyz is returning 500s since the Beacon Node is syncing.
readyzBeaconNodeSyncing = 3
// readyzInsufficientPeers indicates the readyz is returning 500s since this node isn't connected
// to quorum peers via the P2P network.
readyzInsufficientPeers = 4
)

var (
versionGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "app",
Expand Down Expand Up @@ -52,7 +64,11 @@ var (
Namespace: "app",
Subsystem: "monitoring",
Name: "readyz",
Help: "Set to 1 if monitoring api `/readyz` endpoint returned 200 or else 0",
Help: "Set to 1 if the node is operational and monitoring api `/readyz` endpoint is returning 200s. " +
"Else `/readyz` is returning 500s and this metric is either set to " +
"2 if the beacon node is down, or" +
"3 if the beacon node is syncing, or" +
"4 if quorum peers are not connected.",
})

thresholdGauge = promauto.NewGauge(prometheus.GaugeOpts{
Expand Down
22 changes: 11 additions & 11 deletions app/monitoringapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ import (
)

var (
errReadyUninitialised = errors.New("ready check uninitialised")
errReadyTooFewPeers = errors.New("quorum peers not connected")
errReadySyncing = errors.New("beacon node not synced")
errReadyBeaconNodeFailed = errors.New("failed to get beacon sync state")
errReadyUninitialised = errors.New("ready check uninitialised")
errReadyInsufficientPeers = errors.New("quorum peers not connected")
errReadyBeaconNodeSyncing = errors.New("beacon node not synced")
errReadyBeaconNodeDown = errors.New("beacon node down")
)

// wireMonitoringAPI constructs the monitoring API and registers it with the life cycle manager.
Expand Down Expand Up @@ -118,16 +118,16 @@ func startReadyChecker(ctx context.Context, tcpNode host.Host, eth2Cl eth2client

syncing, err := beaconNodeSyncing(ctx, eth2Cl)
if err != nil {
err = errReadyBeaconNodeFailed
readyzGauge.Set(0)
err = errReadyBeaconNodeDown
readyzGauge.Set(readyzBeaconNodeDown)
} else if syncing {
err = errReadySyncing
readyzGauge.Set(0)
err = errReadyBeaconNodeSyncing
readyzGauge.Set(readyzBeaconNodeSyncing)
} else if notConnectedRounds >= minNotConnected {
err = errReadyTooFewPeers
readyzGauge.Set(0)
err = errReadyInsufficientPeers
readyzGauge.Set(readyzInsufficientPeers)
} else {
readyzGauge.Set(1)
readyzGauge.Set(readyzReady)
}

mu.Lock()
Expand Down
4 changes: 2 additions & 2 deletions app/monitoringapi_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ func TestStartChecker(t *testing.T) {
isSyncing: true,
numPeers: 5,
absentPeers: 0,
err: errReadySyncing,
err: errReadyBeaconNodeSyncing,
},
{
name: "too few peers",
isSyncing: false,
numPeers: 5,
absentPeers: 3,
err: errReadyTooFewPeers,
err: errReadyInsufficientPeers,
},
{
name: "success",
Expand Down
5 changes: 5 additions & 0 deletions app/peerinfo/peerinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"bytes"
"context"
"fmt"
"sync"
"testing"
"time"

Expand Down Expand Up @@ -210,6 +211,7 @@ func supported(protocols []string) bool {
// newMetricsSubmitter returns a prometheus metric submitter.
func newMetricsSubmitter() metricSubmitter {
var (
mu sync.Mutex
prevVersions = make(map[string]string)
prevGitHashes = make(map[string]string)
)
Expand Down Expand Up @@ -238,6 +240,9 @@ func newMetricsSubmitter() metricSubmitter {
peerGitHash.WithLabelValues(peerName, gitHash).Set(1)

// Clear previous metrics if changed
mu.Lock()
defer mu.Unlock()

if prev, ok := prevVersions[peerName]; ok && version != prev {
peerVersion.WithLabelValues(peerName, prev).Set(0)
}
Expand Down
69 changes: 55 additions & 14 deletions testutil/compose/static/grafana/dash_simnet.json
Original file line number Diff line number Diff line change
Expand Up @@ -327,14 +327,29 @@
{
"options": {
"0": {
"color": "orange",
"color": "text",
"index": 1,
"text": "NOK"
"text": "Unknown"
},
"1": {
"color": "green",
"index": 0,
"text": "OK"
},
"2": {
"color": "red",
"index": 3,
"text": "BeaconNode Down"
},
"3": {
"color": "orange",
"index": 4,
"text": "BeaconNode Syncing"
},
"4": {
"color": "orange",
"index": 5,
"text": "Insufficient Peers"
}
},
"type": "value"
Expand All @@ -343,6 +358,7 @@
"options": {
"match": "null",
"result": {
"color": "text",
"index": 2,
"text": "Dead"
}
Expand Down Expand Up @@ -397,7 +413,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "app_monitoring_readyz{job=\"$node\"} ",
"expr": "app_monitoring_readyz{job=\"$node\"}",
"hide": false,
"instant": true,
"interval": "",
Expand Down Expand Up @@ -772,7 +788,21 @@
"properties": [
{
"id": "unit",
"value": "s"
"value": "ms"
},
{
"id": "mappings",
"value": [
{
"options": {
"0": {
"index": 0,
"text": "🆗"
}
},
"type": "value"
}
]
}
]
},
Expand Down Expand Up @@ -997,7 +1027,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "max(app_peerinfo_clock_offset_seconds{job=\"$node\"}) by (peer)",
"expr": "max(round(app_peerinfo_clock_offset_seconds{job=\"$node\"},0.1)*1000) by (peer) ",
"format": "table",
"hide": false,
"instant": true,
Expand Down Expand Up @@ -1065,6 +1095,7 @@
"Time 1": true,
"Time 10": true,
"Time 11": true,
"Time 12": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
Expand Down Expand Up @@ -1113,6 +1144,7 @@
"Value #F": "You",
"Value #G": "PrepareAgg",
"Value #H": "Aggregate",
"Value #I": "Exit",
"Value #J": "ClockDiff",
"Value #K": "Direct",
"Value #L": "",
Expand Down Expand Up @@ -1375,7 +1407,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1507,7 +1540,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1599,7 +1633,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1691,7 +1726,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1783,7 +1819,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1876,7 +1913,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1983,7 +2021,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -2078,7 +2117,8 @@
"mode": "absolute",
"steps": [
{
"color": "light-blue"
"color": "light-blue",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -2282,7 +2322,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down