Skip to content

Commit

Permalink
core: remove high cardinality pubkey metrics (#940)
Browse files Browse the repository at this point in the history
Remove the pubkey label from `bcast` and `scheduler` metrics, since it becomes a high-cardinality label when loading 1000-validator clusters or when aggregating metrics across clusters.

category: refactor
ticket: none
  • Loading branch information
corverroos committed Aug 10, 2022
1 parent cbaa50c commit 36be0ad
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 18 deletions.
2 changes: 1 addition & 1 deletion app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ func wireP2P(ctx context.Context, life *lifecycle.Manager, conf Config,
for _, relay := range relays {
life.RegisterStart(lifecycle.AsyncAppCtx, lifecycle.StartRelay, p2p.NewRelayReserver(tcpNode, relay))
}
life.RegisterStart(lifecycle.AsyncAppCtx, lifecycle.StartP2PEventCollector, p2p.NewEventCollecter(tcpNode))
life.RegisterStart(lifecycle.AsyncAppCtx, lifecycle.StartP2PEventCollector, p2p.NewEventCollector(tcpNode))

return tcpNode, localEnode, nil
}
Expand Down
2 changes: 1 addition & 1 deletion core/bcast/bcast.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (b Broadcaster) Broadcast(ctx context.Context, duty core.Duty,
ctx = log.WithCtx(ctx, z.Any("pubkey", pubkey))
defer func() {
if err == nil {
instrumentDuty(duty, pubkey, b.delayFunc(duty.Slot))
instrumentDuty(duty, b.delayFunc(duty.Slot))
}
}()

Expand Down
10 changes: 5 additions & 5 deletions core/bcast/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,19 @@ var broadcastCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "bcast",
Name: "broadcast_total",
Help: "The total count of successfully broadcast duties by pubkey and type",
}, []string{"type", "pubkey"})
Help: "The total count of successfully broadcast duties by type",
}, []string{"duty"})

var broadcastDelay = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "core",
Subsystem: "bcast",
Name: "broadcast_delay_seconds",
Help: "Duty broadcast delay from start of slot in seconds by type",
Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30, 60},
}, []string{"type"})
}, []string{"duty"})

// instrumentDuty increments the broadcast counter and observes the broadcast delay for the duty's type.
func instrumentDuty(duty core.Duty, pubkey core.PubKey, delay time.Duration) {
broadcastCounter.WithLabelValues(duty.Type.String(), pubkey.String()).Inc()
func instrumentDuty(duty core.Duty, delay time.Duration) {
broadcastCounter.WithLabelValues(duty.Type.String()).Inc()
broadcastDelay.WithLabelValues(duty.Type.String()).Observe(delay.Seconds())
}
2 changes: 1 addition & 1 deletion core/parsigdb/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ var exitCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Subsystem: "parsigdb",
Name: "exit_total",
Help: "Total number of partially signed voluntary exits per public key",
}, []string{"pubkey"})
}, []string{"pubkey"}) // Ok to use pubkey (high cardinality) here since these are very rare
8 changes: 3 additions & 5 deletions core/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ var (
Namespace: "core",
Subsystem: "scheduler",
Name: "duty_total",
Help: "The total count of duties scheduled by pubkey and type",
}, []string{"type", "pubkey"})
Help: "The total count of duties scheduled by type",
}, []string{"duty"})

syncMedianGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "core",
Expand Down Expand Up @@ -74,7 +74,5 @@ func instrumentSlot(slot slot) {

// instrumentDuty increments the duty counter.
func instrumentDuty(duty core.Duty, defSet core.DutyDefinitionSet) {
for pubkey := range defSet {
dutyCounter.WithLabelValues(duty.Type.String(), pubkey.String()).Inc()
}
dutyCounter.WithLabelValues(duty.Type.String()).Add(float64(len(defSet)))
}
3 changes: 2 additions & 1 deletion p2p/p2p.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ func multiAddrViaRelay(relayPeer Peer, peerID peer.ID) (ma.Multiaddr, error) {
return transportAddr.Encapsulate(relayAddr), nil
}

func NewEventCollecter(tcpNode host.Host) lifecycle.HookFuncCtx {
// NewEventCollector returns a lifecycle hook that instruments libp2p events.
func NewEventCollector(tcpNode host.Host) lifecycle.HookFuncCtx {
return func(ctx context.Context) {
sub, err := tcpNode.EventBus().Subscribe(new(event.EvtLocalReachabilityChanged))
if err != nil {
Expand Down
12 changes: 8 additions & 4 deletions testutil/compose/static/grafana/dash_simnet.json
Original file line number Diff line number Diff line change
Expand Up @@ -845,10 +845,12 @@
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"exemplar": true,
"expr": "increase(core_bcast_broadcast_total{job=\"$node\"}[30s]) ",
"interval": "",
"legendFormat": "{{type}}",
"legendFormat": "{{duty}}",
"range": true,
"refId": "A"
},
{
Expand Down Expand Up @@ -1317,7 +1319,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1406,7 +1409,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1461,7 +1465,7 @@
"list": [
{
"current": {
"selected": true,
"selected": false,
"text": [
"node0"
],
Expand Down

0 comments on commit 36be0ad

Please sign in to comment.