Skip to content

Commit

Permalink
testutil/compose: poll prometheus alerts (#720)
Browse files Browse the repository at this point in the history
Polls prometheus alerts during `compose auto`.

category: test
ticket: #631
  • Loading branch information
corverroos committed Jun 15, 2022
1 parent 3998a8e commit 87b91c1
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 64 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
restore-keys: |
${{ runner.os }}-go-
- run: docker pull consensys/teku:latest
- run: go test -timeout=5m -v github.com/obolnetwork/charon/app -integration -slow
- run: go test -timeout=5m github.com/obolnetwork/charon/app -integration -slow

compose_tests:
runs-on: ubuntu-latest
Expand All @@ -62,4 +62,4 @@ jobs:
restore-keys: |
${{ runner.os }}-go-
- run: GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o testutil/compose/compose # Pre-build current SHA charon binary
- run: go test -v github.com/obolnetwork/charon/testutil/compose/compose -integration -sudo-perms -prebuilt-binary=charon
- run: go test github.com/obolnetwork/charon/testutil/compose/compose -integration -sudo-perms -prebuilt-binary=charon
123 changes: 71 additions & 52 deletions testutil/compose/compose/alert.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,78 +16,97 @@
package main

import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"

"golang.org/x/sync/errgroup"
"os/exec"
"time"

"github.com/obolnetwork/charon/app/errors"
"github.com/obolnetwork/charon/app/log"
"github.com/obolnetwork/charon/app/z"
)

// startAlertCollector starts a server that accepts alert webhooks until the context is closed and returns
// a channel on which the received alert titles will be sent.
func startAlertCollector(ctx context.Context, port int) (chan string, error) {
l, err := net.Listen("tcp", fmt.Sprintf("0.0.0.0:%d", port))
if err != nil {
return nil, errors.Wrap(err, "new listener")
}

// startAlertCollector starts a goroutine that polls prometheus alerts until the context is closed and returns
// a channel on which the received alert descriptions will be sent.
func startAlertCollector(ctx context.Context, dir string) (chan string, error) {
dedup := make(map[string]bool)
resp := make(chan string, 100)
server := http.Server{
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()

b, err := io.ReadAll(r.Body)
if err != nil {
log.Error(ctx, "Read request body", err)
return
}
go func() {
defer close(resp)
for ctx.Err() == nil {
time.Sleep(time.Second * 5)

wrapper := struct {
Body string `json:"body"`
}{}
if err := json.Unmarshal(b, &wrapper); err != nil {
log.Error(ctx, "Unmarshal body wrapper", err, z.Str("body", string(b)))
cmd := exec.CommandContext(ctx, "docker-compose", "exec", "-T", "curl", "curl", "-s", "http://prometheus:9090/api/v1/rules?type=alert")
cmd.Dir = dir
out, err := cmd.CombinedOutput()
if ctx.Err() != nil {
return
} else if err != nil {
log.Error(ctx, "Exec curl alerts", err, z.Str("out", string(out)))
continue
}

alert := struct {
Title string `json:"title"`
}{}
if err := json.Unmarshal(b, &alert); err != nil {
log.Error(ctx, "Unmarshal alert", err, z.Str("body", string(b)))
return
} else if alert.Title == "" {
log.Error(ctx, "Alert title empty", err, z.Str("body", string(b)))
return
var alerts promAlerts
if err := json.Unmarshal(bytes.TrimSpace(out), &alerts); err != nil {
resp <- errors.Wrap(err, "unmarshal alerts", z.Str("out", string(out))).Error()
continue
}

log.Info(ctx, "Received webhook", z.Str("body", string(b)), z.Str("title", alert.Title))
if alerts.Status != "success" {
resp <- "non success status from prometheus alerts: " + alerts.Status
continue
}

resp <- alert.Title
}),
}
for _, active := range getActiveAlerts(alerts) {
if dedup[active] {
continue
}
dedup[active] = true
log.Info(ctx, "Detected new alert", z.Str("alert", active))

eg, ctx := errgroup.WithContext(ctx)
eg.Go(func() error {
return server.Serve(l) //nolint:wrapcheck
})
eg.Go(func() error {
<-ctx.Done()
return server.Close() //nolint:wrapcheck
})
go func() {
if err := eg.Wait(); !errors.Is(err, context.Canceled) && !errors.Is(err, http.ErrServerClosed) {
log.Error(ctx, "Alert collector", err)
resp <- active
}
}
close(resp)
}()

return resp, nil
}

// getActiveAlerts returns the description annotation of every firing alert
// contained in the provided prometheus rules response, in the order the
// alerts appear (group by group, rule by rule).
//
// The prometheus rules API reports each alert instance with a state of either
// "pending" (the expression matched but the rule's `for` duration has not
// elapsed yet) or "firing". Only firing alerts are returned; pending alerts may
// still resolve before they trigger. It returns nil if nothing is firing.
func getActiveAlerts(alerts promAlerts) []string {
	var resp []string
	for _, group := range alerts.Data.Groups {
		for _, rule := range group.Rules {
			for _, alert := range rule.Alerts {
				// Prometheus never reports an "active" state on this endpoint,
				// so matching on it would silently drop every alert.
				if alert.State != "firing" {
					continue
				}

				resp = append(resp, alert.Annotations.Description)
			}
		}
	}

	return resp
}

// promAlerts is the json response returned by querying prometheus alerts.
// Only the fields read by the alert collector are declared: the top-level
// status and, per group and rule, the state and description annotation of
// each alert.
// nolint: revive // Nested structs are ok in this case.
type promAlerts struct {
// Status is the top-level "status" field; the collector expects "success".
Status string `json:"status"`
// Data wraps the rule groups contained in the response.
Data struct {
// Groups lists the rule groups.
Groups []struct {
Name string `json:"name"`
// Rules lists the rules of this group.
Rules []struct {
Name string `json:"name"`
// Alerts lists the alert instances reported for this rule.
Alerts []struct {
// State is the alert's "state" string as reported in the response.
State string `json:"state"`
Annotations struct {
// Description is the rule's "description" annotation; it is the text
// the collector reports for a detected alert.
Description string `json:"description"`
} `json:"annotations"`
} `json:"alerts"`
} `json:"rules"`
} `json:"groups"`
} `json:"data"`
}
7 changes: 4 additions & 3 deletions testutil/compose/compose/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ func newAutoCmd(tmplCallback func(data *compose.TmplData)) *cobra.Command {
defer cancel()
}

alerts, err := startAlertCollector(ctx, 26354)
alerts, err := startAlertCollector(ctx, *dir)
if err != nil {
return err
}
Expand All @@ -204,7 +204,7 @@ func newAutoCmd(tmplCallback func(data *compose.TmplData)) *cobra.Command {
_ = execDown(rootCtx, *dir)
}()

if err := execUp(ctx, *dir); !errors.Is(err, context.DeadlineExceeded) {
if err := execUp(ctx, *dir); err != nil && !errors.Is(err, context.DeadlineExceeded) {
return err
}

Expand All @@ -216,6 +216,7 @@ func newAutoCmd(tmplCallback func(data *compose.TmplData)) *cobra.Command {
if fail {
return errors.New("alerts detected")
}
log.Info(ctx, "No alerts detected")

return nil
}
Expand Down Expand Up @@ -315,7 +316,6 @@ func addUpFlag(flags *pflag.FlagSet) *bool {
// execUp executes `docker-compose up`.
func execUp(ctx context.Context, dir string) error {
log.Info(ctx, "Executing docker-compose up")

cmd := exec.CommandContext(ctx, "docker-compose", "up",
"--remove-orphans",
"--build",
Expand All @@ -325,6 +325,7 @@ func execUp(ctx context.Context, dir string) error {
cmd.Dir = dir
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr

if err := cmd.Run(); err != nil {
if ctx.Err() != nil {
err = ctx.Err()
Expand Down
6 changes: 6 additions & 0 deletions testutil/compose/docker-compose.template
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ services:
{{end -}}
{{end -}}
{{if .Monitoring}}
curl:
# Can be used to curl services; e.g. docker-compose exec curl curl http://prometheus:9090/api/v1/rules\?type\=alert
image: curlimages/curl:latest
command: sleep 1d
networks: [compose]

prometheus:
image: prom/prometheus:latest
{{if .MonitoringPorts}}ports:
Expand Down
14 changes: 7 additions & 7 deletions testutil/compose/static/prometheus/rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,40 +5,40 @@ groups:
expr: up == 0
for: 15s
annotations:
description: "Ensures charon node(s) are available"
description: "Charon {{ $labels.job }} is down"

- alert: Error Log Rate
expr: app_log_error_total > 0
for: 15s
annotations:
description: "Ensures no error logs"
description: "Charon {{ $labels.job }} has a high error rate"

- alert: Warn Log Rate
expr: increase(app_log_warn_total[30s]) > 2
for: 15s
annotations:
description: "Ensures warning log rate is low"
description: "Charon {{ $labels.job }} has a high warning rate"

- alert: Validator API Error Rate
expr: increase(core_validatorapi_request_error_total{endpoint!="proxy"}[30s]) > 1
for: 15s
annotations:
description: "Ensures validator api error rate is very low"
description: "Charon {{ $labels.job }} validator API has a high error rate"

- alert: Proxy API Error Rate
expr: increase(core_validatorapi_request_error_total{endpoint="proxy"}[30s]) > 5
for: 15s
annotations:
description: "Ensures proxy api error rate is low"
description: "Charon {{ $labels.job }} proxy API has a high error rate"

- alert: Broadcast Duty Rate
expr: increase(core_bcast_broadcast_total[30s]) < 0.5
for: 15s
annotations:
description: "Ensures broadcast duty rate is not low / is high"
description: "Charon {{ $labels.job }} is not broadcasting enough duties"

- alert: Outstanding Duty Rate
expr: core_bcast_broadcast_total - core_scheduler_duty_total > 50
for: 15s
annotations:
description: "Ensures outstanding duties remain low"
description: "Charon {{ $labels.job }} has too many outstanding duties"
6 changes: 6 additions & 0 deletions testutil/compose/testdata/TestDockerCompose_run_yml.golden
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,12 @@ services:
volumes:
- .:/compose

curl:
# Can be used to curl services; e.g. docker-compose exec curl curl http://prometheus:9090/api/v1/rules\?type\=alert
image: curlimages/curl:latest
command: sleep 1d
networks: [compose]

prometheus:
image: prom/prometheus:latest
ports:
Expand Down

0 comments on commit 87b91c1

Please sign in to comment.