From 87b91c1d3dbb87b1825310bff5dc1f2e19390b1b Mon Sep 17 00:00:00 2001 From: corverroos Date: Wed, 15 Jun 2022 18:41:52 +0200 Subject: [PATCH] testutil/compose: poll prometheus alerts (#720) Polls prometheus alerts during `compose auto`. category: test ticket: #631 --- .github/workflows/test.yml | 4 +- testutil/compose/compose/alert.go | 123 ++++++++++-------- testutil/compose/compose/main.go | 7 +- testutil/compose/docker-compose.template | 6 + testutil/compose/static/prometheus/rules.yml | 14 +- .../testdata/TestDockerCompose_run_yml.golden | 6 + 6 files changed, 96 insertions(+), 64 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a9eba416c..adfdc6fc7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,7 +44,7 @@ jobs: restore-keys: | ${{ runner.os }}-go- - run: docker pull consensys/teku:latest - - run: go test -timeout=5m -v github.com/obolnetwork/charon/app -integration -slow + - run: go test -timeout=5m github.com/obolnetwork/charon/app -integration -slow compose_tests: runs-on: ubuntu-latest @@ -62,4 +62,4 @@ jobs: restore-keys: | ${{ runner.os }}-go- - run: GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o testutil/compose/compose # Pre-build current SHA charon binary - - run: go test -v github.com/obolnetwork/charon/testutil/compose/compose -integration -sudo-perms -prebuilt-binary=charon + - run: go test github.com/obolnetwork/charon/testutil/compose/compose -integration -sudo-perms -prebuilt-binary=charon diff --git a/testutil/compose/compose/alert.go b/testutil/compose/compose/alert.go index ad35151c0..a69b187c8 100644 --- a/testutil/compose/compose/alert.go +++ b/testutil/compose/compose/alert.go @@ -16,78 +16,97 @@ package main import ( + "bytes" "context" "encoding/json" - "fmt" - "io" - "net" - "net/http" - - "golang.org/x/sync/errgroup" + "os/exec" + "time" "github.com/obolnetwork/charon/app/errors" "github.com/obolnetwork/charon/app/log" "github.com/obolnetwork/charon/app/z" ) -// 
startAlertCollector starts a server that accepts alert webhooks until the context is closed and returns -// a channel on which the received alert titles will be sent. -func startAlertCollector(ctx context.Context, port int) (chan string, error) { - l, err := net.Listen("tcp", fmt.Sprintf("0.0.0.0:%d", port)) - if err != nil { - return nil, errors.Wrap(err, "new listener") - } - +// startAlertCollector starts a goroutine that polls prometheus alerts until the context is closed and returns +// a channel on which the received alert descriptions will be sent. +func startAlertCollector(ctx context.Context, dir string) (chan string, error) { + dedup := make(map[string]bool) resp := make(chan string, 100) - server := http.Server{ - Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - defer r.Body.Close() - b, err := io.ReadAll(r.Body) - if err != nil { - log.Error(ctx, "Read request body", err) - return - } + go func() { + defer close(resp) + for ctx.Err() == nil { + time.Sleep(time.Second * 5) - wrapper := struct { - Body string `json:"body"` - }{} - if err := json.Unmarshal(b, &wrapper); err != nil { - log.Error(ctx, "Unmarshal body wrapper", err, z.Str("body", string(b))) + cmd := exec.CommandContext(ctx, "docker-compose", "exec", "-T", "curl", "curl", "-s", "http://prometheus:9090/api/v1/rules?type=alert") + cmd.Dir = dir + out, err := cmd.CombinedOutput() + if ctx.Err() != nil { return + } else if err != nil { + log.Error(ctx, "Exec curl alerts", err, z.Str("out", string(out))) + continue } - alert := struct { - Title string `json:"title"` - }{} - if err := json.Unmarshal(b, &alert); err != nil { - log.Error(ctx, "Unmarshal alert", err, z.Str("body", string(b))) - return - } else if alert.Title == "" { - log.Error(ctx, "Alert title empty", err, z.Str("body", string(b))) - return + var alerts promAlerts + if err := json.Unmarshal(bytes.TrimSpace(out), &alerts); err != nil { + resp <- errors.Wrap(err, "unmarshal alerts", z.Str("out", 
string(out))).Error() + continue } - log.Info(ctx, "Received webhook", z.Str("body", string(b)), z.Str("title", alert.Title)) + if alerts.Status != "success" { + resp <- "non success status from prometheus alerts: " + alerts.Status + continue + } - resp <- alert.Title - }), - } + for _, active := range getActiveAlerts(alerts) { + if dedup[active] { + continue + } + dedup[active] = true + log.Info(ctx, "Detected new alert", z.Str("alert", active)) - eg, ctx := errgroup.WithContext(ctx) - eg.Go(func() error { - return server.Serve(l) //nolint:wrapcheck - }) - eg.Go(func() error { - <-ctx.Done() - return server.Close() //nolint:wrapcheck - }) - go func() { - if err := eg.Wait(); !errors.Is(err, context.Canceled) && !errors.Is(err, http.ErrServerClosed) { - log.Error(ctx, "Alert collector", err) + resp <- active + } } - close(resp) }() return resp, nil } + +func getActiveAlerts(alerts promAlerts) []string { + var resp []string + for _, group := range alerts.Data.Groups { + for _, rule := range group.Rules { + for _, alert := range rule.Alerts { + if alert.State != "active" { + continue + } + + resp = append(resp, alert.Annotations.Description) + } + } + } + + return resp +} + +// promAlerts is the json response returned by querying prometheus alerts. +// nolint: revive // Nested structs are ok in this case. 
+type promAlerts struct { + Status string `json:"status"` + Data struct { + Groups []struct { + Name string `json:"name"` + Rules []struct { + Name string `json:"name"` + Alerts []struct { + State string `json:"state"` + Annotations struct { + Description string `json:"description"` + } `json:"annotations"` + } `json:"alerts"` + } `json:"rules"` + } `json:"groups"` + } `json:"data"` +} diff --git a/testutil/compose/compose/main.go b/testutil/compose/compose/main.go index cc24dbf2f..15c96dd4e 100644 --- a/testutil/compose/compose/main.go +++ b/testutil/compose/compose/main.go @@ -195,7 +195,7 @@ func newAutoCmd(tmplCallback func(data *compose.TmplData)) *cobra.Command { defer cancel() } - alerts, err := startAlertCollector(ctx, 26354) + alerts, err := startAlertCollector(ctx, *dir) if err != nil { return err } @@ -204,7 +204,7 @@ func newAutoCmd(tmplCallback func(data *compose.TmplData)) *cobra.Command { _ = execDown(rootCtx, *dir) }() - if err := execUp(ctx, *dir); !errors.Is(err, context.DeadlineExceeded) { + if err := execUp(ctx, *dir); err != nil && !errors.Is(err, context.DeadlineExceeded) { return err } @@ -216,6 +216,7 @@ func newAutoCmd(tmplCallback func(data *compose.TmplData)) *cobra.Command { if fail { return errors.New("alerts detected") } + log.Info(ctx, "No alerts detected") return nil } @@ -315,7 +316,6 @@ func addUpFlag(flags *pflag.FlagSet) *bool { // execUp executes `docker-compose up`. 
func execUp(ctx context.Context, dir string) error { log.Info(ctx, "Executing docker-compose up") - cmd := exec.CommandContext(ctx, "docker-compose", "up", "--remove-orphans", "--build", @@ -325,6 +325,7 @@ func execUp(ctx context.Context, dir string) error { cmd.Dir = dir cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { if ctx.Err() != nil { err = ctx.Err() diff --git a/testutil/compose/docker-compose.template b/testutil/compose/docker-compose.template index b297a0f0d..5c39e831d 100644 --- a/testutil/compose/docker-compose.template +++ b/testutil/compose/docker-compose.template @@ -60,6 +60,12 @@ services: {{end -}} {{end -}} {{if .Monitoring}} + curl: + # Can be used to curl services; e.g. docker-compose exec curl curl http://prometheus:9090/api/v1/rules\?type\=alert + image: curlimages/curl:latest + command: sleep 1d + networks: [compose] + prometheus: image: prom/prometheus:latest {{if .MonitoringPorts}}ports: diff --git a/testutil/compose/static/prometheus/rules.yml b/testutil/compose/static/prometheus/rules.yml index ae856dde3..4d8fe110f 100644 --- a/testutil/compose/static/prometheus/rules.yml +++ b/testutil/compose/static/prometheus/rules.yml @@ -5,40 +5,40 @@ groups: expr: up == 0 for: 15s annotations: - description: "Ensures charon node(s) are available" + description: "Charon {{ $labels.job }} is down" - alert: Error Log Rate expr: app_log_error_total > 0 for: 15s annotations: - description: "Ensures no error logs" + description: "Charon {{ $labels.job }} has a high error rate" - alert: Warn Log Rate expr: increase(app_log_warn_total[30s]) > 2 for: 15s annotations: - description: "Ensures warning log rate is low" + description: "Charon {{ $labels.job }} has a high warning rate" - alert: Validator API Error Rate expr: increase(core_validatorapi_request_error_total{endpoint!="proxy"}[30s]) > 1 for: 15s annotations: - description: "Ensures validator api error rate is very low" + description: "Charon {{ $labels.job }} 
validator API has a high error rate" - alert: Proxy API Error Rate expr: increase(core_validatorapi_request_error_total{endpoint="proxy"}[30s]) > 5 for: 15s annotations: - description: "Ensures proxy api error rate is low" + description: "Charon {{ $labels.job }} proxy API has a high error rate" - alert: Broadcast Duty Rate expr: increase(core_bcast_broadcast_total[30s]) < 0.5 for: 15s annotations: - description: "Ensures broadcast duty rate is not low / is high" + description: "Charon {{ $labels.job }} is not broadcasting enough duties" - alert: Outstanding Duty Rate expr: core_bcast_broadcast_total - core_scheduler_duty_total > 50 for: 15s annotations: - description: "Ensures outstanding duties remain low" + description: "Charon {{ $labels.job }} has too many outstanding duties" diff --git a/testutil/compose/testdata/TestDockerCompose_run_yml.golden b/testutil/compose/testdata/TestDockerCompose_run_yml.golden index 205b1b9f0..21cc13138 100644 --- a/testutil/compose/testdata/TestDockerCompose_run_yml.golden +++ b/testutil/compose/testdata/TestDockerCompose_run_yml.golden @@ -146,6 +146,12 @@ services: volumes: - .:/compose + curl: + # Can be used to curl services; e.g. docker-compose exec curl curl http://prometheus:9090/api/v1/rules\?type\=alert + image: curlimages/curl:latest + command: sleep 1d + networks: [compose] + prometheus: image: prom/prometheus:latest ports: