From e21bc4307ce7b36ed30cc9289126df3eba8448ed Mon Sep 17 00:00:00 2001
From: shawn
Date: Tue, 5 Mar 2024 12:59:54 -0800
Subject: [PATCH] services/horizon: Add new metrics counters for db connection close events (#5225)

---
 services/horizon/CHANGELOG.md |   5 +
 support/db/main.go            |   8 +-
 support/db/metrics.go         |  73 +++++++++-
 support/db/session.go         |  81 ++++++++---
 support/db/session_test.go    | 267 ++++++++++++++++++++++++++++++----
 5 files changed, 369 insertions(+), 65 deletions(-)

diff --git a/services/horizon/CHANGELOG.md b/services/horizon/CHANGELOG.md
index cd411b7120..6bb186b393 100644
--- a/services/horizon/CHANGELOG.md
+++ b/services/horizon/CHANGELOG.md
@@ -3,6 +3,11 @@
 All notable changes to this project will be documented in this
 file. This project adheres to [Semantic Versioning](http://semver.org/).
 
+## unreleased
+
+### Added
+- New `db_error_total` metrics key with labels `ctx_error`, `db_error`, and `db_error_extra` ([5225](https://github.com/stellar/go/pull/5225)).
+
 ## 2.28.3
 
 ### Fixed
diff --git a/support/db/main.go b/support/db/main.go
index 2fb1f18a10..dca23526ee 100644
--- a/support/db/main.go
+++ b/support/db/main.go
@@ -118,10 +118,14 @@ type Session struct {
     // DB is the database connection that queries should be executed against.
     DB *sqlx.DB
 
-    tx        *sqlx.Tx
-    txOptions *sql.TxOptions
+    tx            *sqlx.Tx
+    txOptions     *sql.TxOptions
+    errorHandlers []ErrorHandlerFunc
 }
 
+// ErrorHandlerFunc is a callback invoked with dbErr, the error returned
+// from the database, and ctx, the caller's context.
+type ErrorHandlerFunc func(dbErr error, ctx context.Context)
 type SessionInterface interface {
     BeginTx(ctx context.Context, opts *sql.TxOptions) error
     Begin(ctx context.Context) error
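// Usage sketch (editor's illustration, not part of the patch): the ErrorHandlerFunc hook
// added to Session above can be wired by any caller that builds its own session. The sqlx
// handle and the logging are assumptions for the example; only db.Session, db.ErrorHandlerFunc
// and (*db.Session).AddErrorHandler come from this change set.
package example

import (
    "context"
    "log"

    "github.com/jmoiron/sqlx"

    "github.com/stellar/go/support/db"
)

// newObservedSession returns a Session that reports every db error it sees.
// The handler only observes errors; the Session still maps them to its package errors.
func newObservedSession(sqlxDB *sqlx.DB) *db.Session {
    sess := &db.Session{DB: sqlxDB}
    sess.AddErrorHandler(func(dbErr error, ctx context.Context) {
        log.Printf("db error: %v (caller ctx err: %v)", dbErr, ctx.Err())
    })
    return sess
}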
diff --git a/support/db/metrics.go b/support/db/metrics.go
index 0726a85f91..5abfe3013a 100644
--- a/support/db/metrics.go
+++ b/support/db/metrics.go
@@ -3,11 +3,13 @@ package db
 import (
     "context"
     "database/sql"
+    "errors"
     "fmt"
     "strings"
     "time"
 
     "github.com/Masterminds/squirrel"
+    "github.com/lib/pq"
     "github.com/prometheus/client_golang/prometheus"
 )
 
@@ -58,6 +60,7 @@ type SessionWithMetrics struct {
     maxLifetimeClosedCounter prometheus.CounterFunc
     roundTripProbe           *roundTripProbe
     roundTripTimeSummary     prometheus.Summary
+    errorCounter             *prometheus.CounterVec
 }
 
 func RegisterMetrics(base *Session, namespace string, sub Subservice, registry *prometheus.Registry) SessionInterface {
@@ -66,6 +69,8 @@
         registry:         registry,
     }
 
+    base.AddErrorHandler(s.handleErrorEvent)
+
     s.queryCounter = prometheus.NewCounterVec(
         prometheus.CounterOpts{
             Namespace:   namespace,
@@ -226,6 +231,18 @@
     )
     registry.MustRegister(s.maxLifetimeClosedCounter)
 
+    s.errorCounter = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Namespace:   namespace,
+            Subsystem:   "db",
+            Name:        "error_total",
+            Help:        "total number of db related errors, details are captured in labels",
+            ConstLabels: prometheus.Labels{"subservice": string(sub)},
+        },
+        []string{"ctx_error", "db_error", "db_error_extra"},
+    )
+    registry.MustRegister(s.errorCounter)
+
     s.roundTripTimeSummary = prometheus.NewSummary(
         prometheus.SummaryOpts{
             Namespace: namespace,
@@ -262,15 +279,10 @@
     s.registry.Unregister(s.maxIdleClosedCounter)
     s.registry.Unregister(s.maxIdleTimeClosedCounter)
     s.registry.Unregister(s.maxLifetimeClosedCounter)
+    s.registry.Unregister(s.errorCounter)
     return s.SessionInterface.Close()
 }
 
-// TODO: Implement these
-// func (s *SessionWithMetrics) BeginTx(ctx context.Context, opts *sql.TxOptions) error {
-// func (s *SessionWithMetrics) Begin(ctx context.Context) error {
-// func (s *SessionWithMetrics) Commit(ctx context.Context) error
-// func (s *SessionWithMetrics) Rollback(ctx context.Context) error
-
 func (s *SessionWithMetrics) TruncateTables(ctx context.Context, tables []string) (err error) {
     timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
         s.queryDurationSummary.With(prometheus.Labels{
@@ -314,6 +326,7 @@ func (s *SessionWithMetrics) Clone() SessionInterface {
         maxIdleClosedCounter:     s.maxIdleClosedCounter,
         maxIdleTimeClosedCounter: s.maxIdleTimeClosedCounter,
         maxLifetimeClosedCounter: s.maxLifetimeClosedCounter,
+        errorCounter:             s.errorCounter,
     }
 }
 
@@ -356,6 +369,53 @@ func getQueryType(ctx context.Context, query squirrel.Sqlizer) QueryType {
     return UndefinedQueryType
 }
 
+// derive the db 'error_total' metric from the err returned by libpq
+//
+// dbErr - the error returned by any libpq method call
+// ctx - the caller's context used on the libpq method call
+func (s *SessionWithMetrics) handleErrorEvent(dbErr error, ctx context.Context) {
+    if dbErr == nil || s.NoRows(dbErr) {
+        return
+    }
+
+    ctxError := "n/a"
+    dbError := "other"
+    errorExtra := "n/a"
+    var pqErr *pq.Error
+
+    switch {
+    case errors.As(dbErr, &pqErr):
+        dbError = string(pqErr.Code)
+        switch pqErr.Message {
+        case "canceling statement due to user request":
+            errorExtra = "user_request"
+        case "canceling statement due to statement timeout":
+            errorExtra = "statement_timeout"
+        }
+    case strings.Contains(dbErr.Error(), "driver: bad connection"):
+        dbError = "driver_bad_connection"
+    case strings.Contains(dbErr.Error(), "sql: transaction has already been committed or rolled back"):
+        dbError = "tx_already_rollback"
+    case errors.Is(dbErr, context.Canceled):
+        dbError = "canceled"
+    case errors.Is(dbErr, context.DeadlineExceeded):
+        dbError = "deadline_exceeded"
+    }
+
+    switch {
+    case errors.Is(ctx.Err(), context.Canceled):
+        ctxError = "canceled"
+    case errors.Is(ctx.Err(), context.DeadlineExceeded):
+        ctxError = "deadline_exceeded"
+    }
+
+    s.errorCounter.With(prometheus.Labels{
+        "ctx_error":      ctxError,
+        "db_error":       dbError,
+        "db_error_extra": errorExtra,
+    }).Inc()
+}
+
 func (s *SessionWithMetrics) Get(ctx context.Context, dest interface{}, query squirrel.Sqlizer) (err error) {
     queryType := string(getQueryType(ctx, query))
     timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
@@ -373,7 +433,6 @@ func (s *SessionWithMetrics) Get(ctx context.Context, dest interface{}, query sq
             "route":      contextRoute(ctx),
         }).Inc()
     }()
-
     err = s.SessionInterface.Get(ctx, dest, query)
     return err
 }
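// Usage sketch (editor's illustration, not part of the patch): RegisterMetrics already existed;
// this change makes it install handleErrorEvent on the base session, so every error observed
// through the returned SessionInterface increments <namespace>_db_error_total with the
// ctx_error/db_error/db_error_extra labels. The namespace "horizon" and subservice "ingest"
// below are placeholder values.
package example

import (
    "github.com/prometheus/client_golang/prometheus"

    "github.com/stellar/go/support/db"
)

func instrumentedSession(base *db.Session) (db.SessionInterface, *prometheus.Registry) {
    registry := prometheus.NewRegistry()
    sess := db.RegisterMetrics(base, "horizon", "ingest", registry)
    // e.g. a PromQL expression such as
    //   sum by (db_error) (rate(horizon_db_error_total[5m]))
    // would then break db errors down by Postgres error code / driver condition.
    return sess, registry
}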
diff --git a/support/db/session.go b/support/db/session.go
index 4ad0bc86b5..472fc40a37 100644
--- a/support/db/session.go
+++ b/support/db/session.go
@@ -3,6 +3,7 @@ package db
 import (
     "context"
     "database/sql"
+    go_errors "errors"
     "fmt"
     "reflect"
     "strings"
@@ -10,6 +11,7 @@
 
     sq "github.com/Masterminds/squirrel"
     "github.com/jmoiron/sqlx"
+    "github.com/lib/pq"
     "github.com/stellar/go/support/db/sqlutils"
     "github.com/stellar/go/support/errors"
     "github.com/stellar/go/support/log"
@@ -23,7 +25,7 @@ func (s *Session) Begin(ctx context.Context) error {
 
     tx, err := s.DB.BeginTxx(ctx, nil)
     if err != nil {
-        if knownErr := s.replaceWithKnownError(err, ctx); knownErr != nil {
+        if knownErr := s.handleError(err, ctx); knownErr != nil {
             return knownErr
         }
 
@@ -44,7 +46,7 @@ func (s *Session) BeginTx(ctx context.Context, opts *sql.TxOptions) error {
 
     tx, err := s.DB.BeginTxx(ctx, opts)
     if err != nil {
-        if knownErr := s.replaceWithKnownError(err, ctx); knownErr != nil {
+        if knownErr := s.handleError(err, ctx); knownErr != nil {
             return knownErr
         }
 
@@ -92,7 +94,7 @@ func (s *Session) Commit() error {
     s.tx = nil
     s.txOptions = nil
 
-    if knownErr := s.replaceWithKnownError(err, context.Background()); knownErr != nil {
+    if knownErr := s.handleError(err, context.Background()); knownErr != nil {
         return knownErr
     }
     return err
@@ -146,7 +148,7 @@ func (s *Session) GetRaw(ctx context.Context, dest interface{}, query string, ar
         return nil
     }
 
-    if knownErr := s.replaceWithKnownError(err, ctx); knownErr != nil {
+    if knownErr := s.handleError(err, ctx); knownErr != nil {
         return knownErr
     }
 
@@ -215,7 +217,7 @@ func (s *Session) ExecRaw(ctx context.Context, query string, args ...interface{}
         return result, nil
     }
 
-    if knownErr := s.replaceWithKnownError(err, ctx); knownErr != nil {
+    if knownErr := s.handleError(err, ctx); knownErr != nil {
         return nil, knownErr
     }
 
@@ -232,29 +234,60 @@ func (s *Session) NoRows(err error) bool {
     return err == sql.ErrNoRows
 }
 
-// replaceWithKnownError tries to replace Postgres error with package error.
-// Returns a new error if the err is known.
-func (s *Session) replaceWithKnownError(err error, ctx context.Context) error {
-    if err == nil {
+func (s *Session) AddErrorHandler(handler ErrorHandlerFunc) {
+    s.errorHandlers = append(s.errorHandlers, handler)
+}
+
+// handleError does housekeeping on errors returned from the db.
+// dbErr - the libpq client error
+// ctx - the calling context
+//
+// It tries to replace dbErr with a horizon package error and returns the new error if dbErr is known.
+// It also invokes any additional error handlers that have been
+// added to the session, passing them the caller's context.
+func (s *Session) handleError(dbErr error, ctx context.Context) error {
+    if dbErr == nil {
         return nil
     }
 
+    for _, handler := range s.errorHandlers {
+        handler(dbErr, ctx)
+    }
+
+    var dbErrorCode pq.ErrorCode
+    var pqErr *pq.Error
+
+    // if libpq has sent the statement to the server and a server side error is reported,
+    // libpq passes back only a pq.Error carrying the server's error code.
+    // even if the caller's context is cancelled or deadlined during that server trip,
+    // libpq still returns just the pq.Error as a non-wrapped error
+    if go_errors.As(dbErr, &pqErr) {
+        dbErrorCode = pqErr.Code
+    }
+
     switch {
-    case ctx.Err() == context.Canceled:
-        return ErrCancelled
-    case ctx.Err() == context.DeadlineExceeded:
-        // if libpq waits too long to obtain conn from pool, can get ctx timeout before server trip
-        return ErrTimeout
-    case strings.Contains(err.Error(), "pq: canceling statement due to user request"):
-        return ErrTimeout
-    case strings.Contains(err.Error(), "pq: canceling statement due to conflict with recovery"):
+    case strings.Contains(dbErr.Error(), "pq: canceling statement due to conflict with recovery"):
         return ErrConflictWithRecovery
-    case strings.Contains(err.Error(), "driver: bad connection"):
+    case strings.Contains(dbErr.Error(), "driver: bad connection"):
         return ErrBadConnection
-    case strings.Contains(err.Error(), "pq: canceling statement due to statement timeout"):
-        return ErrStatementTimeout
-    case strings.Contains(err.Error(), "transaction has already been committed or rolled back"):
+    case strings.Contains(dbErr.Error(), "transaction has already been committed or rolled back"):
         return ErrAlreadyRolledback
+    case go_errors.Is(ctx.Err(), context.Canceled):
+        // when horizon's context is cancelled by its upstream api client,
+        // the cancellation propagates here and libpq emits a wrapped err that carries the cancel err
+        return ErrCancelled
+    case go_errors.Is(ctx.Err(), context.DeadlineExceeded):
+        // when horizon's context times out (it is set to the app's connection-timeout),
+        // libpq emits a wrapped err that carries the deadline err
+        return ErrTimeout
+    case dbErrorCode == "57014":
+        // https://www.postgresql.org/docs/12/errcodes-appendix.html, query_canceled
+        // this code can be generated in multiple cases:
+        // by libpq signaling the server when it sees a context cancel/deadline,
+        // or by the server itself enforcing its statement_timeout setting.
+        // since the context cancel/deadline cases are checked first, reaching here means
+        // the code can only have come from a statement timeout
+        return ErrStatementTimeout
     default:
         return nil
     }
@@ -284,7 +317,7 @@ func (s *Session) QueryRaw(ctx context.Context, query string, args ...interface{
         return result, nil
     }
 
-    if knownErr := s.replaceWithKnownError(err, ctx); knownErr != nil {
+    if knownErr := s.handleError(err, ctx); knownErr != nil {
         return nil, knownErr
     }
 
@@ -318,7 +351,7 @@ func (s *Session) Rollback() error {
     s.tx = nil
     s.txOptions = nil
 
-    if knownErr := s.replaceWithKnownError(err, context.Background()); knownErr != nil {
+    if knownErr := s.handleError(err, context.Background()); knownErr != nil {
         return knownErr
     }
     return err
@@ -362,7 +395,7 @@ func (s *Session) SelectRaw(
         return nil
     }
 
-    if knownErr := s.replaceWithKnownError(err, ctx); knownErr != nil {
+    if knownErr := s.handleError(err, ctx); knownErr != nil {
         return knownErr
     }
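// Usage sketch (editor's illustration, not part of the patch): handleError keeps returning the
// package's sentinel errors, so callers can branch on them directly. The retry policy below is
// an assumption for the example; only the Err* values come from this package.
package example

import (
    "errors"

    "github.com/stellar/go/support/db"
)

// shouldRetry reports whether a failed db call is worth retrying on a fresh
// context/connection, based on the error classification done by handleError.
func shouldRetry(err error) bool {
    switch {
    case err == nil:
        return false
    case errors.Is(err, db.ErrCancelled):
        // the caller's own context was cancelled; retrying will not help
        return false
    case errors.Is(err, db.ErrTimeout), errors.Is(err, db.ErrStatementTimeout):
        // context deadline or server-side statement_timeout; retry with a longer deadline
        return true
    case errors.Is(err, db.ErrBadConnection):
        // the driver marked the connection bad; a retry picks up a new pooled connection
        return true
    default:
        return false
    }
}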
diff --git a/support/db/session_test.go b/support/db/session_test.go
index 8629b2ca7e..1fd2a3902b 100644
--- a/support/db/session_test.go
+++ b/support/db/session_test.go
@@ -2,33 +2,72 @@ package db
 
 import (
     "context"
+    "sync"
     "testing"
     "time"
 
+    //"github.com/lib/pq"
+    "github.com/lib/pq"
+    "github.com/prometheus/client_golang/prometheus"
     "github.com/stellar/go/support/db/dbtest"
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
 )
 
-func TestServerTimeout(t *testing.T) {
+func TestContextTimeoutDuringSql(t *testing.T) {
     db := dbtest.Postgres(t).Load(testSchema)
     defer db.Close()
 
     var cancel context.CancelFunc
     ctx := context.Background()
-    ctx, cancel = context.WithTimeout(ctx, time.Duration(1))
+    ctx, cancel = context.WithTimeout(ctx, 2*time.Second)
     assert := assert.New(t)
 
-    sess := &Session{DB: db.Open()}
-    defer sess.DB.Close()
+    sessRaw := &Session{DB: db.Open()}
+    reg := prometheus.NewRegistry()
+
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    defer sess.Close()
     defer cancel()
 
     var count int
-    err := sess.GetRaw(ctx, &count, "SELECT pg_sleep(2), COUNT(*) FROM people")
-    assert.ErrorIs(err, ErrTimeout, "long running db server operation past context timeout, should return timeout")
+    var wg sync.WaitGroup
+    wg.Add(1)
+
+    go func() {
+        err := sess.GetRaw(ctx, &count, "SELECT pg_sleep(5) FROM people")
+        assert.ErrorIs(err, ErrTimeout, "long running db server operation past context timeout, should return timeout")
+        wg.Done()
+    }()
+
+    require.Eventually(t, func() bool { wg.Wait(); return true }, 5*time.Second, time.Second)
+    assertDbErrorMetrics(reg, "deadline_exceeded", "57014", "user_request", assert)
+}
+
+func TestContextTimeoutBeforeSql(t *testing.T) {
+    db := dbtest.Postgres(t).Load(testSchema)
+    defer db.Close()
+
+    var cancel context.CancelFunc
+    ctx := context.Background()
+    ctx, cancel = context.WithTimeout(ctx, time.Millisecond)
+    assert := assert.New(t)
+
+    sessRaw := &Session{DB: db.Open()}
+    reg := prometheus.NewRegistry()
+
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    defer sess.Close()
+    defer cancel()
+
+    var count int
+    time.Sleep(500 * time.Millisecond)
+    err := sess.GetRaw(ctx, &count, "SELECT pg_sleep(5) FROM people")
+    assert.ErrorIs(err, ErrTimeout, "any db server operation should return error immediately if context already timed out")
+    assertDbErrorMetrics(reg, "deadline_exceeded", "deadline_exceeded", "n/a", assert)
 }
 
-func TestUserCancel(t *testing.T) {
+func TestContextCancelledBeforeSql(t *testing.T) {
     db := dbtest.Postgres(t).Load(testSchema)
     defer db.Close()
 
@@ -37,14 +76,68 @@
     ctx, cancel = context.WithCancel(ctx)
     assert := assert.New(t)
 
-    sess := &Session{DB: db.Open()}
-    defer sess.DB.Close()
+    sessRaw := &Session{DB: db.Open()}
+    reg := prometheus.NewRegistry()
+
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    defer sess.Close()
     defer cancel()
 
     var count int
     cancel()
     err := sess.GetRaw(ctx, &count, "SELECT pg_sleep(2), COUNT(*) FROM people")
-    assert.ErrorIs(err, ErrCancelled, "any ongoing db server operation should return error immediately after user cancel")
+    assert.ErrorIs(err, ErrCancelled, "any db server operation should return an error immediately if the user has already cancelled")
+    assertDbErrorMetrics(reg, "canceled", "canceled", "n/a", assert)
+}
+
+func TestContextCancelDuringSql(t *testing.T) {
+    db := dbtest.Postgres(t).Load(testSchema)
+    defer db.Close()
+
+    var cancel context.CancelFunc
+    ctx := context.Background()
+    ctx, cancel = context.WithCancel(ctx)
+    assert := assert.New(t)
+
+    sessRaw := &Session{DB: db.Open()}
+    reg := prometheus.NewRegistry()
+
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    defer sess.Close()
+    defer cancel()
+
+    var count int
+    var wg sync.WaitGroup
+    wg.Add(1)
+
+    go func() {
+        err := sess.GetRaw(ctx, &count, "SELECT pg_sleep(5) FROM people")
+        assert.ErrorIs(err, ErrCancelled, "any ongoing db server operation should return error immediately after user cancel")
+        wg.Done()
+    }()
+    time.Sleep(time.Second)
+    cancel()
+
+    require.Eventually(t, func() bool { wg.Wait(); return true }, 5*time.Second, time.Second)
+    assertDbErrorMetrics(reg, "canceled", "57014", "user_request", assert)
+}
+
+func TestStatementTimeout(t *testing.T) {
+    assert := assert.New(t)
+    db := dbtest.Postgres(t).Load(testSchema)
+    defer db.Close()
+
+    sessRaw, err := Open(db.Dialect, db.DSN, StatementTimeout(50*time.Millisecond))
+    reg := prometheus.NewRegistry()
+
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    assert.NoError(err)
+    defer sess.Close()
+
+    var count int
+    err = sess.GetRaw(context.Background(), &count, "SELECT pg_sleep(2) FROM people")
+    assert.ErrorIs(err, ErrStatementTimeout)
+    assertDbErrorMetrics(reg, "n/a", "57014", "statement_timeout", assert)
 }
 
 func TestSession(t *testing.T) {
@@ -54,10 +147,13 @@
     ctx := context.Background()
     assert := assert.New(t)
     require := require.New(t)
-    sess := &Session{DB: db.Open()}
-    defer sess.DB.Close()
+    sessRaw := &Session{DB: db.Open()}
+    reg := prometheus.NewRegistry()
 
-    assert.Equal("postgres", sess.Dialect())
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    defer sess.Close()
+
+    assert.Equal("postgres", sessRaw.Dialect())
 
     var count int
     err := sess.GetRaw(ctx, &count, "SELECT COUNT(*) FROM people")
@@ -124,61 +220,168 @@
     assert.Len(names, 2)
 
     // Test ReplacePlaceholders
-    out, err := sess.ReplacePlaceholders("? = ? = ? = ??")
+    out, err := sessRaw.ReplacePlaceholders("? = ? = ? = ??")
     if assert.NoError(err) {
         assert.Equal("$1 = $2 = $3 = ?", out)
     }
+
+    assertZeroErrorMetrics(reg, assert)
 }
 
-func TestStatementTimeout(t *testing.T) {
+func TestIdleTransactionTimeout(t *testing.T) {
     assert := assert.New(t)
     db := dbtest.Postgres(t).Load(testSchema)
     defer db.Close()
 
-    sess, err := Open(db.Dialect, db.DSN, StatementTimeout(50*time.Millisecond))
+    sessRaw, err := Open(db.Dialect, db.DSN, IdleTransactionTimeout(50*time.Millisecond))
     assert.NoError(err)
+    reg := prometheus.NewRegistry()
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
     defer sess.Close()
 
+    assert.NoError(sess.Begin(context.Background()))
+    <-time.After(150 * time.Millisecond)
+
     var count int
-    err = sess.GetRaw(context.Background(), &count, "SELECT pg_sleep(2), COUNT(*) FROM people")
-    assert.ErrorIs(err, ErrStatementTimeout)
+    err = sess.GetRaw(context.Background(), &count, "SELECT COUNT(*) FROM people")
+    assert.ErrorIs(err, ErrBadConnection)
+    assertDbErrorMetrics(reg, "n/a", "driver_bad_connection", "n/a", assert)
 }
 
-func TestIdleTransactionTimeout(t *testing.T) {
+func TestIdleTransactionTimeoutAndContextTimeout(t *testing.T) {
     assert := assert.New(t)
     db := dbtest.Postgres(t).Load(testSchema)
     defer db.Close()
 
-    sess, err := Open(db.Dialect, db.DSN, IdleTransactionTimeout(50*time.Millisecond))
+    var cancel context.CancelFunc
+    ctx := context.Background()
+    ctx, cancel = context.WithTimeout(ctx, 150*time.Millisecond)
+
+    sessRaw, err := Open(db.Dialect, db.DSN, IdleTransactionTimeout(100*time.Millisecond))
     assert.NoError(err)
+    reg := prometheus.NewRegistry()
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
     defer sess.Close()
+    defer cancel()
 
+    var wg sync.WaitGroup
+    wg.Add(1)
     assert.NoError(sess.Begin(context.Background()))
-    <-time.After(150 * time.Millisecond)
+    <-time.After(200 * time.Millisecond)
 
-    var count int
-    err = sess.GetRaw(context.Background(), &count, "SELECT COUNT(*) FROM people")
-    assert.ErrorIs(err, ErrBadConnection)
+    go func() {
+        _, err := sess.ExecRaw(ctx, "SELECT pg_sleep(5) FROM people")
+        assert.ErrorIs(err, ErrTimeout, "long running db server operation past context timeout, should return timeout")
+        wg.Done()
+    }()
+
+    require.Eventually(t, func() bool { wg.Wait(); return true }, 5*time.Second, time.Second)
+    // this demonstrates a subtlety of libpq error handling:
+    // first, a server session was created
+    // after 100ms the idle-transaction timeout fired on the server side; the server sent a signal back to libpq, which marks the session locally as bad
+    // at 150ms the caller's ctx deadlined
+    // the caller then invokes libpq and tries to submit a sql statement on the now-closed session with the deadlined ctx
+    // libpq only reports a deadline exceeded error; it will not emit driver_bad_connection for the closed server session
+    assertDbErrorMetrics(reg, "deadline_exceeded", "deadline_exceeded", "n/a", assert)
+}
+
+func TestDbServerErrorCodeInMetrics(t *testing.T) {
+    assert := assert.New(t)
+    db := dbtest.Postgres(t).Load(testSchema)
+    defer db.Close()
+
+    sessRaw := &Session{DB: db.Open()}
+    reg := prometheus.NewRegistry()
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+
+    defer sess.Close()
+    var pqErr *pq.Error
+
+    _, err := sess.ExecRaw(context.Background(), "oops, invalid sql")
+    assert.ErrorAs(err, &pqErr)
+    assertDbErrorMetrics(reg, "n/a", "42601", "n/a", assert)
 }
 
-func TestSessionRollbackAfterContextCanceled(t *testing.T) {
+func TestDbOtherErrorInMetrics(t *testing.T) {
+    assert := assert.New(t)
     db := dbtest.Postgres(t).Load(testSchema)
     defer db.Close()
 
-    sess := setupRolledbackTx(t, db)
-    defer sess.DB.Close()
+    conn := db.Open()
+    conn.Close()
+    sessRaw := &Session{DB: conn}
+    reg := prometheus.NewRegistry()
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+
+    defer sess.Close()
 
-    assert.ErrorIs(t, sess.Rollback(), ErrAlreadyRolledback)
+    var count int
+    err := sess.GetRaw(context.Background(), &count, "SELECT COUNT(*) FROM people")
+    assert.ErrorContains(err, "sql: database is closed")
+    assertDbErrorMetrics(reg, "n/a", "other", "n/a", assert)
 }
 
-func TestSessionCommitAfterContextCanceled(t *testing.T) {
+func TestSessionAfterRollback(t *testing.T) {
+    assert := assert.New(t)
     db := dbtest.Postgres(t).Load(testSchema)
     defer db.Close()
 
-    sess := setupRolledbackTx(t, db)
-    defer sess.DB.Close()
+    sessRaw := setupRolledbackTx(t, db)
+    reg := prometheus.NewRegistry()
+    sess := RegisterMetrics(sessRaw, "test", "subtest", reg)
+    defer sess.Close()
+
+    var count int
+    err := sess.GetRaw(context.Background(), &count, "SELECT COUNT(*) FROM people")
+    assert.ErrorIs(err, ErrAlreadyRolledback)
+    assertDbErrorMetrics(reg, "n/a", "tx_already_rollback", "n/a", assert)
+}
 
-    assert.ErrorIs(t, sess.Commit(), ErrAlreadyRolledback)
+func assertZeroErrorMetrics(reg *prometheus.Registry, assert *assert.Assertions) {
+    metrics, err := reg.Gather()
+    assert.NoError(err)
+
+    for _, metricFamily := range metrics {
+        if metricFamily.GetName() == "test_db_error_total" {
+            assert.Fail("error_total metrics should not be present, never incremented")
+        }
+    }
+
+}
+
+func assertDbErrorMetrics(reg *prometheus.Registry, assertCtxError, assertDbError, assertDbErrorExtra string, assert *assert.Assertions) {
+    metrics, err := reg.Gather()
+    assert.NoError(err)
+
+    for _, metricFamily := range metrics {
+        if metricFamily.GetName() == "test_db_error_total" {
+            assert.Len(metricFamily.GetMetric(), 1)
+            assert.Equal(metricFamily.GetMetric()[0].GetCounter().GetValue(), float64(1))
+            var ctxError = ""
+            var dbError = ""
+            var dbErrorExtra = ""
+            for _, label := range metricFamily.GetMetric()[0].GetLabel() {
+                if label.GetName() == "ctx_error" {
+                    ctxError = label.GetValue()
+                }
+                if label.GetName() == "db_error" {
+                    dbError = label.GetValue()
+                }
+                if label.GetName() == "db_error_extra" {
+                    dbErrorExtra = label.GetValue()
+                }
+            }
+
+            assert.Equal(ctxError, assertCtxError)
+            assert.Equal(dbError, assertDbError)
+            assert.Equal(dbErrorExtra, assertDbErrorExtra)
+            return
+        }
+    }
+    assert.Fail("error_total metrics were not correct")
 }
 
 func setupRolledbackTx(t *testing.T, db *dbtest.DB) *Session {