From 8910e74a623843ee5dba3815a612089c785a9948 Mon Sep 17 00:00:00 2001 From: Neil Cook Date: Tue, 28 Oct 2025 17:09:24 +0000 Subject: [PATCH 1/6] metrics: Additional metrics for S3 backend - New histogram metric measuring the duration of S3 calls by method - New counter recording errors by method and error type (this is intended to replace the existing metric that just records errors by method) --- backends/s3/metrics.go | 17 +++++++++++++++++ backends/s3/s3.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/backends/s3/metrics.go b/backends/s3/metrics.go index 5ba24db..a935fa7 100644 --- a/backends/s3/metrics.go +++ b/backends/s3/metrics.go @@ -26,10 +26,27 @@ var ( }, []string{"method"}, ) + metricCallErrorsType = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "storage_s3_call_error_by_type_total", + Help: "S3 API call errors by method and error type", + }, + []string{"method", "error"}, + ) + metricCallHistogram = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "storage_s3_call_duration_seconds", + Help: "S3 API call duration by method", + Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60}, + }, + []string{"method"}, + ) ) func init() { prometheus.MustRegister(metricLastCallTimestamp) prometheus.MustRegister(metricCalls) prometheus.MustRegister(metricCallErrors) + prometheus.MustRegister(metricCallErrorsType) + prometheus.MustRegister(metricCallHistogram) } diff --git a/backends/s3/s3.go b/backends/s3/s3.go index c23550b..1b4aa88 100644 --- a/backends/s3/s3.go +++ b/backends/s3/s3.go @@ -236,9 +236,16 @@ func (b *Backend) List(ctx context.Context, prefix string) (blobList simpleblob. return blobs.WithPrefix(prefix), nil } +func recordMinioDurationMetric(method string, start time.Time) { + elapsed := time.Since(start) + metricCallHistogram.WithLabelValues(method).Observe(elapsed.Seconds()) +} + func (b *Backend) doList(ctx context.Context, prefix string) (simpleblob.BlobList, error) { var blobs simpleblob.BlobList + defer recordMinioDurationMetric("list", time.Now()) + // Runes to strip from blob names for GlobalPrefix // This is fine, because we can trust the API to only return with the prefix. // TODO: trust but verify @@ -252,6 +259,7 @@ func (b *Backend) doList(ctx context.Context, prefix string) (simpleblob.BlobLis // Handle error returned by MinIO client if err := convertMinioError(obj.Err, true); err != nil { metricCallErrors.WithLabelValues("list").Inc() + metricCallErrorsType.WithLabelValues("list", errorToMetricsLabel(err)).Inc() return nil, err } @@ -303,9 +311,12 @@ func (b *Backend) doLoadReader(ctx context.Context, name string) (io.ReadCloser, metricCalls.WithLabelValues("load").Inc() metricLastCallTimestamp.WithLabelValues("load").SetToCurrentTime() + defer recordMinioDurationMetric("load", time.Now()) + obj, err := b.client.GetObject(ctx, b.opt.Bucket, name, minio.GetObjectOptions{}) if err = convertMinioError(err, false); err != nil { metricCallErrors.WithLabelValues("load").Inc() + metricCallErrorsType.WithLabelValues("load", errorToMetricsLabel(err)).Inc() return nil, err } if obj == nil { @@ -314,6 +325,7 @@ func (b *Backend) doLoadReader(ctx context.Context, name string) (io.ReadCloser, info, err := obj.Stat() if err = convertMinioError(err, false); err != nil { metricCallErrors.WithLabelValues("load").Inc() + metricCallErrorsType.WithLabelValues("load", errorToMetricsLabel(err)).Inc() return nil, err } if info.Key == "" { @@ -347,6 +359,7 @@ func (b *Backend) doStore(ctx context.Context, name string, data []byte) (minio. func (b *Backend) doStoreReader(ctx context.Context, name string, r io.Reader, size int64) (minio.UploadInfo, error) { metricCalls.WithLabelValues("store").Inc() metricLastCallTimestamp.WithLabelValues("store").SetToCurrentTime() + defer recordMinioDurationMetric("store", time.Now()) putObjectOptions := minio.PutObjectOptions{ NumThreads: b.opt.NumMinioThreads, @@ -358,6 +371,7 @@ func (b *Backend) doStoreReader(ctx context.Context, name string, r io.Reader, s err = convertMinioError(err, false) if err != nil { metricCallErrors.WithLabelValues("store").Inc() + metricCallErrorsType.WithLabelValues("store", errorToMetricsLabel(err)).Inc() } return info, err } @@ -377,10 +391,12 @@ func (b *Backend) Delete(ctx context.Context, name string) error { func (b *Backend) doDelete(ctx context.Context, name string) error { metricCalls.WithLabelValues("delete").Inc() metricLastCallTimestamp.WithLabelValues("delete").SetToCurrentTime() + defer recordMinioDurationMetric("delete", time.Now()) err := b.client.RemoveObject(ctx, b.opt.Bucket, name, minio.RemoveObjectOptions{}) if err = convertMinioError(err, false); err != nil { metricCallErrors.WithLabelValues("delete").Inc() + metricCallErrorsType.WithLabelValues("delete", errorToMetricsLabel(err)).Inc() } return err } @@ -555,6 +571,20 @@ func convertMinioError(err error, isList bool) error { return err } +func errorToMetricsLabel(err error) string { + if err == nil { + return "ok" + } + if errors.Is(err, os.ErrNotExist) { + return "NotFound" + } + if errors.Is(err, context.DeadlineExceeded) { + return "Timeout" + } + errRes := minio.ToErrorResponse(err) + return errRes.Code +} + func getOpt[T comparable](optVal, defaultVal T) T { var zero T if optVal == zero { From f5d24d6802fe23a95c10befc3f93cf6d4b2c7642 Mon Sep 17 00:00:00 2001 From: Neil Cook Date: Wed, 29 Oct 2025 10:42:40 +0000 Subject: [PATCH 2/6] metrics: Add additional error-handling cases to errorToMetricsLabel() --- backends/s3/s3.go | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/backends/s3/s3.go b/backends/s3/s3.go index 1b4aa88..4b60b43 100644 --- a/backends/s3/s3.go +++ b/backends/s3/s3.go @@ -571,6 +571,13 @@ func convertMinioError(err error, isList bool) error { return err } +// errorToMetricsLabel converts an error into a prometheus label. +// If error is a NotExist error, "NotFound" is returned. +// If error is a timeout, "Timeout" is returned. +// If error is a DNS error, the DNS error is returned. +// If error is a URL error, the URL error is returned. +// If error is a MinIO error, the MinIO error code is returned. +// Otherwise "Unknown" is returned. func errorToMetricsLabel(err error) string { if err == nil { return "ok" @@ -578,11 +585,24 @@ func errorToMetricsLabel(err error) string { if errors.Is(err, os.ErrNotExist) { return "NotFound" } - if errors.Is(err, context.DeadlineExceeded) { + var netError *net.OpError + if errors.Is(err, context.DeadlineExceeded) || + (errors.As(err, &netError) && netError.Timeout()) { return "Timeout" } + var dnsErr *net.DNSError + if errors.As(err, &dnsErr) { + return dnsErr.Error() + } + var urlErr *url.Error + if errors.As(err, &urlErr) { + return urlErr.Err.Error() + } errRes := minio.ToErrorResponse(err) - return errRes.Code + if errRes.Code != "" { + return errRes.Code + } + return "Unknown" } func getOpt[T comparable](optVal, defaultVal T) T { From b9afd3942650f7542e0ec22da1f46e16e242849b Mon Sep 17 00:00:00 2001 From: Neil Cook Date: Wed, 29 Oct 2025 10:43:13 +0000 Subject: [PATCH 3/6] metrics: Add tests for metrics, especially error cases --- backends/s3/s3_test.go | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/backends/s3/s3_test.go b/backends/s3/s3_test.go index af0dfb4..323f793 100644 --- a/backends/s3/s3_test.go +++ b/backends/s3/s3_test.go @@ -2,10 +2,12 @@ package s3 import ( "context" + "strings" "testing" "time" "github.com/minio/minio-go/v7" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/testcontainers/testcontainers-go" @@ -64,15 +66,49 @@ func getBackend(ctx context.Context, t *testing.T) (b *Backend) { return b } +func getBadBackend(ctx context.Context, url string, t *testing.T) (b *Backend) { + b, err := New(ctx, Options{ + EndpointURL: url, + AccessKey: "foo", + SecretKey: "bar", + Bucket: "test-bucket", + CreateBucket: false, + DialTimeout: 1 * time.Second, + }) + require.NoError(t, err) + return b +} + func TestBackend(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - b := getBackend(ctx, t) tester.DoBackendTests(t, b) assert.Len(t, b.lastMarker, 0) } +func TestMetrics(t *testing.T) { + bTimeout := getBadBackend(context.Background(), "http://1.2.3.4:1234", t) + + _, err := bTimeout.List(context.Background(), "") + assert.Error(t, err) + + expectedMetric := "# HELP storage_s3_call_error_by_type_total S3 API call errors by method and error type\n# TYPE storage_s3_call_error_by_type_total counter\nstorage_s3_call_error_by_type_total{error=\"Timeout\",method=\"list\"} 1\nstorage_s3_call_error_by_type_total{error=\"NotFound\",method=\"load\"} 3\n" + + err = testutil.CollectAndCompare(metricCallErrorsType, strings.NewReader(expectedMetric), "storage_s3_call_error_by_type_total") + assert.NoError(t, err) + + bBadHost := getBadBackend(context.Background(), "http://nosuchhost:1234", t) + + _, err = bBadHost.List(context.Background(), "") + assert.Error(t, err) + + expectedMetric += "storage_s3_call_error_by_type_total{error=\"lookup nosuchhost: no such host\",method=\"list\"} 1\n" + + err = testutil.CollectAndCompare(metricCallErrorsType, strings.NewReader(expectedMetric), "storage_s3_call_error_by_type_total") + assert.NoError(t, err) +} + func TestBackend_marker(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() From 5eb13f1ff7777d8de5f07f38c5846dd581e41251 Mon Sep 17 00:00:00 2001 From: Neil Cook Date: Wed, 29 Oct 2025 10:47:55 +0000 Subject: [PATCH 4/6] Update go.mod and go.sum with new dependencies from additional metrics --- go.mod | 1 + go.sum | 2 ++ 2 files changed, 3 insertions(+) diff --git a/go.mod b/go.mod index 8cfff84..a93684d 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,7 @@ require ( github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/cpuid/v2 v2.2.11 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/magiconair/properties v1.8.10 // indirect github.com/minio/crc64nvme v1.0.2 // indirect diff --git a/go.sum b/go.sum index a5a69e4..c0b4e3d 100644 --- a/go.sum +++ b/go.sum @@ -74,6 +74,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= From 658fd0b2b0198e010b63499ebf453a1a16db9bd3 Mon Sep 17 00:00:00 2001 From: Neil Cook Date: Wed, 29 Oct 2025 12:29:57 +0000 Subject: [PATCH 5/6] metrics: Simplify the label for dns and url errors to a single string for lower cardinality and better reproducibility in tests --- backends/s3/s3.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/s3/s3.go b/backends/s3/s3.go index 4b60b43..0a3bed8 100644 --- a/backends/s3/s3.go +++ b/backends/s3/s3.go @@ -592,11 +592,11 @@ func errorToMetricsLabel(err error) string { } var dnsErr *net.DNSError if errors.As(err, &dnsErr) { - return dnsErr.Error() + return "DNSError" } var urlErr *url.Error if errors.As(err, &urlErr) { - return urlErr.Err.Error() + return "URLError" } errRes := minio.ToErrorResponse(err) if errRes.Code != "" { From 5df19ddfec574a2d1a8f72f44486f51e66096343 Mon Sep 17 00:00:00 2001 From: Neil Cook Date: Wed, 29 Oct 2025 12:30:33 +0000 Subject: [PATCH 6/6] metrics: Fix metrics test to match simpler DNS error label --- backends/s3/s3_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/s3/s3_test.go b/backends/s3/s3_test.go index 323f793..f1104da 100644 --- a/backends/s3/s3_test.go +++ b/backends/s3/s3_test.go @@ -103,7 +103,7 @@ func TestMetrics(t *testing.T) { _, err = bBadHost.List(context.Background(), "") assert.Error(t, err) - expectedMetric += "storage_s3_call_error_by_type_total{error=\"lookup nosuchhost: no such host\",method=\"list\"} 1\n" + expectedMetric += "storage_s3_call_error_by_type_total{error=\"DNSError\",method=\"list\"} 1\n" err = testutil.CollectAndCompare(metricCallErrorsType, strings.NewReader(expectedMetric), "storage_s3_call_error_by_type_total") assert.NoError(t, err)