diff --git a/backends/s3/metrics.go b/backends/s3/metrics.go index 5ba24db..a935fa7 100644 --- a/backends/s3/metrics.go +++ b/backends/s3/metrics.go @@ -26,10 +26,27 @@ var ( }, []string{"method"}, ) + metricCallErrorsType = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "storage_s3_call_error_by_type_total", + Help: "S3 API call errors by method and error type", + }, + []string{"method", "error"}, + ) + metricCallHistogram = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "storage_s3_call_duration_seconds", + Help: "S3 API call duration by method", + Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60}, + }, + []string{"method"}, + ) ) func init() { prometheus.MustRegister(metricLastCallTimestamp) prometheus.MustRegister(metricCalls) prometheus.MustRegister(metricCallErrors) + prometheus.MustRegister(metricCallErrorsType) + prometheus.MustRegister(metricCallHistogram) } diff --git a/backends/s3/s3.go b/backends/s3/s3.go index c23550b..0a3bed8 100644 --- a/backends/s3/s3.go +++ b/backends/s3/s3.go @@ -236,9 +236,16 @@ func (b *Backend) List(ctx context.Context, prefix string) (blobList simpleblob. return blobs.WithPrefix(prefix), nil } +func recordMinioDurationMetric(method string, start time.Time) { + elapsed := time.Since(start) + metricCallHistogram.WithLabelValues(method).Observe(elapsed.Seconds()) +} + func (b *Backend) doList(ctx context.Context, prefix string) (simpleblob.BlobList, error) { var blobs simpleblob.BlobList + defer recordMinioDurationMetric("list", time.Now()) + // Runes to strip from blob names for GlobalPrefix // This is fine, because we can trust the API to only return with the prefix. // TODO: trust but verify @@ -252,6 +259,7 @@ func (b *Backend) doList(ctx context.Context, prefix string) (simpleblob.BlobLis // Handle error returned by MinIO client if err := convertMinioError(obj.Err, true); err != nil { metricCallErrors.WithLabelValues("list").Inc() + metricCallErrorsType.WithLabelValues("list", errorToMetricsLabel(err)).Inc() return nil, err } @@ -303,9 +311,12 @@ func (b *Backend) doLoadReader(ctx context.Context, name string) (io.ReadCloser, metricCalls.WithLabelValues("load").Inc() metricLastCallTimestamp.WithLabelValues("load").SetToCurrentTime() + defer recordMinioDurationMetric("load", time.Now()) + obj, err := b.client.GetObject(ctx, b.opt.Bucket, name, minio.GetObjectOptions{}) if err = convertMinioError(err, false); err != nil { metricCallErrors.WithLabelValues("load").Inc() + metricCallErrorsType.WithLabelValues("load", errorToMetricsLabel(err)).Inc() return nil, err } if obj == nil { @@ -314,6 +325,7 @@ func (b *Backend) doLoadReader(ctx context.Context, name string) (io.ReadCloser, info, err := obj.Stat() if err = convertMinioError(err, false); err != nil { metricCallErrors.WithLabelValues("load").Inc() + metricCallErrorsType.WithLabelValues("load", errorToMetricsLabel(err)).Inc() return nil, err } if info.Key == "" { @@ -347,6 +359,7 @@ func (b *Backend) doStore(ctx context.Context, name string, data []byte) (minio. func (b *Backend) doStoreReader(ctx context.Context, name string, r io.Reader, size int64) (minio.UploadInfo, error) { metricCalls.WithLabelValues("store").Inc() metricLastCallTimestamp.WithLabelValues("store").SetToCurrentTime() + defer recordMinioDurationMetric("store", time.Now()) putObjectOptions := minio.PutObjectOptions{ NumThreads: b.opt.NumMinioThreads, @@ -358,6 +371,7 @@ func (b *Backend) doStoreReader(ctx context.Context, name string, r io.Reader, s err = convertMinioError(err, false) if err != nil { metricCallErrors.WithLabelValues("store").Inc() + metricCallErrorsType.WithLabelValues("store", errorToMetricsLabel(err)).Inc() } return info, err } @@ -377,10 +391,12 @@ func (b *Backend) Delete(ctx context.Context, name string) error { func (b *Backend) doDelete(ctx context.Context, name string) error { metricCalls.WithLabelValues("delete").Inc() metricLastCallTimestamp.WithLabelValues("delete").SetToCurrentTime() + defer recordMinioDurationMetric("delete", time.Now()) err := b.client.RemoveObject(ctx, b.opt.Bucket, name, minio.RemoveObjectOptions{}) if err = convertMinioError(err, false); err != nil { metricCallErrors.WithLabelValues("delete").Inc() + metricCallErrorsType.WithLabelValues("delete", errorToMetricsLabel(err)).Inc() } return err } @@ -555,6 +571,40 @@ func convertMinioError(err error, isList bool) error { return err } +// errorToMetricsLabel converts an error into a prometheus label. +// If error is a NotExist error, "NotFound" is returned. +// If error is a timeout, "Timeout" is returned. +// If error is a DNS error, the DNS error is returned. +// If error is a URL error, the URL error is returned. +// If error is a MinIO error, the MinIO error code is returned. +// Otherwise "Unknown" is returned. +func errorToMetricsLabel(err error) string { + if err == nil { + return "ok" + } + if errors.Is(err, os.ErrNotExist) { + return "NotFound" + } + var netError *net.OpError + if errors.Is(err, context.DeadlineExceeded) || + (errors.As(err, &netError) && netError.Timeout()) { + return "Timeout" + } + var dnsErr *net.DNSError + if errors.As(err, &dnsErr) { + return "DNSError" + } + var urlErr *url.Error + if errors.As(err, &urlErr) { + return "URLError" + } + errRes := minio.ToErrorResponse(err) + if errRes.Code != "" { + return errRes.Code + } + return "Unknown" +} + func getOpt[T comparable](optVal, defaultVal T) T { var zero T if optVal == zero { diff --git a/backends/s3/s3_test.go b/backends/s3/s3_test.go index af0dfb4..f1104da 100644 --- a/backends/s3/s3_test.go +++ b/backends/s3/s3_test.go @@ -2,10 +2,12 @@ package s3 import ( "context" + "strings" "testing" "time" "github.com/minio/minio-go/v7" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/testcontainers/testcontainers-go" @@ -64,15 +66,49 @@ func getBackend(ctx context.Context, t *testing.T) (b *Backend) { return b } +func getBadBackend(ctx context.Context, url string, t *testing.T) (b *Backend) { + b, err := New(ctx, Options{ + EndpointURL: url, + AccessKey: "foo", + SecretKey: "bar", + Bucket: "test-bucket", + CreateBucket: false, + DialTimeout: 1 * time.Second, + }) + require.NoError(t, err) + return b +} + func TestBackend(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - b := getBackend(ctx, t) tester.DoBackendTests(t, b) assert.Len(t, b.lastMarker, 0) } +func TestMetrics(t *testing.T) { + bTimeout := getBadBackend(context.Background(), "http://1.2.3.4:1234", t) + + _, err := bTimeout.List(context.Background(), "") + assert.Error(t, err) + + expectedMetric := "# HELP storage_s3_call_error_by_type_total S3 API call errors by method and error type\n# TYPE storage_s3_call_error_by_type_total counter\nstorage_s3_call_error_by_type_total{error=\"Timeout\",method=\"list\"} 1\nstorage_s3_call_error_by_type_total{error=\"NotFound\",method=\"load\"} 3\n" + + err = testutil.CollectAndCompare(metricCallErrorsType, strings.NewReader(expectedMetric), "storage_s3_call_error_by_type_total") + assert.NoError(t, err) + + bBadHost := getBadBackend(context.Background(), "http://nosuchhost:1234", t) + + _, err = bBadHost.List(context.Background(), "") + assert.Error(t, err) + + expectedMetric += "storage_s3_call_error_by_type_total{error=\"DNSError\",method=\"list\"} 1\n" + + err = testutil.CollectAndCompare(metricCallErrorsType, strings.NewReader(expectedMetric), "storage_s3_call_error_by_type_total") + assert.NoError(t, err) +} + func TestBackend_marker(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() diff --git a/go.mod b/go.mod index 8cfff84..a93684d 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,7 @@ require ( github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/cpuid/v2 v2.2.11 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/magiconair/properties v1.8.10 // indirect github.com/minio/crc64nvme v1.0.2 // indirect diff --git a/go.sum b/go.sum index a5a69e4..c0b4e3d 100644 --- a/go.sum +++ b/go.sum @@ -74,6 +74,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE=