Skip to content

Commit

Permalink
fix inconsistent behaviors with prometheus when scraping (#5153)
Browse files Browse the repository at this point in the history
* fix inconsistent behaviors with prometheus when scraping

1. address #4959. skip job with wrong syntax in `scrape_configs` with error logs instead of exiting;
2. show error messages on vmagent /targets ui if there are wrong auth configs in `scrape_configs`, previously will print error logs and do scrape without auth header;
3. don't send requests if there are wrong auth configs in:
    1. vmagent remoteWrite;
    2. vmalert datasource/remoteRead/remoteWrite/notifier.

* add changelogs

* address review comments

* fix ut
  • Loading branch information
Haleygo committed Oct 17, 2023
1 parent 3e2f095 commit e16d3f5
Show file tree
Hide file tree
Showing 17 changed files with 293 additions and 156 deletions.
16 changes: 11 additions & 5 deletions app/vmagent/remotewrite/client.go
Expand Up @@ -323,26 +323,32 @@ func (c *client) runWorker() {
}

func (c *client) doRequest(url string, body []byte) (*http.Response, error) {
req := c.newRequest(url, body)
req, err := c.newRequest(url, body)
if err != nil {
return nil, err
}
resp, err := c.hc.Do(req)
if err != nil && errors.Is(err, io.EOF) {
// it is likely connection became stale.
// So we do one more attempt in hope request will succeed.
// If not, the error should be handled by the caller as usual.
// This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139
req = c.newRequest(url, body)
req, _ = c.newRequest(url, body)
resp, err = c.hc.Do(req)
}
return resp, err
}

func (c *client) newRequest(url string, body []byte) *http.Request {
func (c *client) newRequest(url string, body []byte) (*http.Request, error) {
reqBody := bytes.NewBuffer(body)
req, err := http.NewRequest(http.MethodPost, url, reqBody)
if err != nil {
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", url, err)
}
c.authCfg.SetHeaders(req, true)
err = c.authCfg.SetHeaders(req, true)
if err != nil {
return nil, err
}
h := req.Header
h.Set("User-Agent", "vmagent")
h.Set("Content-Type", "application/x-protobuf")
Expand All @@ -360,7 +366,7 @@ func (c *client) newRequest(url string, body []byte) *http.Request {
logger.Warnf("cannot sign remoteWrite request with AWS sigv4: %s", err)
}
}
return req
return req, nil
}

// sendBlockHTTP sends the given block to c.remoteWriteURL.
Expand Down
4 changes: 4 additions & 0 deletions app/vmalert/datasource/init.go
Expand Up @@ -111,6 +111,10 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
if err != nil {
return nil, fmt.Errorf("failed to configure auth: %w", err)
}
_, err = authCfg.GetAuthHeader()
if err != nil {
return nil, fmt.Errorf("failed to set request auth header to datasource %q: %s", *addr, err)
}

return &VMStorage{
c: &http.Client{Transport: tr},
Expand Down
41 changes: 28 additions & 13 deletions app/vmalert/datasource/vm.go
Expand Up @@ -137,12 +137,15 @@ func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Durati

// Query executes the given query and returns parsed response
func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) (Result, *http.Request, error) {
req := s.newQueryRequest(query, ts)
req, err := s.newQueryRequest(query, ts)
if err != nil {
return Result{}, nil, err
}
resp, err := s.do(ctx, req)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
// something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
req = s.newQueryRequest(query, ts)
req, _ = s.newQueryRequest(query, ts)
resp, err = s.do(ctx, req)
}
if err != nil {
Expand Down Expand Up @@ -173,12 +176,15 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
if end.IsZero() {
return res, fmt.Errorf("end param is missing")
}
req := s.newQueryRangeRequest(query, start, end)
req, err := s.newQueryRangeRequest(query, start, end)
if err != nil {
return Result{}, err
}
resp, err := s.do(ctx, req)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
// something in the middle between client and datasource might be closing
// the connection. So we do a one more attempt in hope request will succeed.
req = s.newQueryRangeRequest(query, start, end)
req, _ = s.newQueryRangeRequest(query, start, end)
resp, err = s.do(ctx, req)
}
if err != nil {
Expand Down Expand Up @@ -210,14 +216,20 @@ func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response,
return resp, nil
}

func (s *VMStorage) newQueryRangeRequest(query string, start, end time.Time) *http.Request {
req := s.newRequest()
func (s *VMStorage) newQueryRangeRequest(query string, start, end time.Time) (*http.Request, error) {
req, err := s.newRequest()
if err != nil {
return nil, fmt.Errorf("cannot create query_range request to datasource %q: %s", s.datasourceURL, err)
}
s.setPrometheusRangeReqParams(req, query, start, end)
return req
return req, nil
}

func (s *VMStorage) newQueryRequest(query string, ts time.Time) *http.Request {
req := s.newRequest()
func (s *VMStorage) newQueryRequest(query string, ts time.Time) (*http.Request, error) {
req, err := s.newRequest()
if err != nil {
return nil, fmt.Errorf("cannot create query request to datasource %q: %s", s.datasourceURL, err)
}
switch s.dataSourceType {
case "", datasourcePrometheus:
s.setPrometheusInstantReqParams(req, query, ts)
Expand All @@ -226,20 +238,23 @@ func (s *VMStorage) newQueryRequest(query string, ts time.Time) *http.Request {
default:
logger.Panicf("BUG: engine not found: %q", s.dataSourceType)
}
return req
return req, nil
}

func (s *VMStorage) newRequest() *http.Request {
func (s *VMStorage) newRequest() (*http.Request, error) {
req, err := http.NewRequest(http.MethodPost, s.datasourceURL, nil)
if err != nil {
logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", s.datasourceURL, err)
}
req.Header.Set("Content-Type", "application/json")
if s.authCfg != nil {
s.authCfg.SetHeaders(req, true)
err = s.authCfg.SetHeaders(req, true)
if err != nil {
return nil, err
}
}
for _, h := range s.extraHeaders {
req.Header.Set(h.key, h.value)
}
return req
return req, nil
}
10 changes: 8 additions & 2 deletions app/vmalert/datasource/vm_test.go
Expand Up @@ -637,7 +637,10 @@ func TestRequestParams(t *testing.T) {

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
req := tc.vm.newRequest()
req, err := tc.vm.newRequest()
if err != nil {
t.Fatal(err)
}
switch tc.vm.dataSourceType {
case "", datasourcePrometheus:
if tc.queryRange {
Expand Down Expand Up @@ -732,7 +735,10 @@ func TestHeaders(t *testing.T) {
for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
vm := tt.vmFn()
req := vm.newQueryRequest("foo", time.Now())
req, err := vm.newQueryRequest("foo", time.Now())
if err != nil {
t.Fatal(err)
}
tt.checkFn(t, req)
})
}
Expand Down
5 changes: 4 additions & 1 deletion app/vmalert/notifier/alertmanager.go
Expand Up @@ -88,7 +88,10 @@ func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[st
req = req.WithContext(ctx)

if am.authCfg != nil {
am.authCfg.SetHeaders(req, true)
err = am.authCfg.SetHeaders(req, true)
if err != nil {
return err
}
}
resp, err := am.client.Do(req)
if err != nil {
Expand Down
5 changes: 4 additions & 1 deletion app/vmalert/remotewrite/client.go
Expand Up @@ -280,7 +280,10 @@ func (c *Client) send(ctx context.Context, data []byte) error {
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

if c.authCfg != nil {
c.authCfg.SetHeaders(req, true)
err = c.authCfg.SetHeaders(req, true)
if err != nil {
return &nonRetriableError{err: err}
}
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
Expand Down
4 changes: 3 additions & 1 deletion docs/CHANGELOG.md
Expand Up @@ -38,6 +38,7 @@ The sandbox cluster installation is running under the constant load generated by
This also means that `datasource.queryTimeAlignment` command-line flag becomes deprecated now and will have no effect if configured. If `datasource.queryTimeAlignment` was set to `false` before, then `eval_alignment` has to be set to `false` explicitly under group.
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support data ingestion from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent), [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): skip job with error logs if there is incorrect syntax under `scrape_configs`, previously will exit. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): add `-filestream.disableFadvise` command-line flag, which can be used for disabling `fadvise` syscall during backup upload to the remote storage. By default `vmbackup` uses `fadvise` syscall in order to prevent from eviction of recently accessed data from the [OS page cache](https://en.wikipedia.org/wiki/Page_cache) when backing up large files. Sometimes the `fadvise` syscall may take significant amounts of CPU when the backup is performed with large value of `-concurrency` command-line flag on systems with big number of CPU cores. In this case it is better to manually disable `fadvise` syscall by passing `-filestream.disableFadvise` command-line flag to `vmbackup`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5120) for details.
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): add `-deleteAllObjectVersions` command-line flag, which can be used for forcing removal of all object versions in remote object storage. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5121) issue and [these docs](https://docs.victoriametrics.com/vmbackup.html#permanent-deletion-of-objects-in-s3-compatible-storages) for the details.
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): account for `vmauth` component for alerts `ServiceDown` and `TooManyRestarts`.
Expand All @@ -50,14 +51,15 @@ The sandbox cluster installation is running under the constant load generated by

* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): don't send requests if there is wrong auth config in `datasource`, `remoteWrite`, `remoteRead` and `notifier` section, previously will send requests without auth header. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087) for details.
* BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
* BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.
* BUGFIX: dashboards/vmalert: properly apply time series filter for panel `No data errors`. Before, the panel didn't respect `job` or `instance` filters.
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): bump hard-coded limit for search query size at `vmstorage` from 1MB to 5MB. The change should be more suitable for real-world scenarios and protect vmstorage from excessive memory usage. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5154) for details
* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): fix error when creating an incremental backup with the `-origin` command-line flag. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5144) for details.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fix vmagent ignoring configuration reload for streaming aggregation if it was started with empty streaming aggregation config. Thanks to @aluode99 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5178).

* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): don't send requests if there is wrong auth config in `scrape_configs` and `remoteWrite` section, previously will send requests without auth header. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).

## [v1.94.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.94.0)

Expand Down

0 comments on commit e16d3f5

Please sign in to comment.