From 817fe7ef66617d9d3b2eb613820c6ce8e30d32af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20=C5=9Awi=C4=85tek?=
Date: Fri, 18 Aug 2023 12:17:12 +0200
Subject: [PATCH] feat(metrics/collector): adjust resources and autoscaling

This is based on internal dogfooding. The collector is, in general,
heavily memory-bound, much like Prometheus.
---
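Reviewer note (not part of the commit message): the new defaults drop the
replica count to 1, lower the CPU request to 100m, raise the memory limit
to 2Gi, and set both HPA utilization targets to 70%. As a minimal sketch
(the numbers are taken from this diff, but the override file itself is
hypothetical), anyone preferring the previous fixed-size deployment can
restore the old defaults from their own values file:

    sumologic:
      metrics:
        collector:
          otelcol:
            replicaCount: 3        # default before this patch, now 1
            resources:
              requests:
                cpu: 500m          # now 100m
                memory: 768Mi      # unchanged
              limits:
                cpu: 1000m         # unchanged
                memory: 1Gi        # now 2Gi
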
 .changelog/3219.changed.txt                  |  1 +
 .changelog/3221.added.txt                    |  1 +
 deploy/helm/sumologic/README.md              |  6 +++---
 .../remote-write-proxy.conf                  | 19 +++++++++++++++++--
 .../otelcol/opentelemetrycollector.yaml      |  5 ++++-
 .../remote-write-proxy/deployment.yaml       |  1 +
 .../metrics/remote-write-proxy/service.yaml  |  5 ++++-
 deploy/helm/sumologic/values.yaml            | 10 +++++-----
 .../metrics_collector_otc/basic.output.yaml  | 11 +++++++----
 .../metrics_collector_otc/custom.output.yaml |  7 +++++--
 .../remote_write_proxy/basic.output.yaml     |  1 +
 .../full_config.output.yaml                  |  1 +
 .../full_configmap.output.yaml               | 16 ++++++++++++++--
 tests/integration/internal/constants.go      |  1 +
 .../values/values_helm_ot_metrics.yaml       |  2 +-
 15 files changed, 66 insertions(+), 21 deletions(-)
 create mode 100644 .changelog/3219.changed.txt
 create mode 100644 .changelog/3221.added.txt

diff --git a/.changelog/3219.changed.txt b/.changelog/3219.changed.txt
new file mode 100644
index 0000000000..836ba24fba
--- /dev/null
+++ b/.changelog/3219.changed.txt
@@ -0,0 +1 @@
+feat(metrics/collector): adjust resources and autoscaling
\ No newline at end of file
diff --git a/.changelog/3221.added.txt b/.changelog/3221.added.txt
new file mode 100644
index 0000000000..20a8eb0a93
--- /dev/null
+++ b/.changelog/3221.added.txt
@@ -0,0 +1 @@
+feat(metrics/collector): support remote write proxy
\ No newline at end of file
diff --git a/deploy/helm/sumologic/README.md b/deploy/helm/sumologic/README.md
index f94d3c13c9..72baf5d386 100644
--- a/deploy/helm/sumologic/README.md
+++ b/deploy/helm/sumologic/README.md
@@ -117,13 +117,13 @@ The following table lists the configurable parameters of the Sumo Logic chart an
 | `sumologic.metrics.serviceMonitors` | Configuration of Sumo Logic Kubernetes Collection components serviceMonitors | See [values.yaml] |
 | `sumologic.metrics.collector.otelcol.enabled` | Enable experimental otelcol metrics collector | See [values.yaml] |
 | `sumologic.metrics.collector.otelcol.scrapeInterval` | The default scrape interval for the collector. | `30s` |
-| `sumologic.metrics.collector.otelcol.replicaCount` | Replica count for the experimental otelcol metrics collector | `3` |
+| `sumologic.metrics.collector.otelcol.replicaCount` | Replica count for the experimental otelcol metrics collector | `1` |
 | `sumologic.metrics.collector.otelcol.resources` | Resource requests and limits for the experimental otelcol metrics collector | See [values.yaml] |
 | `sumologic.metrics.collector.otelcol.autoscaling.enabled` | Option to turn autoscaling on for the experimental otelcol metrics collector and specify params for HPA. Autoscaling needs metrics-server to access cpu metrics. | `false` |
 | `sumologic.metrics.collector.otelcol.autoscaling.maxReplicas` | Default max replicas for autoscaling of the collector. | `10` |
 | `sumologic.metrics.collector.otelcol.autoscaling.minReplicas` | Default min replicas for autoscaling of the collector. | `3` |
-| `sumologic.metrics.collector.otelcol.autoscaling.targetCPUUtilizationPercentage` | The desired target CPU utilization for autoscaling. | `100` |
-| `sumologic.metrics.collector.otelcol.autoscaling.targetMemoryUtilizationPercentage` | The desired target memory utilization for autoscaling. | `50` |
+| `sumologic.metrics.collector.otelcol.autoscaling.targetCPUUtilizationPercentage` | The desired target CPU utilization for autoscaling. | `70` |
+| `sumologic.metrics.collector.otelcol.autoscaling.targetMemoryUtilizationPercentage` | The desired target memory utilization for autoscaling. | `70` |
 | `sumologic.metrics.collector.otelcol.serviceMonitorSelector` | Selector for ServiceMonitors used for target discovery. By default, we select ServiceMonitors created by the Chart. See: https://github.com/open-telemetry/opentelemetry-operator/blob/main/docs/api.md#opentelemetrycollectorspectargetallocatorprometheuscr | `Nil` |
 | `sumologic.metrics.collector.otelcol.podMonitorSelector` | Selector for PodMonitors used for target discovery. By default, we select PodMonitors created by the Chart. See: https://github.com/open-telemetry/opentelemetry-operator/blob/main/docs/api.md#opentelemetrycollectorspectargetallocatorprometheuscr | `Nil` |
 | `sumologic.metrics.collector.otelcol.nodeSelector` | Node selector for the experimental otelcol metrics. [See docs/best-practices.md for more information.](/docs/best-practices.md). | `{}` |
diff --git a/deploy/helm/sumologic/conf/metrics/remote-write-proxy/remote-write-proxy.conf b/deploy/helm/sumologic/conf/metrics/remote-write-proxy/remote-write-proxy.conf
index 7101f01a27..2e0a82ab83 100644
--- a/deploy/helm/sumologic/conf/metrics/remote-write-proxy/remote-write-proxy.conf
+++ b/deploy/helm/sumologic/conf/metrics/remote-write-proxy/remote-write-proxy.conf
@@ -1,7 +1,11 @@
-upstream remote {
+upstream remote_prometheus {
   server {{ template "sumologic.metadata.name.metrics.service" . }}:9888;
 }
 
+upstream remote_otel {
+  server {{ template "sumologic.metadata.name.metrics.service" . }}:4318;
+}
+
 server {
   listen {{ .Values.sumologic.metrics.remoteWriteProxy.config.port }} default_server;
 {{- if not .Values.sumologic.metrics.remoteWriteProxy.config.enableAccessLogs }}
@@ -9,6 +13,17 @@ server {
 {{- end }}
   location / {
     client_body_buffer_size {{ .Values.sumologic.metrics.remoteWriteProxy.config.clientBodyBufferSize }};
-    proxy_pass http://remote;
+    proxy_pass http://remote_prometheus;
+  }
+}
+
+server {
+  listen 4318 default_server;
+{{- if not .Values.sumologic.metrics.remoteWriteProxy.config.enableAccessLogs }}
+  access_log off;
+{{- end }}
+  location / {
+    client_body_buffer_size {{ .Values.sumologic.metrics.remoteWriteProxy.config.clientBodyBufferSize }};
+    proxy_pass http://remote_otel;
   }
 }
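Reviewer note: the template above now renders two nginx server blocks, the
existing listener for Prometheus remote write plus a new one on 4318 that
forwards OTLP/HTTP to the metadata service. A minimal sketch of the values
this template consumes (key paths as referenced in the template; the
concrete numbers are the defaults visible in the golden files further
down):

    sumologic:
      metrics:
        remoteWriteProxy:
          enabled: true              # routes metrics traffic through the proxy
          config:
            port: 8080               # Prometheus listener; the OTLP listener is fixed at 4318
            clientBodyBufferSize: 32k
            enableAccessLogs: false  # when false, access_log is disabled in both server blocks
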
diff --git a/deploy/helm/sumologic/templates/metrics/collector/otelcol/opentelemetrycollector.yaml b/deploy/helm/sumologic/templates/metrics/collector/otelcol/opentelemetrycollector.yaml
index 97ab4064e9..d06cb24788 100644
--- a/deploy/helm/sumologic/templates/metrics/collector/otelcol/opentelemetrycollector.yaml
+++ b/deploy/helm/sumologic/templates/metrics/collector/otelcol/opentelemetrycollector.yaml
@@ -59,7 +59,10 @@ spec:
 {{- end }}
   env:
     - name: METADATA_METRICS_SVC
-      value: {{ template "sumologic.metadata.name.metrics.service" . }} # no need for remote write proxy here
+      valueFrom:
+        configMapKeyRef:
+          name: sumologic-configmap
+          key: metadataMetrics
     - name: NAMESPACE
       valueFrom:
        fieldRef:
diff --git a/deploy/helm/sumologic/templates/metrics/remote-write-proxy/deployment.yaml b/deploy/helm/sumologic/templates/metrics/remote-write-proxy/deployment.yaml
index 20e77638ad..31414f9502 100644
--- a/deploy/helm/sumologic/templates/metrics/remote-write-proxy/deployment.yaml
+++ b/deploy/helm/sumologic/templates/metrics/remote-write-proxy/deployment.yaml
@@ -59,6 +59,7 @@ spec:
         imagePullPolicy: {{ .Values.sumologic.metrics.remoteWriteProxy.image.pullPolicy }}
         ports:
         - containerPort: {{ .Values.sumologic.metrics.remoteWriteProxy.config.port }}
+        - containerPort: 4318
         resources:
 {{- toYaml .Values.sumologic.metrics.remoteWriteProxy.resources | nindent 10 }}
         livenessProbe:
diff --git a/deploy/helm/sumologic/templates/metrics/remote-write-proxy/service.yaml b/deploy/helm/sumologic/templates/metrics/remote-write-proxy/service.yaml
index e13e800051..44892ed820 100644
--- a/deploy/helm/sumologic/templates/metrics/remote-write-proxy/service.yaml
+++ b/deploy/helm/sumologic/templates/metrics/remote-write-proxy/service.yaml
@@ -9,9 +9,12 @@ metadata:
     {{- include "sumologic.labels.common" . | nindent 4 }}
 spec:
   ports:
-  - name: http
+  - name: prometheus
     port: 9888
     targetPort: {{ .Values.sumologic.metrics.remoteWriteProxy.config.port }}
+  - name: otel
+    port: 4318
+    targetPort: 4318
   selector:
     app: {{ template "sumologic.labels.app.remoteWriteProxy.pod" . }}
 {{- end }}
diff --git a/deploy/helm/sumologic/values.yaml b/deploy/helm/sumologic/values.yaml
index ab4a6b8bb9..0c6b5f2654 100644
--- a/deploy/helm/sumologic/values.yaml
+++ b/deploy/helm/sumologic/values.yaml
@@ -474,8 +474,8 @@ sumologic:
           enabled: false
           minReplicas: 3
           maxReplicas: 10
-          targetCPUUtilizationPercentage: 100
-          # targetMemoryUtilizationPercentage: 50
+          targetCPUUtilizationPercentage: 70
+          targetMemoryUtilizationPercentage: 70
 
         nodeSelector: {}
 
@@ -487,15 +487,15 @@ sumologic:
         ## Option to define priorityClassName to assign a priority class to pods.
         priorityClassName:
 
-        replicaCount: 3
+        replicaCount: 1
 
         resources:
           limits:
-            memory: 1Gi
+            memory: 2Gi
             cpu: 1000m
           requests:
             memory: 768Mi
-            cpu: 500m
+            cpu: 100m
 
         ## Selector for ServiceMonitors used for target discovery. By default, this selects resources created by this Chart.
         ## See https://github.com/open-telemetry/opentelemetry-operator/blob/main/docs/api.md#opentelemetrycollectorspectargetallocatorprometheuscr
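Reviewer note: when sumologic.metrics.collector.otelcol.autoscaling.enabled
is set to true, these values feed the autoscaler section of the
OpenTelemetryCollector resource managed by the opentelemetry-operator.
Roughly like the sketch below; the field names follow the custom golden
file further down, and the exact nesting belongs to the operator's API,
not to this chart:

    spec:
      autoscaler:
        minReplicas: 3
        maxReplicas: 10
        targetCPUUtilization: 70     # percent, from targetCPUUtilizationPercentage
        targetMemoryUtilization: 70  # percent, from targetMemoryUtilizationPercentage

Targeting memory as well as CPU matches the commit message: a memory-bound
collector scales poorly on CPU utilization alone.
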
diff --git a/tests/helm/testdata/goldenfile/metrics_collector_otc/basic.output.yaml b/tests/helm/testdata/goldenfile/metrics_collector_otc/basic.output.yaml
index ca5b851cb6..471f2759fa 100644
--- a/tests/helm/testdata/goldenfile/metrics_collector_otc/basic.output.yaml
+++ b/tests/helm/testdata/goldenfile/metrics_collector_otc/basic.output.yaml
@@ -14,7 +14,7 @@ metadata:
     sumologic.com/scrape: "true"
 spec:
   mode: statefulset
-  replicas: 3
+  replicas: 1
   serviceAccount: RELEASE-NAME-sumologic-metrics
   targetAllocator:
     serviceAccount: RELEASE-NAME-sumologic-metrics-targetallocator
@@ -29,7 +29,10 @@ spec:
         release: RELEASE-NAME
   env:
     - name: METADATA_METRICS_SVC
-      value: RELEASE-NAME-sumologic-metadata-metrics # no need for remote write proxy here
+      valueFrom:
+        configMapKeyRef:
+          name: sumologic-configmap
+          key: metadataMetrics
     - name: NAMESPACE
       valueFrom:
         fieldRef:
@@ -45,9 +48,9 @@ spec:
   resources:
     limits:
       cpu: 1000m
-      memory: 1Gi
+      memory: 2Gi
     requests:
-      cpu: 500m
+      cpu: 100m
       memory: 768Mi
   volumes:
     - name: tmp
diff --git a/tests/helm/testdata/goldenfile/metrics_collector_otc/custom.output.yaml b/tests/helm/testdata/goldenfile/metrics_collector_otc/custom.output.yaml
index c2f8223466..9f98bf8524 100644
--- a/tests/helm/testdata/goldenfile/metrics_collector_otc/custom.output.yaml
+++ b/tests/helm/testdata/goldenfile/metrics_collector_otc/custom.output.yaml
@@ -18,7 +18,7 @@ metadata:
     podKey: podValue
 spec:
   mode: statefulset
-  replicas: 3
+  replicas: 1
   serviceAccount: RELEASE-NAME-sumologic-metrics
   targetAllocator:
     serviceAccount: RELEASE-NAME-sumologic-metrics-targetallocator
@@ -45,7 +45,10 @@ spec:
     targetMemoryUtilization: 90
   env:
     - name: METADATA_METRICS_SVC
-      value: RELEASE-NAME-sumologic-metadata-metrics # no need for remote write proxy here
+      valueFrom:
+        configMapKeyRef:
+          name: sumologic-configmap
+          key: metadataMetrics
     - name: NAMESPACE
       valueFrom:
         fieldRef:
diff --git a/tests/helm/testdata/goldenfile/remote_write_proxy/basic.output.yaml b/tests/helm/testdata/goldenfile/remote_write_proxy/basic.output.yaml
index 45fe80322d..59f36fae93 100644
--- a/tests/helm/testdata/goldenfile/remote_write_proxy/basic.output.yaml
+++ b/tests/helm/testdata/goldenfile/remote_write_proxy/basic.output.yaml
@@ -32,6 +32,7 @@ spec:
         imagePullPolicy: IfNotPresent
         ports:
         - containerPort: 8080
+        - containerPort: 4318
         resources:
           limits:
             cpu: 1000m
diff --git a/tests/helm/testdata/goldenfile/remote_write_proxy/full_config.output.yaml b/tests/helm/testdata/goldenfile/remote_write_proxy/full_config.output.yaml
index bc6ad7b4ad..c60f4dce1f 100644
--- a/tests/helm/testdata/goldenfile/remote_write_proxy/full_config.output.yaml
+++ b/tests/helm/testdata/goldenfile/remote_write_proxy/full_config.output.yaml
@@ -65,6 +65,7 @@ spec:
         imagePullPolicy: Always
         ports:
         - containerPort: 80
+        - containerPort: 4318
         resources:
           limits:
             cpu: 400m
diff --git a/tests/helm/testdata/goldenfile/remote_write_proxy/full_configmap.output.yaml b/tests/helm/testdata/goldenfile/remote_write_proxy/full_configmap.output.yaml
index b965c349b3..2d40104b16 100644
--- a/tests/helm/testdata/goldenfile/remote_write_proxy/full_configmap.output.yaml
+++ b/tests/helm/testdata/goldenfile/remote_write_proxy/full_configmap.output.yaml
@@ -12,14 +12,26 @@ metadata:
     heritage: "Helm"
 data:
   remote-write-proxy.conf: |
-    upstream remote {
+    upstream remote_prometheus {
       server RELEASE-NAME-sumologic-metadata-metrics:9888;
     }
 
+    upstream remote_otel {
+      server RELEASE-NAME-sumologic-metadata-metrics:4318;
+    }
+
     server {
       listen 80 default_server;
       location / {
         client_body_buffer_size 32k;
-        proxy_pass http://remote;
+        proxy_pass http://remote_prometheus;
+      }
+    }
+
+    server {
+      listen 4318 default_server;
+      location / {
+        client_body_buffer_size 32k;
+        proxy_pass http://remote_otel;
       }
     }
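Reviewer note: putting the service.yaml template and these golden files
together, the rendered proxy Service exposes both listeners roughly as
follows (a sketch; the release-prefixed name is hypothetical, the
targetPort for the prometheus port comes from
.Values.sumologic.metrics.remoteWriteProxy.config.port):

    apiVersion: v1
    kind: Service
    metadata:
      name: RELEASE-NAME-sumologic-remote-write-proxy  # hypothetical name
    spec:
      ports:
        - name: prometheus  # Prometheus remote write
          port: 9888
          targetPort: 8080  # config.port, 8080 in the basic golden file
        - name: otel        # OTLP/HTTP
          port: 4318
          targetPort: 4318
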
diff --git a/tests/integration/internal/constants.go b/tests/integration/internal/constants.go
index dc38b146f4..741ee009de 100644
--- a/tests/integration/internal/constants.go
+++ b/tests/integration/internal/constants.go
@@ -364,6 +364,7 @@ var (
   "otelcol_otelsvc_k8s_ip_lookup_miss",
   "otelcol_otelsvc_k8s_other_deleted",
   "kube_pod_container_status_waiting_reason",
+  "kube_pod_container_status_terminated_reason",
   // TODO: check different metrics depending on K8s version
   // scheduler_scheduling_duration_seconds is present for K8s <1.23
   // scheduler_scheduling_attempt_duration_seconds is present for K8s >=1.23
diff --git a/tests/integration/values/values_helm_ot_metrics.yaml b/tests/integration/values/values_helm_ot_metrics.yaml
index fb4306e098..a64419bb90 100644
--- a/tests/integration/values/values_helm_ot_metrics.yaml
+++ b/tests/integration/values/values_helm_ot_metrics.yaml
@@ -8,7 +8,7 @@ sumologic:
   metrics:
     enabled: true
     remoteWriteProxy:
-      enabled: false
+      enabled: true
     collector:
       otelcol:
         enabled: true
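Closing reviewer note: the collector no longer hardcodes the metadata
Service name; METADATA_METRICS_SVC is now read from the sumologic-configmap
key metadataMetrics, so the chart can repoint metrics traffic without
re-templating the collector. A sketch of the shape of that ConfigMap (the
name and key are from this diff; the value shown is the direct metadata
Service as seen in the golden files, and pointing it at the proxy when
remoteWriteProxy.enabled=true is an assumption, since that wiring lives
outside this patch):

    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: sumologic-configmap
    data:
      # assumption: the proxy address instead when the remote write proxy is enabled
      metadataMetrics: RELEASE-NAME-sumologic-metadata-metrics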