Skip to content

Commit

Permalink
feat(metrics): add experimental otel metrics collector
Browse files Browse the repository at this point in the history
  • Loading branch information
swiatekm-sumo committed Apr 15, 2023
1 parent f3a6b1b commit 66798b4
Show file tree
Hide file tree
Showing 25 changed files with 584 additions and 19 deletions.
3 changes: 3 additions & 0 deletions deploy/helm/sumologic/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ The following table lists the configurable parameters of the Sumo Logic chart an
| `sumologic.metrics.remoteWriteProxy.podAnnotations` | Additional annotations for the remote write proxy container. | `{}` |
| `sumologic.metrics.remoteWriteProxy.config.port` | Port on which remote write proxy is going to be exposed | `8080` |
| `sumologic.metrics.serviceMonitors` | Configuration of Sumo Logic Kubernetes Collection components serviceMonitors | See [values.yaml] |
| `sumologic.metrics.collector.otelcol.enabled` | Enable experimental otelcol metrics collector | See [values.yaml] |
| `sumologic.traces.enabled` | Set the enabled flag to true to enable tracing ingestion. _Tracing must be enabled for the account first. Please contact your Sumo representative for activation details_ | `true` |
| `sumologic.traces.spans_per_request` | Maximum number of spans sent in single batch | `100` |
| `sumologic.envFromSecret` | If enabled, accessId and accessKey will be sourced from Secret Name given. Be sure to include at least the following env variables in your secret (1) SUMOLOGIC_ACCESSID, (2) SUMOLOGIC_ACCESSKEY | `sumo-api-secret` |
Expand Down Expand Up @@ -396,6 +397,8 @@ The following table lists the configurable parameters of the Sumo Logic chart an
| `opentelemetry-operator.instrumentation.java.traces.enabled` | Flag to control traces export from Java instrumentation in `Instrumentation` resource. | `true` |
| `opentelemetry-operator.instrumentation.python.metrics.enabled` | Flag to control metrics export from Python instrumentation in `Instrumentation` resource. | `true` |
| `opentelemetry-operator.instrumentation.python.traces.enabled` | Flag to control traces export from Python instrumentation in `Instrumentation` resource. | `true` |
| `opentelemetry-operator.manager.collectorImage.repository` | The default collector image repository for OpenTelemetryCollector CRDs. | `public.ecr.aws/sumologic/sumologic-otel-collector` |
| `opentelemetry-operator.manager.collectorImage.tag` | The default collector image tag for OpenTelemetryCollector CRDs. | `0.73.0-sumo-1` |
| `opentelemetry-operator.manager.resources.limits.cpu` | Used to set limit CPU for OpenTelemetry-Operator Manager. | `250m` |
| `opentelemetry-operator.manager.resources.limits.memory` | Used to set limit Memory for OpenTelemetry-Operator Manager. | `512Mi` |
| `opentelemetry-operator.manager.resources.requests.cpu` | Used to set requested CPU for OpenTelemetry-Operator Manager. | `150m` |
Expand Down
123 changes: 123 additions & 0 deletions deploy/helm/sumologic/conf/metrics/collector/otelcol/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
## Prometheus receiver for the metrics collector. Two jobs are statically
## configured here (kubelet and cadvisor); the target_allocator section at the
## bottom points the receiver at the target allocator service.
receivers:
  prometheus:
    config:
      global:
        scrape_interval: 30s
      scrape_configs:
        ## These scrape configs are for kubelet metrics
        ## Prometheus operator does this by manually maintaining a Service with Endpoints for all Nodes
        ## We don't have that capability, so we need to use a static configuration
        - job_name: kubelet
          scheme: https
          authorization:
            credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            # NOTE(review): certificate verification is skipped even though a CA file is set - confirm this is intended
            insecure_skip_verify: true
          honor_labels: true
          kubernetes_sd_configs:
            - role: node
          metric_relabel_configs:
            ## Keep only the kubelet series listed in the regex; everything else is dropped
            - action: keep
              regex: (?:kubelet_docker_operations_errors(?:|_total)|kubelet_(?:docker|runtime)_operations_duration_seconds_(?:count|sum)|kubelet_running_(?:container|pod)(?:_count|s)|kubelet_(?:docker|runtime)_operations_latency_microseconds(?:|_count|_sum))
              source_labels: [__name__]
            # TODO: The below can't be used due to a bug in target allocator
            # - action: labeldrop
            #   regex: id
          ## The anchor below is reused by the cadvisor job via *relabel_configs
          relabel_configs: &relabel_configs # partially copied from what operator generates
            - source_labels:
                - __meta_kubernetes_node_name
              target_label: node
            - source_labels:
                - __meta_kubernetes_namespace
              target_label: namespace
            - source_labels:
                - __meta_kubernetes_pod_name
              target_label: pod
            - source_labels:
                - __meta_kubernetes_pod_container_name
              target_label: container
            - target_label: endpoint
              replacement: https-metrics
            - source_labels:
                - __metrics_path__
              target_label: metrics_path
              action: replace
            - source_labels:
                - __address__
              target_label: instance
              action: replace
        - job_name: cadvisor
          scheme: https
          authorization:
            credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            # NOTE(review): certificate verification is skipped even though a CA file is set - confirm this is intended
            insecure_skip_verify: true
          honor_labels: true
          metrics_path: /metrics/cadvisor
          kubernetes_sd_configs:
            - role: node
          metric_relabel_configs:
            ## Rewrite the job label to "kubelet" on every series scraped by this job
            - action: replace
              regex: .*
              replacement: kubelet
              source_labels: [__name__]
              target_label: job
            - action: keep
              regex: (?:container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_seconds_total|container_network_receive_bytes_total|container_network_transmit_bytes_total)
              source_labels: [__name__]
            ## Drop container metrics with container tag set to an empty string:
            ## these are the pod aggregated container metrics which can be aggregated
            ## in Sumo anyway. There's also some cgroup-specific time series we also
            ## do not need.
            - action: drop
              source_labels: [__name__, container]
              regex: (?:container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_fs_usage_bytes|container_fs_limit_bytes);$
            - action: labelmap
              regex: container_name
              replacement: container
            - action: drop
              source_labels: [container]
              regex: POD
            # TODO: The below can't be used due to a bug in target allocator
            # - action: labeldrop
            #   regex: (id|name)
          relabel_configs: *relabel_configs # partially copied from what operator generates
    target_allocator:
      endpoint: http://{{ template "sumologic.metadata.name.metrics.targetallocator.name" . }}
      interval: 30s
      collector_id: ${POD_NAME}
## Collector extensions. file_storage is only rendered when metadata
## persistence is enabled in the chart values.
extensions:
  health_check: {}
{{ if .Values.metadata.persistence.enabled }}
  ## Configuration for File Storage extension
  ## ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/release/v0.37.x/extension/storage/filestorage
  file_storage:
    directory: /var/lib/storage/otc
    timeout: 10s
    compaction:
      on_rebound: true
      directory: /tmp
{{ end }}
  pprof: {}
## Exporters and processors are rendered from the shared otelcol metrics
## config files and nested two spaces under their keys.
exporters:
{{- tpl (.Files.Get "conf/metrics/otelcol/exporters.yaml") . | nindent 2 }}
processors:
{{- tpl (.Files.Get "conf/metrics/otelcol/processors.yaml") . | nindent 2 }}
## Service section: collector self-telemetry, enabled extensions and the
## metrics pipeline (rendered from the shared pipeline.yaml).
service:
  telemetry:
    logs:
      level: {{ .Values.metadata.metrics.logLevel }}
    metrics:
      address: 0.0.0.0:8888 # this is the default, but setting it explicitly lets the operator add it automatically
  extensions:
    - health_check
{{ if .Values.metadata.persistence.enabled }}
    ## Only enabled together with the file_storage extension definition
    - file_storage
{{ end }}
    - pprof
  pipelines:
    metrics:
{{ tpl (.Files.Get "conf/metrics/otelcol/pipeline.yaml") . | indent 6 }}

5 changes: 5 additions & 0 deletions deploy/helm/sumologic/conf/metrics/otelcol/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ exporters:
processors:
- memory_limiter
- metricstransform
- groupbyattrs
- resource
- k8s_tagger
- source
Expand All @@ -24,4 +25,8 @@ processors:
- batch
- routing
## Use the prometheus receiver when the otelcol metrics collector is enabled,
## otherwise fall back to telegraf.
receivers:
{{- if .Values.sumologic.metrics.collector.otelcol.enabled }}
- prometheus
{{- else }}
- telegraf
{{- end }}
13 changes: 13 additions & 0 deletions deploy/helm/sumologic/conf/metrics/otelcol/processors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ batch:
## Time duration after which a batch will be sent regardless of size
timeout: 1s

## The Prometheus receiver puts all labels in record-level attributes, but we
## need the ones listed below at the resource level, so group by them.
groupbyattrs:
  keys:
    - container
    - namespace
    - pod
    - service

## The Kubernetes processor automatically tags logs, metrics and traces with Kubernetes metadata like pod name, namespace name etc.
## ref: https://github.com/SumoLogic/sumologic-otel-collector/tree/main/pkg/processor/k8sprocessor
k8s_tagger:
Expand Down Expand Up @@ -79,6 +87,11 @@ resource:
key: prometheus_service
- action: delete
key: service
- action: upsert
from_attribute: service.name
key: job
- action: delete # we don't want service.name to be set, as the schema processor translates it to "service"
key: service.name
- action: upsert
key: _origin # add "_origin" metadata to metrics to keep the same format as for metrics from Fluentd
value: kubernetes
Expand Down
57 changes: 57 additions & 0 deletions deploy/helm/sumologic/templates/_helpers/_metrics.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,63 @@ sumologic.com/scrape: "true"
{{- template "sumologic.labels.app.pvcCleaner" . }}-metrics
{{- end -}}

{{/*
Definitions for metrics collector

The label helpers below all resolve to "<fullname>-metrics"; the pod and
opentelemetrycollector variants delegate to the base collector label.
*/}}

{{- define "sumologic.labels.app.metrics.collector" -}}
{{- printf "%s-metrics" (include "sumologic.fullname" .) -}}
{{- end -}}

{{- define "sumologic.labels.app.metrics.collector.pod" -}}
{{- include "sumologic.labels.app.metrics.collector" . -}}
{{- end -}}

{{- define "sumologic.labels.app.metrics.collector.opentelemetrycollector" -}}
{{- include "sumologic.labels.app.metrics.collector" . -}}
{{- end -}}

{{/*
Resource names for the metrics collector. Everything is derived from the base
name "<fullname>-metrics"; the two ClusterRoleBinding names add a suffix.
*/}}
{{- define "sumologic.metadata.name.metrics.collector" -}}
{{- printf "%s-metrics" (include "sumologic.fullname" .) -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.collector.opentelemetrycollector" -}}
{{- include "sumologic.metadata.name.metrics.collector" . -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.collector.serviceaccount" -}}
{{- include "sumologic.metadata.name.metrics.collector" . -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.collector.clusterrole" -}}
{{- include "sumologic.metadata.name.metrics.collector" . -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.collector.clusterrolebinding.prometheus" -}}
{{- printf "%s-prometheus" (include "sumologic.metadata.name.metrics.collector" .) -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.collector.clusterrolebinding.metadata" -}}
{{- printf "%s-metadata" (include "sumologic.metadata.name.metrics.collector" .) -}}
{{- end -}}

{{/*
Resource names for the target allocator: the OpenTelemetryCollector name with
a "-targetallocator" suffix, shared by its ServiceAccount, ClusterRole and
ClusterRoleBinding helpers.
*/}}
{{- define "sumologic.metadata.name.metrics.targetallocator.name" -}}
{{- printf "%s-targetallocator" (include "sumologic.metadata.name.metrics.collector.opentelemetrycollector" .) -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.targetallocator.serviceaccount" -}}
{{- include "sumologic.metadata.name.metrics.targetallocator.name" . -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.targetallocator.clusterrole" -}}
{{- include "sumologic.metadata.name.metrics.targetallocator.name" . -}}
{{- end -}}

{{- define "sumologic.metadata.name.metrics.targetallocator.clusterrolebinding" -}}
{{- include "sumologic.metadata.name.metrics.targetallocator.name" . -}}
{{- end -}}


{{/*
Generate metrics match configuration
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{{ if and (eq (include "metrics.otelcol.enabled" .) "true") .Values.sumologic.metrics.collector.otelcol.enabled }}
# Read-only ClusterRole for the metrics collector. Only rendered when both
# otelcol metrics and the otelcol metrics collector are enabled.
# NOTE(review): presumably consumed by the Prometheus receiver's Kubernetes
# service discovery and kubelet/cadvisor scrapes - confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ template "sumologic.metadata.name.metrics.collector.clusterrole" . }}
rules:
  # Core API group objects
  - apiGroups: [""]
    resources:
      - pods
      - nodes
      - nodes/metrics
      - services
      - endpoints
    verbs:
      - get
      - watch
      - list
  - apiGroups: ["networking.k8s.io"]
    resources:
      - ingresses
    verbs:
      - get
      - watch
      - list
  # Non-resource metrics endpoints
  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
    verbs: ["get"]
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{{ if and (eq (include "metrics.otelcol.enabled" .) "true") .Values.sumologic.metrics.collector.otelcol.enabled }}
# Binds the metrics collector ServiceAccount to the collector's own ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ template "sumologic.metadata.name.metrics.collector.clusterrolebinding.prometheus" . }}
subjects:
  - kind: ServiceAccount
    name: {{ template "sumologic.metadata.name.metrics.collector.serviceaccount" . }}
    namespace: {{ .Release.Namespace }}
roleRef:
  kind: ClusterRole
  name: {{ template "sumologic.metadata.name.metrics.collector.clusterrole" . }}
  apiGroup: rbac.authorization.k8s.io
---
# Binds the same ServiceAccount to the chart's shared roles ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ template "sumologic.metadata.name.metrics.collector.clusterrolebinding.metadata" . }}
  labels:
    app: {{ template "sumologic.labels.app.roles.clusterrolebinding" . }}
    {{- include "sumologic.labels.common" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    name: {{ template "sumologic.metadata.name.metrics.collector.serviceaccount" . }}
    namespace: {{ .Release.Namespace }}
roleRef:
  kind: ClusterRole
  name: {{ template "sumologic.metadata.name.roles.clusterrole" . }}
  apiGroup: rbac.authorization.k8s.io
{{- end }}

0 comments on commit 66798b4

Please sign in to comment.