Skip to content

Commit

Permalink
feat(metrics): do not filter in default remote write definitions
Browse files Browse the repository at this point in the history
  • Loading branch information
swiatekm-sumo committed Jul 21, 2023
1 parent 4aa0009 commit d92a00c
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 219 deletions.
1 change: 1 addition & 0 deletions .changelog/3157.changed.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
feat(metrics): do not filter in default remote write definitions
215 changes: 14 additions & 201 deletions deploy/helm/sumologic/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2177,166 +2177,23 @@ kube-prometheus-stack:
action: replace

remoteWrite:
## kube non pod state metrics
## kube_daemonset_status_current_number_scheduled
## kube_daemonset_status_desired_number_scheduled
## kube_daemonset_status_number_misscheduled
## kube_daemonset_status_number_unavailable
## kube_deployment_spec_replicas
## kube_deployment_status_replicas_available
## kube_deployment_status_replicas_unavailable
## kube_node_info
## kube_node_status_allocatable
## kube_node_status_capacity
## kube_node_status_condition
## kube_statefulset_metadata_generation
## kube_statefulset_replicas
## kube_statefulset_status_observed_generation
## kube_statefulset_status_replicas
## kube_hpa_spec_max_replicas
## kube_hpa_spec_min_replicas
## kube_hpa_status_condition
## kube_hpa_status_current_replicas
## kube_hpa_status_desired_replicas
## kube_service_info
## kube_service_spec_external_ip
## kube_service_spec_type
## kube_service_status_load_balancer_ingress
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.state
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kube-state-metrics;(?:kube_statefulset_status_observed_generation|kube_statefulset_status_replicas|kube_statefulset_replicas|kube_statefulset_metadata_generation|kube_daemonset_status_current_number_scheduled|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_misscheduled|kube_daemonset_status_number_unavailable|kube_deployment_spec_replicas|kube_deployment_status_replicas_available|kube_deployment_status_replicas_unavailable|kube_node_info|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_hpa_spec_max_replicas|kube_hpa_spec_min_replicas|kube_hpa_status_(condition|(current|desired)_replicas)|kube_service_info|kube_service_spec_external_ip|kube_service_spec_type|kube_service_status_load_balancer_ingress)
sourceLabels: [job, __name__]
## kube pod state metrics
## kube_pod_status_phase
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.state
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kube-state-metrics;(?:kube_pod_status_phase)
sourceLabels: [job, __name__]
## kube container state metrics
## kube_pod_container_info
## kube_pod_container_resource_limits
## kube_pod_container_resource_requests
## kube_pod_container_status_ready
## kube_pod_container_status_restarts_total
## kube_pod_container_status_terminated_reason
## kube_pod_container_status_waiting_reason
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.state
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kube-state-metrics;(?:kube_pod_container_info|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kube_pod_container_status_ready|kube_pod_container_status_terminated_reason|kube_pod_container_status_waiting_reason|kube_pod_container_status_restarts_total)
sourceLabels: [job, __name__]
## controller manager metrics
## https://kubernetes.io/docs/concepts/cluster-administration/monitoring/#kube-controller-manager-metrics
## e.g.
## cloudprovider_aws_api_request_duration_seconds_bucket
## cloudprovider_aws_api_request_duration_seconds_count
## cloudprovider_aws_api_request_duration_seconds_sum
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.controller-manager
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kubelet;cloudprovider_.*_api_request_duration_seconds.*
sourceLabels: [job, __name__]
## scheduler metrics
##
## scheduler_e2e_* is present for K8s <1.23
## scheduler_e2e_scheduling_duration_seconds_bucket
## scheduler_e2e_scheduling_duration_seconds_count
## scheduler_e2e_scheduling_duration_seconds_sum
##
## scheduler_scheduling_attempt_duration_seconds is present for K8s >=1.23
## scheduler_scheduling_attempt_duration_seconds_bucket
## scheduler_scheduling_attempt_duration_seconds_count
## scheduler_scheduling_attempt_duration_seconds_sum
## scheduler_framework_extension_point_duration_seconds_bucket
## scheduler_framework_extension_point_duration_seconds_count
## scheduler_framework_extension_point_duration_seconds_sum
## scheduler_scheduling_algorithm_duration_seconds_bucket
## scheduler_scheduling_algorithm_duration_seconds_count
## scheduler_scheduling_algorithm_duration_seconds_sum
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.scheduler
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kube-scheduler;scheduler_(?:e2e_scheduling|scheduling_attempt|framework_extension_point|scheduling_algorithm)_duration_seconds.*
sourceLabels: [job, __name__]
## api server metrics:
## apiserver_request_count
## apiserver_request_total
## apiserver_request_duration_seconds_count
## apiserver_request_duration_seconds_sum
## etcd_request_cache_get_duration_seconds_count
## etcd_request_cache_get_duration_seconds_sum
## etcd_request_cache_add_duration_seconds_count
## etcd_request_cache_add_duration_seconds_sum
## etcd_request_cache_add_latencies_summary_count
## etcd_request_cache_add_latencies_summary_sum
## etcd_request_cache_get_latencies_summary_count
## etcd_request_cache_get_latencies_summary_sum
## etcd_helper_cache_hit_count
## etcd_helper_cache_hit_total
## etcd_helper_cache_miss_count
## etcd_helper_cache_miss_total
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.apiserver
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: apiserver;(?:apiserver_request_(?:count|total)|apiserver_request_(?:duration_seconds)_(?:count|sum)|etcd_helper_cache_(?:hit|miss)_(?:count|total))
sourceLabels: [job, __name__]
## kubelet metrics:
## kubelet_docker_operations_errors
## kubelet_docker_operations_errors_total
## kubelet_docker_operations_duration_seconds_count
## kubelet_docker_operations_duration_seconds_sum
## kubelet_runtime_operations_duration_seconds_count
## kubelet_runtime_operations_duration_seconds_sum
## kubelet_running_container_count
## kubelet_running_containers
## kubelet_running_pod_count
## kubelet_running_pods
## kubelet_docker_operations_latency_microseconds
## kubelet_docker_operations_latency_microseconds_count
## kubelet_docker_operations_latency_microseconds_sum
## kubelet_runtime_operations_latency_microseconds
## kubelet_runtime_operations_latency_microseconds_count
## kubelet_runtime_operations_latency_microseconds_sum
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.kubelet
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kubelet;(?:kubelet_docker_operations_errors(?:|_total)|kubelet_(?:docker|runtime)_operations_duration_seconds_(?:count|sum)|kubelet_running_(?:container|pod)(?:_count|s)|kubelet_(:?docker|runtime)_operations_latency_microseconds(?:|_count|_sum))
sourceLabels: [job, __name__]
## cadvisor container metrics
## container_cpu_usage_seconds_total
## container_fs_limit_bytes
## container_fs_usage_bytes
## container_memory_working_set_bytes
## container_cpu_cfs_throttled_seconds_total
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.container
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kubelet;.+;(?:container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_seconds_total)
sourceLabels: [job, container, __name__]
## cadvisor aggregate container metrics
## container_network_receive_bytes_total
## container_network_transmit_bytes_total
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.container
## infrastructure metrics
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kubelet;(?:container_network_receive_bytes_total|container_network_transmit_bytes_total)
sourceLabels: [job, __name__]
## node exporter metrics
## node_cpu_seconds_total
## node_load1
## node_load5
## node_load15
regex: (?:kube-state-metrics|kubelet|kube-scheduler|apiserver|coredns|kube-etcd)
sourceLabels: [job]
## kube_pod_info is only used for recording rules, so it is dropped here
- action: drop
regex: kube_pod_info
sourceLabels: [__name__]
## We don't want Prometheus' scrape metrics
- action: drop
regex: scrape_.*
sourceLabels: [__name__]

## This remote write needs to be separate because several recording-rule-based metrics for Nodes fall into this job
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.node
remoteTimeout: 5s
writeRelabelConfigs:
Expand Down Expand Up @@ -2448,50 +2305,6 @@ kube-prometheus-stack:
- action: keep
regex: (?:up|prometheus_remote_storage_.*|fluentd_.*|fluentbit.*|otelcol.*)
sourceLabels: [__name__]
## control plane metrics
## coredns:
## coredns_cache_entries
## coredns_cache_hits_total
## coredns_cache_misses_total
## coredns_dns_request_duration_seconds_count
## coredns_dns_request_duration_seconds_sum
## coredns_dns_requests_total
## coredns_dns_responses_total
## coredns_forward_requests_total
## process_cpu_seconds_total
## process_open_fds
## process_resident_memory_bytes
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.control-plane.coredns
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: coredns;(?:coredns_cache_(entries|(hits|misses)_total)|coredns_dns_request_duration_seconds_(count|sum)|coredns_(forward_requests|dns_requests|dns_responses)_total|process_(cpu_seconds_total|open_fds|resident_memory_bytes))
sourceLabels: [job, __name__]
## etcd server:
## etcd_mvcc_db_total_size_in_bytes
## etcd_debugging_store_expires_total
## etcd_debugging_store_watchers
## etcd_disk_backend_commit_duration_seconds_bucket
## etcd_disk_wal_fsync_duration_seconds_bucket
## etcd_grpc_proxy_cache_hits_total
## etcd_grpc_proxy_cache_misses_total
## etcd_network_client_grpc_received_bytes_total
## etcd_network_client_grpc_sent_bytes_total
## etcd_server_has_leader
## etcd_server_leader_changes_seen_total
## etcd_server_proposals_applied_total
## etcd_server_proposals_committed_total
## etcd_server_proposals_failed_total
## etcd_server_proposals_pending
## process_cpu_seconds_total
## process_open_fds
## process_resident_memory_bytes
- url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.control-plane.kube-etcd
remoteTimeout: 5s
writeRelabelConfigs:
- action: keep
regex: kube-etcd;(?:etcd_debugging_(store_(expires_total|watchers))|etcd_mvcc_db_total_size_in_bytes|etcd_disk_(backend_commit|wal_fsync)_duration_seconds_.*|etcd_grpc_proxy_cache_(hits|misses)_total|etcd_network_client_grpc_(received|sent)_bytes_total|etcd_server_(has_leader|leader_changes_seen_total)|etcd_server_proposals_(pending|(applied|committed|failed)_total)|process_(cpu_seconds_total|open_fds|resident_memory_bytes))
sourceLabels: [job, __name__]

## Nginx ingress controller metrics
## rel: https://docs.nginx.com/nginx-ingress-controller/logging-and-monitoring/prometheus/#available-metrics
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,6 @@ data:
path_tag = true
paths = [
"/prometheus.metrics",
"/prometheus.metrics.apiserver",
"/prometheus.metrics.applications.activemq",
"/prometheus.metrics.applications.apache",
"/prometheus.metrics.applications.cassandra",
Expand All @@ -299,17 +298,10 @@ data:
"/prometheus.metrics.applications.squidproxy",
"/prometheus.metrics.applications.tomcat",
"/prometheus.metrics.applications.varnish",
"/prometheus.metrics.container",
"/prometheus.metrics.control-plane.coredns",
"/prometheus.metrics.control-plane.kube-etcd",
"/prometheus.metrics.controller-manager",
"/prometheus.metrics.custom",
"/prometheus.metrics.kubelet",
"/prometheus.metrics.node",
"/prometheus.metrics.operator.rule",
"/prometheus.metrics.others",
"/prometheus.metrics.scheduler",
"/prometheus.metrics.state"
"/prometheus.metrics.others"
]
service:
extensions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,6 @@ data:
path_tag = true
paths = [
"/prometheus.metrics",
"/prometheus.metrics.apiserver",
"/prometheus.metrics.applications.activemq",
"/prometheus.metrics.applications.apache",
"/prometheus.metrics.applications.cassandra",
Expand All @@ -299,15 +298,8 @@ data:
"/prometheus.metrics.applications.squidproxy",
"/prometheus.metrics.applications.tomcat",
"/prometheus.metrics.applications.varnish",
"/prometheus.metrics.container",
"/prometheus.metrics.control-plane.coredns",
"/prometheus.metrics.control-plane.kube-etcd",
"/prometheus.metrics.controller-manager",
"/prometheus.metrics.kubelet",
"/prometheus.metrics.node",
"/prometheus.metrics.operator.rule",
"/prometheus.metrics.scheduler",
"/prometheus.metrics.state"
"/prometheus.metrics.operator.rule"
]
service:
extensions:
Expand Down

0 comments on commit d92a00c

Please sign in to comment.