/
pipelineruns_periodic.go
102 lines (91 loc) · 3.64 KB
/
pipelineruns_periodic.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
package metrics
import (
"fmt"
"sync"
stewardapi "github.com/SAP/stewardci-core/pkg/apis/steward/v1alpha1"
"github.com/SAP/stewardci-core/pkg/metrics"
"github.com/benbjohnson/clock"
"github.com/prometheus/client_golang/prometheus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// PipelineRunsPeriodic is the metric used to observe all existing pipeline
// runs on a periodic basis. It is backed by a pipelineRunsPeriodic instance.
var PipelineRunsPeriodic PipelineRunsMetric = &pipelineRunsPeriodic{}
// init initializes the package-level PipelineRunsPeriodic metric when the
// package is loaded.
func init() {
	periodic := PipelineRunsPeriodic.(*pipelineRunsPeriodic)
	periodic.init()
}
// pipelineRunsPeriodic is the implementation behind PipelineRunsPeriodic.
// For each observed pipeline run it records how long the run has been in its
// current state at the time of the observation, partitioned by state.
type pipelineRunsPeriodic struct {
// clock is the time source used to compute elapsed durations. If it is nil
// when init() runs, it is set to clock.New().
clock clock.Clock
// initOnlyOnce ensures that the body of init() is executed at most once.
initOnlyOnce sync.Once
// durationMetric is the histogram vector (label `state`) that receives the
// observed state durations in seconds.
durationMetric *prometheus.HistogramVec
// TODO remove when deprecated long enough
// durationMetricOld receives the same observations as durationMetric but is
// registered under the deprecated metric name.
durationMetricOld *prometheus.HistogramVec
}
// init prepares the metric for use. It falls back to a default clock if none
// has been set, then creates and registers the current histogram vector
// followed by its deprecated counterpart. The work is guarded by a sync.Once,
// so repeated calls have no further effect.
func (m *pipelineRunsPeriodic) init() {
	const stateLabel = "state"

	setup := func() {
		if m.clock == nil {
			m.clock = clock.New()
		}

		// Current metric, named relative to the package's subsystem.
		currentOpts := prometheus.HistogramOpts{
			Subsystem: subsystem,
			Name:      "ongoing_state_duration_periodic_observations_seconds",
			Help: "A histogram vector partitioned by pipeline run states that counts the number of periodic observations of pipeline runs in a state grouped by the duration of the state at the time of the observation." +
				"\n\nThe purpose of this metric is the detection of overly long processing times, caused by e.g. hanging controllers." +
				"\n\nThere's one histogram per pipeline run state (label `state`)." +
				" All existing pipeline runs get counted periodically, i.e. every observation cycle counts each pipeline run in exactly one histogram." +
				" This means a single pipeline run is counted zero, one or multiple times in the same or different buckets of the same or different histograms." +
				" This in turn means without knowing the observation and scraping intervals it is not possible to infer the _absolute_ number of pipeline runs observed." +
				" It is only meaningful to calculate a _ratio_ between observations in certain buckets and the total number of observations (in a single or across multiple histograms)." +
				"\n\nPipeline runs that are marked as deleted are not counted to exclude delays caused by finalization.",
			Buckets: prometheus.ExponentialBuckets(60, 2, 7),
		}
		m.durationMetric = prometheus.NewHistogramVec(currentOpts, []string{stateLabel})
		metrics.Registerer().MustRegister(m.durationMetric)

		// Deprecated metric, kept under its old fully qualified name.
		// TODO remove when deprecated long enough
		deprecatedOpts := prometheus.HistogramOpts{
			Name:    "steward_pipelinerun_ongoing_state_duration_periodic_observations_seconds",
			Help:    fmt.Sprintf("Deprecated! Use '%s_ongoing_state_duration_periodic_observations_seconds' instead.", subsystem),
			Buckets: prometheus.ExponentialBuckets(60, 2, 7),
		}
		m.durationMetricOld = prometheus.NewHistogramVec(deprecatedOpts, []string{stateLabel})
		metrics.Registerer().MustRegister(m.durationMetricOld)
	}

	m.initOnlyOnce.Do(setup)
}
// Observe records one observation for the given pipeline run.
//
// A run that counts as new (see isNewRun) is observed in state "new" with its
// creation timestamp as the start of the state. Any other run is observed in
// its current status state, measured from the start time recorded in the
// status' state details.
func (m *pipelineRunsPeriodic) Observe(run *stewardapi.PipelineRun) {
	state, since := run.Status.State, run.Status.StateDetails.StartedAt
	if m.isNewRun(run) {
		state, since = stewardapi.StateNew, run.CreationTimestamp
	}
	m.observe(state, since)
}
// observe adds the time elapsed since `since`, in seconds, to the histograms
// of the given state, for both the current and the deprecated metric.
//
// Nothing is recorded if `since` is unset or lies in the future according to
// the metric's clock.
func (m *pipelineRunsPeriodic) observe(state stewardapi.State, since metav1.Time) {
	// Without a start timestamp there is no duration to measure.
	if since.IsZero() {
		return
	}

	// A start time in the future would yield a negative duration.
	elapsed := m.clock.Since(since.Time)
	if elapsed < 0 {
		return
	}

	seconds := elapsed.Seconds()
	byState := prometheus.Labels{"state": string(state)}
	for _, histograms := range []*prometheus.HistogramVec{m.durationMetric, m.durationMetricOld} {
		histograms.With(byState).Observe(seconds)
	}
}
// isNewRun reports whether the pipeline run's status state is either
// undefined or "new".
func (m *pipelineRunsPeriodic) isNewRun(run *stewardapi.PipelineRun) bool {
	current := run.Status.State
	if current == stewardapi.StateUndefined {
		return true
	}
	return current == stewardapi.StateNew
}