diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index dcad3c73..62015db2 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -156,6 +156,9 @@ jobs:
           TF_VAR_jwt_secret: ${{ secrets.PROD_JWT_SECRET }}
           TF_VAR_image_version: ${{ inputs.image_tag }}
           TF_VAR_relay_public_key: ${{ secrets.RELAY_PUBLIC_KEY }}
+          # `notification_channels` is declared as list(any); Terraform parses TF_VAR_*
+          # values for non-string variables as HCL expressions, so pass a list literal.
+          TF_VAR_notification_channels: '[{"uid":"NNOynGwVz"}]'
         with:
           environment: "prod"
 
diff --git a/terraform/main.tf b/terraform/main.tf
index da8e7b24..0749ea2c 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -191,6 +191,7 @@ module "monitoring" {
   prometheus_workspace_id = aws_prometheus_workspace.prometheus.id
   load_balancer_arn       = module.ecs.load_balancer_arn
   environment             = local.environment
+  notification_channels   = var.notification_channels
 }
 
 data "aws_ecr_repository" "repository" {
diff --git a/terraform/monitoring/dashboard.jsonnet b/terraform/monitoring/dashboard.jsonnet
new file mode 100644
index 00000000..d94fe237
--- /dev/null
+++ b/terraform/monitoring/dashboard.jsonnet
@@ -0,0 +1,55 @@
+local grafana = import 'grafonnet-lib/grafana.libsonnet';
+local panels = import 'panels/panels.libsonnet';
+
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local annotation = grafana.annotation;
+local layout = grafana.layout;
+
+local ds = {
+  prometheus: {
+    type: 'prometheus',
+    uid: std.extVar('prometheus_uid'),
+  },
+  cloudwatch: {
+    type: 'cloudwatch',
+    uid: std.extVar('cloudwatch_uid'),
+  },
+};
+local vars = {
+  namespace: 'Push',
+  environment: std.extVar('environment'),
+  notifications: std.parseJson(std.extVar('notifications')),
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+local height = 8;
+local pos = grafana.layout.pos(height);
+
+////////////////////////////////////////////////////////////////////////////////
+
+dashboard.new(
+  title = std.extVar('dashboard_title'),
+  uid = std.extVar('dashboard_uid'),
+  editable = true,
+  graphTooltip = dashboard.graphTooltips.sharedCrosshair,
+  timezone = dashboard.timezones.utc,
+)
+.addAnnotation(
+  annotation.new(
+    target = {
+      limit: 100,
+      matchAny: false,
+      tags: [],
+      type: 'dashboard',
+    },
+  )
+)
+
+.addPanels(layout.generate_grid([
+  //////////////////////////////////////////////////////////////////////////////
+  row.new('Application'),
+  panels.app.postgres_query_rate(ds, vars) { gridPos: pos._6 },
+  panels.app.postgres_query_latency(ds, vars) { gridPos: pos._6 },
+]))
diff --git a/terraform/monitoring/main.tf b/terraform/monitoring/main.tf
index a7f640b9..28d7ffd5 100644
--- a/terraform/monitoring/main.tf
+++ b/terraform/monitoring/main.tf
@@ -3,12 +3,6 @@ locals {
   # the Grafana provider e.g.
   # net/prod-relay-load-balancer/e9a51c46020a0f85
   load_balancer = join("/", slice(split("/", var.load_balancer_arn), 1, 4))
-  opsgenie_notification_channel = "NNOynGwVz"
-  notifications = (
-    var.environment == "prod" ?
-    [{ uid = local.opsgenie_notification_channel }] :
-    []
-  )
 }
 
 resource "grafana_data_source" "prometheus" {
@@ -34,7 +28,29 @@ resource "grafana_data_source" "cloudwatch" {
   })
 }
 
+data "jsonnet_file" "dashboard" {
+  source = "${path.module}/dashboard.jsonnet"
+
+  ext_str = {
+    dashboard_title = "Push Server - ${title(module.this.stage)}"
+    dashboard_uid   = "push-${module.this.stage}"
+
+    prometheus_uid = grafana_data_source.prometheus.uid
+    cloudwatch_uid = grafana_data_source.cloudwatch.uid
+
+    environment   = module.this.stage
+    notifications = jsonencode(var.notification_channels)
+  }
+}
+
 resource "grafana_dashboard" "at_a_glance" {
+  overwrite   = true
+  message     = "Updated by Terraform"
+  config_json = data.jsonnet_file.dashboard.rendered
+}
+
+
+resource "grafana_dashboard" "at_a_glance_old" {
   overwrite   = true
   message     = "Updated by Terraform"
   config_json = jsonencode({
@@ -533,7 +549,7 @@ resource "grafana_dashboard" "at_a_glance" {
           "name" : "${var.environment} Echo Server 5XX alert",
           "noDataState" : "no_data",
           "message" : "Echo server - Prod - 5XX error",
-          "notifications" : local.notifications
+          "notifications" : var.notification_channels
         },
         "datasource" : {
           "type" : "cloudwatch",
@@ -804,8 +820,8 @@ resource "grafana_dashboard" "at_a_glance" {
     },
     "timepicker" : {},
     "timezone" : "",
-    "title" : var.app_name,
-    "uid" : var.app_name,
+    "title" : "${var.app_name} - old",
+    "uid" : "${var.app_name}-old",
     "version" : 13,
     "weekStart" : ""
   })
diff --git a/terraform/monitoring/panels/app/postgres_query_latency.libsonnet b/terraform/monitoring/panels/app/postgres_query_latency.libsonnet
new file mode 100644
index 00000000..fe863d42
--- /dev/null
+++ b/terraform/monitoring/panels/app/postgres_query_latency.libsonnet
@@ -0,0 +1,25 @@
+local grafana = import '../../grafonnet-lib/grafana.libsonnet';
+local defaults = import '../../grafonnet-lib/defaults.libsonnet';
+
+local panels = grafana.panels;
+local targets = grafana.targets;
+
+{
+  new(ds, vars)::
+    panels.timeseries(
+      title = 'Postgres Query Latency',
+      datasource = ds.prometheus,
+    )
+    .configure(
+      defaults.configuration.timeseries
+      .withUnit('ms')
+    )
+
+    .addTarget(targets.prometheus(
+      datasource = ds.prometheus,
+      expr = 'sum by (aws_ecs_task_revision, name) (rate(postgres_query_latency_sum[$__rate_interval])) / sum by (aws_ecs_task_revision, name) (rate(postgres_query_latency_count[$__rate_interval]))',
+      legendFormat = '{{name}} r{{aws_ecs_task_revision}}',
+      exemplar = false,
+      refId = 'PostgresQueryLatency',
+    ))
+}
diff --git a/terraform/monitoring/panels/app/postgres_query_rate.libsonnet b/terraform/monitoring/panels/app/postgres_query_rate.libsonnet
new file mode 100644
index 00000000..2183cd4f
--- /dev/null
+++ b/terraform/monitoring/panels/app/postgres_query_rate.libsonnet
@@ -0,0 +1,35 @@
+local grafana = import '../../grafonnet-lib/grafana.libsonnet';
+local defaults = import '../../grafonnet-lib/defaults.libsonnet';
+
+local panels = grafana.panels;
+local targets = grafana.targets;
+
+{
+  new(ds, vars)::
+    panels.timeseries(
+      title = 'Postgres Query Rate',
+      datasource = ds.prometheus,
+    )
+    .configure(
+      defaults.configuration.timeseries
+      .withUnit('cps')
+    )
+
+    .addTarget(targets.prometheus(
+      datasource = ds.prometheus,
+      expr = 'sum by (aws_ecs_task_revision, name) (rate(postgres_queries_total[$__rate_interval]))',
+      legendFormat = '{{name}} r{{aws_ecs_task_revision}}',
+      exemplar = true,
+      refId = 'PostgresQueryRate',
+    ))
+
+    .addTarget(targets.prometheus(
+      datasource = ds.prometheus,
+      // Group by revision so the `{{aws_ecs_task_revision}}` legend resolves; a bare
+      // sum() drops every label and the legend would render as just "r".
+      expr = 'sum by (aws_ecs_task_revision) (rate(postgres_queries_total[$__rate_interval]))',
+      legendFormat = 'r{{aws_ecs_task_revision}}',
+      exemplar = true,
+      refId = 'PostgresQueryRateTotal',
+    ))
+}
diff --git a/terraform/monitoring/panels/panels.libsonnet b/terraform/monitoring/panels/panels.libsonnet
new file mode 100644
index 00000000..15364c63
--- /dev/null
+++ b/terraform/monitoring/panels/panels.libsonnet
@@ -0,0 +1,8 @@
+local panels = (import '../grafonnet-lib/defaults.libsonnet').panels;
+
+{
+  app: {
+    postgres_query_rate:    (import 'app/postgres_query_rate.libsonnet'    ).new,
+    postgres_query_latency: (import 'app/postgres_query_latency.libsonnet' ).new,
+  },
+}
diff --git a/terraform/monitoring/variables.tf b/terraform/monitoring/variables.tf
index 168f542f..536318aa 100644
--- a/terraform/monitoring/variables.tf
+++ b/terraform/monitoring/variables.tf
@@ -13,3 +13,8 @@ variable "prometheus_workspace_id" {
 variable "load_balancer_arn" {
   type = string
 }
+
+variable "notification_channels" {
+  description = "The notification channels to send alerts to"
+  type        = list(any)
+}
diff --git a/terraform/variables.tf b/terraform/variables.tf
index 8f8c756f..ca7c3f2b 100644
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -41,3 +41,12 @@ variable "relay_public_key" {
   type      = string
   sensitive = true
 }
+
+#-------------------------------------------------------------------------------
+# Alerting / Monitoring
+
+variable "notification_channels" {
+  description = "The notification channels to send alerts to"
+  type        = list(any)
+  default     = []
+}