From b355f7545b3f4b10f65ee8a24bda37cd150ec837 Mon Sep 17 00:00:00 2001
From: Kevin Berry
Date: Tue, 19 Apr 2016 19:43:31 +0000
Subject: [PATCH] Don't try to use nagios-plugins-contrib package in ubuntu
 12.04, and add useful plugins

---
 .../plugins/check_postgres_replication        | 291 ++++++++++++++++++
 files/default/plugins/check_sidekiq_queue     | 136 ++++++++
 recipes/client_package.rb                     |   2 +-
 3 files changed, 428 insertions(+), 1 deletion(-)
 create mode 100755 files/default/plugins/check_postgres_replication
 create mode 100755 files/default/plugins/check_sidekiq_queue

diff --git a/files/default/plugins/check_postgres_replication b/files/default/plugins/check_postgres_replication
new file mode 100755
--- /dev/null
+++ b/files/default/plugins/check_postgres_replication
@@ -0,0 +1,291 @@
+#!/bin/bash
+# ========================================================================================
+# Postgres replication lag nagios check using psql and bash.
+#
+# 2013 Wanelo Inc, Apache License.
+# This script expects psql to be in the PATH.
+#
+# Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]
+#                                     [-w <warn_bytes>] [-c <critical_bytes>]
+#   -h --host       replica host (default 127.0.0.1)
+#   -m --master     master fqdn or ip (required)
+#   -U --user       database user (default postgres)
+#   -x --units      units of measurement to display (KB or MB, default MB)
+#   -w --warning    warning threshold in bytes (default 10485760 = 10MB)
+#   -c --critical   critical threshold in bytes (default 15728640 = 15MB)
+#
+# Exit codes follow the Nagios plugin convention: 0 OK, 1 WARNING, 2 CRITICAL,
+# 3 UNKNOWN (bad invocation or unusable environment).
+# ========================================================================================
+
+# Nagios return codes
+readonly STATE_OK=0
+readonly STATE_WARNING=1
+readonly STATE_CRITICAL=2
+readonly STATE_UNKNOWN=3
+
+# Default thresholds, in bytes of WAL
+readonly DEFAULT_WARNING_THRESHOLD=10485760
+readonly DEFAULT_CRITICAL_THRESHOLD=15728640
+
+readonly DEFAULT_HOST="127.0.0.1"
+readonly DEFAULT_USER=postgres
+readonly DEFAULT_UNITS=MB
+
+readonly PATH=/opt/local/bin:${PATH}
+readonly MASTER_SQL="SELECT pg_current_xlog_location()"
+readonly REPLICA_SQL="SELECT pg_last_xlog_replay_location()"
+readonly REPLICA_TIME_LAG="select now() - pg_last_xact_replay_timestamp()"
+
+# /etc/nodename is a Solaris/illumos convention; use hostname everywhere else.
+if [[ -r /etc/nodename ]]; then
+  NODENAME=$(cat /etc/nodename)
+else
+  NODENAME=$(hostname)
+fi
+readonly NODENAME
+
+# Scratch file for psql's stderr. mktemp keeps the name unpredictable and the
+# EXIT trap removes the file on every exit path.
+ERR=$(mktemp "${TMPDIR:-/tmp}/repl_check.XXXXXX") || {
+  echo "REPLICATION UNKNOWN : unable to create temporary file"
+  exit "$STATE_UNKNOWN"
+}
+readonly ERR
+trap 'rm -f -- "$ERR"' EXIT
+
+# Print the command line help on stdout.
+usage() {
+  cat <<'EOF'
+Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]
+                                    [-w <warn_bytes>] [-c <critical_bytes>]
+  -h --host       replica host (default 127.0.0.1)
+  -m --master     master fqdn or ip (required)
+  -U --user       database user (default postgres)
+  -x --units      units of measurement to display (KB or MB, default MB)
+  -w --warning    warning threshold in bytes (default 10485760 = 10MB)
+  -c --critical   critical threshold in bytes (default 15728640 = 15MB)
+
+     --help       show this message
+     --verbose    trace every command (set -x)
+EOF
+}
+
+# Parse the command line into the readonly globals MASTER, USER, HOST, UNITS,
+# WARNING_THRESHOLD and CRITICAL_THRESHOLD. Exits UNKNOWN on bad input.
+# Arguments: the script's own arguments ("$@")
+parse_arguments() {
+  local arg name
+  local -a short_args=()
+
+  # getopts only understands short options, so map every long option onto
+  # its short twin. Keeping the words in an array preserves values that
+  # contain spaces and needs no eval.
+  for arg in "$@"; do
+    case "$arg" in
+      --host)     short_args+=("-h") ;;
+      --master)   short_args+=("-m") ;;
+      --user)     short_args+=("-U") ;;
+      --units)    short_args+=("-x") ;;
+      --warning)  short_args+=("-w") ;;
+      --critical) short_args+=("-c") ;;
+      --help)     short_args+=("-H") ;;
+      --verbose)  short_args+=("-v") ;;
+      *)          short_args+=("$arg") ;;
+    esac
+  done
+  set -- "${short_args[@]}"
+
+  local host master user units warning_threshold critical_threshold
+  local OPTION OPTARG OPTIND=1
+  while getopts "h:m:U:x:w:c:Hv" OPTION; do
+    case "$OPTION" in
+      v) set -x ;;
+      H)
+        usage
+        exit "$STATE_OK"
+        ;;
+      h) host=$OPTARG ;;
+      m) master=$OPTARG ;;
+      U) user=$OPTARG ;;
+      x) units=$OPTARG ;;
+      w) warning_threshold=$OPTARG ;;
+      c) critical_threshold=$OPTARG ;;
+      *)
+        # getopts has already reported the unknown option / missing value
+        usage
+        exit "$STATE_UNKNOWN"
+        ;;
+    esac
+  done
+
+  readonly MASTER=$master
+  readonly USER=${user:-$DEFAULT_USER}
+  readonly HOST=${host:-$DEFAULT_HOST}
+  readonly UNITS=${units:-$DEFAULT_UNITS}
+  readonly WARNING_THRESHOLD=${warning_threshold:-$DEFAULT_WARNING_THRESHOLD}
+  readonly CRITICAL_THRESHOLD=${critical_threshold:-$DEFAULT_CRITICAL_THRESHOLD}
+
+  # The thresholds feed integer comparisons; reject anything else up front
+  for name in WARNING_THRESHOLD CRITICAL_THRESHOLD; do
+    if [[ ! "${!name}" =~ ^[0-9]+$ ]]; then
+      echo "REPLICATION UNKNOWN : ${name} must be a whole number of bytes"
+      exit "$STATE_UNKNOWN"
+    fi
+  done
+}
+
+# Exit UNKNOWN unless the master host was supplied.
+check_required_arguments() {
+  if [[ -z "$MASTER" ]]; then
+    echo "REPLICATION UNKNOWN : pass master host in parameters via -m flag"
+    exit "$STATE_UNKNOWN"
+  fi
+}
+
+# Set DIVISOR (bytes per display unit) from UNITS; exit UNKNOWN on a bad unit.
+normalize_units() {
+  case "$UNITS" in
+    KB) readonly DIVISOR=1024 ;;
+    MB) readonly DIVISOR=1048576 ;;
+    *)
+      echo "Incorrect unit of measurement"
+      usage
+      exit "$STATE_UNKNOWN"
+      ;;
+  esac
+}
+
+# Print the Nagios status line and exit.
+# Arguments: $1 - state label (OK, WARNING, CRITICAL)
+#            $2 - exit code
+#            $3 - replication lag in bytes (may be empty on errors)
+#            $4 - replication time lag as reported by postgres
+result() {
+  local description=$1
+  local status=$2
+  local diff=$3
+  local time_lag=$4
+  local error="" message diff_units perfdata=""
+
+  if [[ -s "$ERR" ]]; then
+    # Nagios shows one line only, and "|" would start the perfdata section
+    error=$(< "$ERR")
+    error=${error//$'\n'/ }
+    error=${error//|/ }
+  fi
+
+  if [[ "$status" -eq "$STATE_CRITICAL" && -n "$error" ]]; then
+    message="replication check error ${error}"
+  else
+    diff_units=$(bytes_to_units "$diff")
+    message="replication lag is ${diff_units}${UNITS} : time lag is ${time_lag}"
+  fi
+
+  # Perfdata is "label=value[UOM];warn;crit". Only the byte lag is numeric
+  # and it is the value the thresholds apply to; the time lag stays in the
+  # human readable text.
+  if [[ -n "$diff" ]]; then
+    perfdata="|repl=${diff}B;${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
+  fi
+
+  echo "REPLICATION $description : ${NODENAME} ${message}${perfdata}"
+  exit "$status"
+}
+
+# Each query helper prints the single value returned by psql and returns
+# psql's own exit status; stderr is captured in $ERR for result().
+get_replica_current_xlog() {
+  psql -U "$USER" -Atc "$REPLICA_SQL" -h "$HOST" 2>"$ERR"
+}
+
+get_master_current_xlog() {
+  psql -U "$USER" -Atc "$MASTER_SQL" -h "$MASTER" 2>"$ERR"
+}
+
+check_replica_time_lag() {
+  psql -U "$USER" -Atc "$REPLICA_TIME_LAG" -h "$HOST" 2>"$ERR"
+}
+
+# Report CRITICAL when a psql call failed.
+# Arguments: $1 - exit status of the psql call
+check_errors() {
+  if [[ "$1" -ne 0 ]]; then
+    # Make sure result() has something to show even if psql was silent
+    [[ -s "$ERR" ]] || echo "psql exited with status $1" > "$ERR"
+    result "CRITICAL" "$STATE_CRITICAL"
+  fi
+}
+
+# Convert an xlog location ("logid/offset", both hexadecimal) to bytes.
+# Returns 1 without output when the argument does not look like a location.
+# NOTE(review): 0xFF000000 bytes per logid matches the pre-9.3 WAL layout
+# described at the link below; newer servers use 0x100000000 - confirm
+# against the monitored version.
+xlog_to_bytes() {
+  # http://eulerto.blogspot.com/2011/11/understanding-wal-nomenclature.html
+  [[ "$1" =~ ^[0-9A-Fa-f]+/[0-9A-Fa-f]+$ ]] || return 1
+  local logid="${1%%/*}"
+  local offset="${1##*/}"
+  echo $((0xFF000000 * 0x$logid + 0x$offset))
+}
+
+# Convert a byte count to the display unit selected with -x.
+bytes_to_units() {
+  local diff=$1
+  if [[ -z "$diff" ]]; then
+    echo "ERROR: NO DATA AVAILABLE"
+  else
+    echo $(( diff / DIVISOR ))
+  fi
+}
+
+# Report CRITICAL for a value that is not an xlog location.
+# Arguments: $1 - "replica" or "master", $2 - the offending value
+bad_location() {
+  echo "unparsable $1 xlog location '$2'" > "$ERR"
+  result "CRITICAL" "$STATE_CRITICAL"
+}
+
+# Entry point: compare master and replica xlog positions and report the lag.
+main() {
+  parse_arguments "$@"
+  check_required_arguments
+  normalize_units
+
+  # Declarations are kept apart from the assignments: "local x=$(cmd)"
+  # returns the status of local and would hide every psql failure.
+  local replica_xlog replica_bytes master_xlog master_bytes diff time_lag
+
+  replica_xlog=$(get_replica_current_xlog)
+  check_errors $?
+  if [[ -z "$replica_xlog" ]]; then
+    echo -n "Unable to find replica XLOG replay location" > "$ERR"
+    result "CRITICAL" "$STATE_CRITICAL"
+  fi
+  replica_bytes=$(xlog_to_bytes "$replica_xlog") || bad_location replica "$replica_xlog"
+
+  # Query master for latest xlog
+  master_xlog=$(get_master_current_xlog)
+  check_errors $?
+  master_bytes=$(xlog_to_bytes "$master_xlog") || bad_location master "$master_xlog"
+
+  # Calculate xlog diff in bytes
+  diff=$(( master_bytes - replica_bytes ))
+
+  # The time lag is informational only. Once all queries are done, whatever
+  # psql left on stderr must not replace the lag report in result().
+  time_lag=$(check_replica_time_lag) || time_lag="unknown"
+  : > "$ERR"
+
+  # Output response
+  if [ "$diff" -ge "$CRITICAL_THRESHOLD" ]; then
+    result "CRITICAL" "$STATE_CRITICAL" "$diff" "$time_lag"
+  elif [ "$diff" -ge "$WARNING_THRESHOLD" ]; then
+    result "WARNING" "$STATE_WARNING" "$diff" "$time_lag"
+  else
+    result "OK" "$STATE_OK" "$diff" "$time_lag"
+  fi
+}
+
+main "$@"
diff --git a/files/default/plugins/check_sidekiq_queue b/files/default/plugins/check_sidekiq_queue
new file mode 100755
--- /dev/null
+++ b/files/default/plugins/check_sidekiq_queue
@@ -0,0 +1,136 @@
+#!/bin/bash
+# ========================================================================================
+# Sidekiq Queue Size Nagios Check
+#
+# (c) Wanelo Inc, Distributed under Apache License
+#
+# Usage:
+#   To check a regular queue:
+#     ./check_sidekiq_queue [ -h <host> ] [ -a <password> ] [ -q <queue> ] [ -n <namespace> ] [ -d <db> ] [-w <warn>] [-c <crit>]
+#     Eg: ./check_sidekiq_queue -w 500 -c 2000 # warning at 500 or higher used, critical at 2000 or higher
+#
+#   To check schedule or retry (system) queue:
+#     ./check_sidekiq_queue [ -h <host> ] [ -a <password> ] [ -s <schedule|retry> ] [ -n <namespace> ] [ -d <db> ] [-w <warn>] [-c <crit>]
+#
+# Invocation errors exit UNKNOWN (3); a failed redis lookup exits CRITICAL (2).
+# ========================================================================================
+
+# Nagios return codes
+readonly STATE_OK=0
+readonly STATE_WARNING=1
+readonly STATE_CRITICAL=2
+readonly STATE_UNKNOWN=3
+
+WARNING_THRESHOLD=500
+CRITICAL_THRESHOLD=1000
+QUEUE="default"
+SYSTEM=""
+NAMESPACE=""
+HOST="127.0.0.1"
+PASS=""
+DB=0
+
+# Print an UNKNOWN status line and exit; used for invocation errors.
+unknown() {
+  echo "SIDEKIQ UNKNOWN : $*"
+  exit "$STATE_UNKNOWN"
+}
+
+# Exit UNKNOWN when option $1 is the last word, i.e. its value is missing.
+need_value() {
+  [[ $# -ge 2 ]] || unknown "option $1 requires a value"
+}
+
+# Parse parameters; every option takes exactly one value
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -d | --db)        need_value "$@"; DB=$2 ;;
+    -h | --hostname)  need_value "$@"; HOST=$2 ;;
+    -a | --password)  need_value "$@"; PASS=$2 ;;
+    -q | --queue)     need_value "$@"; QUEUE=$2 ;;
+    -n | --namespace) need_value "$@"; NAMESPACE=$2 ;;
+    -s | --system)    need_value "$@"; SYSTEM=$2 ;;
+    -w | --warning)   need_value "$@"; WARNING_THRESHOLD=$2 ;;
+    -c | --critical)  need_value "$@"; CRITICAL_THRESHOLD=$2 ;;
+    *)
+      echo "Unknown argument: $1"
+      exit "$STATE_UNKNOWN"
+      ;;
+  esac
+  shift 2
+done
+
+PATH=/opt/local/bin:$PATH
+NODENAME=$HOSTNAME
+
+# Scratch file for redis-cli's stderr. mktemp keeps the name unpredictable
+# and the EXIT trap removes the file on every exit path.
+ERR=$(mktemp "${TMPDIR:-/tmp}/redis-cli.error.XXXXXX") || unknown "unable to create temporary file"
+trap 'rm -f -- "$ERR"' EXIT
+
+# Print the Nagios status line for QUEUE / QUEUE_SIZE and exit.
+# Arguments: $1 - state label, $2 - exit code
+result() {
+  local description=$1
+  local status=$2
+  local perfdata=""
+  # Perfdata values must be numeric; omit it when QUEUE_SIZE carries an error
+  if [[ "$QUEUE_SIZE" =~ ^[0-9]+$ ]]; then
+    perfdata="|sidekiq_queue_${QUEUE}=${QUEUE_SIZE};${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
+  fi
+  echo "SIDEKIQ $description : ${NODENAME} ${QUEUE_SIZE} on ${QUEUE}${perfdata}"
+  exit "$status"
+}
+
+if [[ "$QUEUE" != "default" && -n "$SYSTEM" ]]; then
+  unknown "invalid usage: pass -q or -s but not both"
+fi
+
+if [[ -n "$SYSTEM" && "$SYSTEM" != "schedule" && "$SYSTEM" != "retry" ]]; then
+  unknown "invalid usage: -s expect one of schedule or retry"
+fi
+
+for threshold in "$WARNING_THRESHOLD" "$CRITICAL_THRESHOLD"; do
+  [[ "$threshold" =~ ^[0-9]+$ ]] || unknown "thresholds must be whole numbers, got '$threshold'"
+done
+
+# Build the redis-cli options as an array so a password or host containing
+# spaces or glob characters reaches redis-cli as a single word.
+redis_args=(-h "$HOST" -n "$DB")
+if [[ -n "$PASS" ]]; then
+  redis_args+=(-a "$PASS")
+fi
+
+if [[ -n "$NAMESPACE" ]]; then
+  NAMESPACE="$NAMESPACE:"
+fi
+
+# -s queues are counted with ZCARD, regular queues with LLEN
+if [[ -n "$SYSTEM" ]]; then
+  QUEUE=$SYSTEM
+  reply=$(redis-cli "${redis_args[@]}" zcard "${NAMESPACE}${SYSTEM}" 2>"$ERR")
+else
+  reply=$(redis-cli "${redis_args[@]}" llen "${NAMESPACE}queue:${QUEUE}" 2>"$ERR")
+fi
+QUEUE_SIZE=${reply%% *}
+
+# Anything but a plain number means the lookup failed (no connection, NOAUTH,
+# WRONGTYPE, ...). Without this guard the numeric tests below would error out
+# on such a value and fall through to the OK branch.
+if [[ ! "$QUEUE_SIZE" =~ ^[0-9]+$ ]]; then
+  if [[ -s "$ERR" ]]; then
+    QUEUE_SIZE=$(head -n 1 "$ERR")
+  else
+    QUEUE_SIZE=${reply:-no reply from redis-cli}
+  fi
+  QUEUE_SIZE=${QUEUE_SIZE//|/ }
+  result "CRITICAL" "$STATE_CRITICAL"
+fi
+
+if [ "$QUEUE_SIZE" -ge "$CRITICAL_THRESHOLD" ]; then
+  result "CRITICAL" "$STATE_CRITICAL"
+elif [ "$QUEUE_SIZE" -ge "$WARNING_THRESHOLD" ]; then
+  result "WARNING" "$STATE_WARNING"
+else
+  result "OK" "$STATE_OK"
+fi
diff --git a/recipes/client_package.rb b/recipes/client_package.rb
index c649f4e46..a802b8f23 100644
--- a/recipes/client_package.rb
+++ b/recipes/client_package.rb
@@ -24,5 +24,5 @@
   nagios-plugins-standard
   nagios-plugins-contrib
 }.each do |pkg|
-  package pkg
+  package pkg unless pkg == 'nagios-plugins-contrib' && node.platform_version == '12.04'
 end