Permalink
executable file 7734 lines (6269 sloc) 267 KB
#!/usr/bin/perl
# This program is open source, licensed under the PostgreSQL License.
# For license terms, see the LICENSE file.
#
# Copyright (C) 2012-2018: Open PostgreSQL Monitoring Development Group
=head1 check_pgactivity
check_pgactivity - PostgreSQL plugin for Nagios
=head2 SYNOPSIS
check_pgactivity {-w|--warning THRESHOLD} {-c|--critical THRESHOLD} [-s|--service SERVICE ] [-h|--host HOST] [-U|--username ROLE] [-p|--port PORT] [-d|--dbname DATABASE] [-S|--dbservice SERVICE_NAME] [-P|--psql PATH] [--debug] [--status-file FILE] [--path PATH] [-t|--timemout TIMEOUT]
check_pgactivity [-l|--list]
check_pgactivity [--help]
=head2 DESCRIPTION
check_pgactivity is designed to monitor PostgreSQL clusters from Nagios. It
offers many options to measure and monitor useful performance metrics.
=cut
use vars qw($VERSION $PROGRAM);
use strict;
use warnings;
use 5.008;
use POSIX;
use Data::Dumper;
use File::Basename;
use File::Spec;
use File::Temp ();
use Getopt::Long qw(:config bundling no_ignore_case_always);
use List::Util qw(max);
use Pod::Usage;
use Scalar::Util qw(looks_like_number);
use Fcntl qw(:flock);
use Storable qw(lock_store lock_retrieve);
use Config;
use FindBin;
# messing with PATH so pod2usage always finds this script
my @path = split /$Config{'path_sep'}/ => $ENV{'PATH'};
push @path => $FindBin::Bin;
$ENV{'PATH'} = join $Config{'path_sep'} => @path;
undef @path;
# force the env in English
delete $ENV{'LC_ALL'};
$ENV{'LC_ALL'} = 'C';
setlocale( LC_ALL, 'C' );
delete $ENV{'LANG'};
delete $ENV{'LANGUAGE'};
$| = 1;
$VERSION = '2.4dev';
$PROGRAM = 'check_pgactivity';
my $PG_VERSION_MIN = 70400;
my $PG_VERSION_74 = 70400;
my $PG_VERSION_80 = 80000;
my $PG_VERSION_81 = 80100;
my $PG_VERSION_82 = 80200;
my $PG_VERSION_83 = 80300;
my $PG_VERSION_84 = 80400;
my $PG_VERSION_90 = 90000;
my $PG_VERSION_91 = 90100;
my $PG_VERSION_92 = 90200;
my $PG_VERSION_93 = 90300;
my $PG_VERSION_94 = 90400;
my $PG_VERSION_95 = 90500;
my $PG_VERSION_96 = 90600;
my $PG_VERSION_100 = 100000;
# reference to the output sub
my $output_fmt;
# Available services and descriptions.
#
# The referenced sub called to exec each service takes one parameter: a
# reference to the arguments hash (%args)
#
# Note that we cannot use Perl prototype for these subroutine as they are
# called indirectly (thus the args given by references).
my %services = (
# 'service_name' => {
# 'sub' => sub reference to call to run this service
# 'desc' => 'a description of the service'
# }
'autovacuum' => {
'sub' => \&check_autovacuum,
'desc' => 'Check the autovacuum activity.'
},
'backends' => {
'sub' => \&check_backends,
'desc' => 'Number of connections, compared to max_connections.'
},
'backends_status' => {
'sub' => \&check_backends_status,
'desc' => 'Number of connections in relation to their status.'
},
'commit_ratio' => {
'sub' => \&check_commit_ratio,
'desc' => 'Commit and rollback rate per second and commit ratio since last execution.'
},
'database_size' => {
'sub' => \&check_database_size,
'desc' => 'Variation of database sizes.',
},
'table_unlogged' => {
'sub' => \&check_table_unlogged,
'desc' => 'Check unlogged tables'
},
'wal_files' => {
'sub' => \&check_wal_files,
'desc' => 'Total number of WAL files.',
},
'archiver' => {
'sub' => \&check_archiver,
'desc' => 'Check the archiver status and number of wal files ready to archive.',
},
'last_vacuum' => {
'sub' => \&check_last_vacuum,
'desc' =>
'Check the oldest vacuum (from autovacuum or not) on the database.',
},
'last_analyze' => {
'sub' => \&check_last_analyze,
'desc' =>
'Check the oldest analyze (from autovacuum or not) on the database.',
},
'locks' => {
'sub' => \&check_locks,
'desc' => 'Check the number of locks on the hosts.'
},
'oldest_2pc' => {
'sub' => \&check_oldest_2pc,
'desc' => 'Check the oldest two-phase commit transaction.'
},
'oldest_idlexact' => {
'sub' => \&check_oldest_idlexact,
'desc' => 'Check the oldest idle transaction.'
},
'longest_query' => {
'sub' => \&check_longest_query,
'desc' => 'Check the longest running query.'
},
'bgwriter' => {
'sub' => \&check_bgwriter,
'desc' => 'Check the bgwriter activity.',
},
'archive_folder' => {
'sub' => \&check_archive_folder,
'desc' => 'Check archives in given folder.',
},
'minor_version' => {
'sub' => \&check_minor_version,
'desc' => 'Check if the PostgreSQL minor version is the latest one.',
},
'hot_standby_delta' => {
'sub' => \&check_hot_standby_delta,
'desc' => 'Check delta in bytes between a master and its hot standbys.',
},
'streaming_delta' => {
'sub' => \&check_streaming_delta,
'desc' => 'Check delta in bytes between a master and its standbys in streaming replication.',
},
'settings' => {
'sub' => \&check_settings,
'desc' => 'Check if the configuration file changed.',
},
'hit_ratio' => {
'sub' => \&check_hit_ratio,
'desc' => 'Check hit ratio on databases.'
},
'backup_label_age' => {
'sub' => \&check_backup_label_age,
'desc' => 'Check age of backup_label file.',
},
'connection' => {
'sub' => \&check_connection,
'desc' => 'Perform a simple connection test.'
},
'custom_query' => {
'sub' => \&check_custom_query,
'desc' => 'Perform the given user query.'
},
'configuration' => {
'sub' => \&check_configuration,
'desc' => 'Check the most important settings.',
},
'btree_bloat' => {
'sub' => \&check_btree_bloat,
'desc' => 'Check B-tree index bloat.'
},
'max_freeze_age' => {
'sub' => \&check_max_freeze_age,
'desc' => 'Check oldest database in transaction age.'
},
'invalid_indexes' => {
'sub' => \&check_invalid_indexes,
'desc' => 'Check for invalid indexes.'
},
'is_master' => {
'sub' => \&check_is_master,
'desc' => 'Check if cluster is in production.'
},
'is_hot_standby' => {
'sub' => \&check_is_hot_standby,
'desc' => 'Check if cluster is a hot standby.'
},
'pga_version' => {
'sub' => \&check_pga_version,
'desc' => 'Check the version of this check_pgactivity script.'
},
'is_replay_paused' => {
'sub' => \&check_is_replay_paused,
'desc' => 'Check if the replication is paused.'
},
'table_bloat' => {
'sub' => \&check_table_bloat,
'desc' => 'Check tables bloat.'
},
'temp_files' => {
'sub' => \&check_temp_files,
'desc' => 'Check temp files generation.'
},
'replication_slots' => {
'sub' => \&check_replication_slots,
'desc' => 'Check delta in bytes of the replication slots.'
},
'pg_dump_backup' => {
'sub' => \&check_pg_dump_backup,
'desc' => 'Check pg_dump backups age and retention policy.'
},
'stat_snapshot_age' => {
'sub' => \&check_stat_snapshot_age,
'desc' => 'Check stats collector\'s stats age.'
},
'sequences_exhausted' => {
'sub' => \&check_sequences_exhausted,
'desc' => 'Check that auto-incremented colums aren\'t reaching their upper limit.'
},
'pgdata_permission' => {
'sub' => \&check_pgdata_permission,
'desc' => 'Check that the permission on PGDATA is 700.'
},
'uptime' => {
'sub' => \&check_uptime,
'desc' => 'Time since postmaster start or configurtion reload.'
},
);
=over
=item B<-s>, B<--service> SERVICE
The Nagios service to run. See section SERVICES for a description of
available services or use C<--list> for a short service and description
list.
=item B<-h>, B<--host> HOST
Database server host or socket directory (default: "localhost").
=item B<-U>, B<--username> ROLE
Database user name (default: "postgres").
=item B<-p>, B<--port> PORT
Database server port (default: "5432").
=item B<-d>, B<--dbname> DATABASE
Database name to connect to (default: "template1").
B<WARNING>! This is not necessarily one of the database that will be
checked. See C<--dbinclude> and C<--dbexclude> .
=item B<-S>, B<--dbservice> SERVICE_NAME
The connection service name from pg_service.conf to use.
=item B<--dbexclude> REGEXP
Some services automatically check all the databases of your
cluster (note: that does not mean they always need to connect on all
of them to check them though). C<--dbexclude> excludes any
database whose name matches the given Perl regular expression.
Repeat this option as many time as needed.
See C<--dbinclude> as well. If a database match both dbexclude and
dbinclude arguments, it is excluded.
=item B<--dbinclude> REGEXP
Some services automatically check all the databases of your
cluster (note: that does not imply that they always need to connect to all
of them though). Some always exclude the 'postgres'
database and templates. C<--dbinclude> checks B<ONLY>
databases whose names match the given Perl regular expression.
Repeat this option as many time as needed.
See C<--dbexclude> as well. If a database match both dbexclude and
dbinclude arguments, it is excluded.
=item B<-w>, B<--warning> THRESHOLD
The Warning threshold.
=item B<-c>, B<--critical> THRESHOLD
The Critical threshold.
=item B<-F>, B<--format> OUTPUT_FORMAT
The output format. Supported output are: C<binary>, C<debug>, C<human>,
C<nagios> and C<nagios_strict>.
Using the C<binary> format, the results are written in a binary file (using perl
module C<Storable>) given in argument C<--output>. If no output is given,
defaults to file C<check_pgactivity.out> in the same directory as the script.
The C<nagios_strict> format is equivalent to the C<nagios> format. The only
difference is that it enforces the unit follow the strict Nagios specs: B, c, s
or %. Any unit absent from this list is dropped (Bps, Tps, etc).
=item B<--tmpdir> DIRECTORY
Path to a directory where the script can create temporary files. The
script relies on the system default temporary directory if possible.
=item B<-P>, B<--psql> FILE
Path to the C<psql> executable (default: "psql").
=item B<--status-file> PATH
Path to the file where service status information is kept between
successive calls. Default is to save check_pgactivity.data in the same
directory as the script.
=item B<--dump-status-file>
Dump the content of the status file and exit. This is useful for debugging purpose.
=item B<--dump-bin-file> [PATH]
Dump the content of the given binary file previously created using
C<--format binary>. If no path is given, defaults to file
C<check_pgactivity.out> in the same directory as the script.
=item B<-t>, B<--timeout> TIMEOUT
Timeout (default: "30s"), as raw (in seconds) or as
an interval. This timeout will be used as C<statement_timeout> for psql and URL
timeout for C<minor_version> service.
=item B<-l>, B<--list>
List available services.
=item B<-V>, B<--version>
Print version and exit.
=item B<--debug>
Print some debug messages.
=item B<-?>, B<--help>
Show this help page.
=back
=cut
my %args = (
'service' => undef,
'host' => undef,
'username' => undef,
'port' => undef,
'dbname' => undef,
'dbservice' => undef,
'warning' => undef,
'critical' => undef,
'exclude' => [],
'dbexclude' => [],
'dbinclude' => [],
'tmpdir' => File::Spec->tmpdir(),
'psql' => undef,
'path' => undef,
'status-file' => dirname(__FILE__) . '/check_pgactivity.data',
'output' => dirname(__FILE__) . '/check_pgactivity.out',
'query' => undef,
'type' => undef,
'reverse' => 0,
'work_mem' => undef,
'maintenance_work_mem' => undef,
'shared_buffers' => undef,
'wal_buffers' => undef,
'checkpoint_segments' => undef,
'effective_cache_size' => undef,
'no_check_autovacuum' => 0,
'no_check_fsync' => 0,
'no_check_enable' => 0,
'no_check_track_counts' => 0,
'ignore-wal-size' => 0,
'unarchiver' => '',
'save' => 0,
'suffix' => '',
'slave' => [],
'list' => 0,
'help' => 0,
'debug' => 0,
'timeout' => '30s',
'dump-status-file' => 0,
'dump-bin-file' => undef,
'format' => 'nagios',
'uid' => undef
);
# Set name of the program without path*
my $orig_name = $0;
$0 = $PROGRAM;
# Die on kill -1, -2, -3 or -15
$SIG{'HUP'} = $SIG{'INT'} = $SIG{'QUIT'} = $SIG{'TERM'} = \&terminate;
# Handle SIG
sub terminate() {
my ($signal) = @_;
die ("SIG $signal caught");
}
# Print the version and exit
sub version() {
printf "check_pgactivity version %s, Perl %vd\n",
$VERSION, $^V;
exit 0;
}
# List services that can be performed
sub list_services() {
print "List of available services:\n\n";
foreach my $service ( sort keys %services ) {
printf "\t%-17s\t%s\n", $service, $services{$service}{'desc'};
}
exit 0;
}
# Check wrapper around Storable::file_magic to fallback on
# Storable::read_magic under perl 5.8 and below
sub is_storable($) {
my $storage = shift;
my $head;
open my $fh, '<', $storage;
flock($fh, LOCK_SH) or die "can't get shared lock on $storage: $!";
if ( defined *Storable::file_magic{CODE}
and Storable::file_magic( $storage )
) {
close $fh; # release the shared lock
return 1;
}
read $fh, $head, 64;
close $fh;
return defined Storable::read_magic($head);
}
# Record the given ref content for the given host in a file on disk.
# The file is defined by argument "--status-file" on command line. By default:
#
# dirname(__FILE__) . '/check_pgactivity.data'
#
# Format of data in this file is:
# {
# "${host}${port}" => {
# "$name" => ref
# }
# }
# data can be retrieved later using the "load" sub.
#
# Parameters are :
# * the host structure ref that holds the "host" and "port" parameters
# * the name of the structure to save
# * the ref of the structure to save
# * the path to the file storage
sub save($$$$) {
my $host = shift;
my $name = shift;
my $ref = shift;
my $storage = shift;
my $all = {};
my $hostkey;
if (defined $host->{'dbservice'}) {
$hostkey = "$host->{'dbservice'}";
}
else {
$hostkey = "$host->{'host'}$host->{'port'}";
}
die "File «${storage}» not recognized as a check_pgactivity status file.\n\n"
."Please, check its path or move away this wrong file"
if -r $storage and not is_storable $storage;
$all = lock_retrieve($storage) if -r $storage;
$all->{$hostkey}{$name} = $ref;
lock_store( $all, $storage )
or die "Can't store data in '$storage'!\n";
}
# Load the given ref content for the given host from the file on disk.
#
# See "save" sub comments for more info.
# Parameters are :
# * the host structure ref that holds the "host" and "port" parameters
# * the name of the structure to load
# * the path to the file storage
sub load($$$) {
my $host = shift;
my $name = shift;
my $storage = shift;
my $hostkey;
my $all;
if (defined $host->{'dbservice'}) {
$hostkey = "$host->{'dbservice'}";
}
else {
$hostkey = "$host->{'host'}$host->{'port'}";
}
return undef unless -r $storage;
die "File «${storage}» not recognized as a check_pgactivity status file.\n\n"
."Please, check its path or move away this wrong file"
unless is_storable $storage;
$all = lock_retrieve($storage);
return $all->{$hostkey}{$name};
}
sub dump_status_file {
my $f = shift;
my $all;
$f = $args{'status-file'} unless defined $f;
$f = $args{'output'} unless $f ;
$all = lock_retrieve($f);
print Data::Dumper->new( [ $all ] )->Terse(1)->Dump;
exit 0;
}
# Return formatted size string with units.
# Parameter: size in bytes
sub to_size($) {
my $val = shift;
my @units = qw{B kB MB GB TB PB EB};
my $size = '';
my $mod = 0;
my $i;
return $val if $val =~ /^(-?inf)|(NaN$)/i;
$val = int($val);
for ( $i=0; $i < 6 and $val > 1024; $i++ ) {
$mod = $val%1024;
$val = int( $val/1024 );
}
$val = "$val.$mod" unless $mod == 0;
return "${val}$units[$i]";
}
# Return formatted time string with units.
# Parameter: duration in seconds
sub to_interval($) {
my $val = shift;
my $interval = '';
return $val if $val =~ /^-?inf/i;
$val = int($val);
if ( $val > 604800 ) {
$interval = int( $val / 604800 ) . "w ";
$val %= 604800;
}
if ( $val > 86400 ) {
$interval .= int( $val / 86400 ) . "d ";
$val %= 86400;
}
if ( $val > 3600 ) {
$interval .= int( $val / 3600 ) . "h";
$val %= 3600;
}
if ( $val > 60 ) {
$interval .= int( $val / 60 ) . "m";
$val %= 60;
}
$interval .= "${val}s" if $val > 0;
return "${val}s" unless $interval; # return a value if $val <= 0
return $interval;
}
=head2 THRESHOLDS
THRESHOLDS provided as warning and critical values can be raw numbers,
percentages, intervals or sizes. Each available service supports one or more
formats (eg. a size and a percentage).
=over
=item B<Percentage>
If THRESHOLD is a percentage, the value should end with a '%' (no space).
For instance: 95%.
=item B<Interval>
If THRESHOLD is an interval, the following units are accepted (not case
sensitive): s (second), m (minute), h (hour), d (day). You can use more than
one unit per given value. If not set, the last unit is in seconds.
For instance: "1h 55m 6" = "1h55m6s".
=cut
sub is_size($){
my $str_size = lc( shift() );
return 1 if $str_size =~ /^\s*[0-9]+([kmgtpez][bo]?)?\s*$/ ;
return 0;
}
sub is_time($){
my $str_time = lc( shift() );
return 1 if ( $str_time
=~ /^(\s*([0-9]\s*[smhd]?\s*))+$/
);
return 0;
}
# Return a duration in seconds from an interval (with units).
sub get_time($) {
my $str_time = lc( shift() );
my $ts = 0;
my @date;
die( "Malformed interval: «$str_time»!\n"
. "Authorized unit are: dD, hH, mM, sS\n" )
unless is_time($str_time);
# no bad units should exist after this line!
@date = split( /([smhd])/, $str_time );
LOOP_TS: while ( my $val = shift @date ) {
$val = int($val);
die("Wrong value for an interval: «$val»!") unless defined $val;
my $unit = shift(@date) || '';
if ( $unit eq 'm' ) {
$ts += $val * 60;
next LOOP_TS;
}
if ( $unit eq 'h' ) {
$ts += $val * 3600;
next LOOP_TS;
}
if ( $unit eq 'd' ) {
$ts += $val * 86400;
next LOOP_TS;
}
$ts += $val;
}
return $ts;
}
=pod
=item B<Size>
If THRESHOLD is a size, the following units are accepted (not case sensitive):
b (Byte), k (KB), m (MB), g (GB), t (TB), p (PB), e (EB) or Z (ZB). Only
integers are accepted. Eg. C<1.5MB> will be refused, use C<1500kB>.
The factor between units is 1024 bytes. Eg. C<1g = 1G = 1024*1024*1024.>
=back
=cut
# Return a size in bytes from a size with unit.
# If unit is '%', use the second parameter to compute the size in bytes.
sub get_size($;$) {
my $str_size = shift;
my $size = 0;
my $unit = '';
die "Only integers are accepted as size. Adjust the unit to your need."
if $str_size =~ /[.,]/;
$str_size =~ /^([0-9]+)(.*)$/;
$size = int($1);
$unit = lc($2);
return $size unless $unit ne '';
if ( $unit eq '%' ) {
my $ratio = shift;
die("Can not compute a ratio without the factor!")
unless defined $unit;
return int( $size * $ratio / 100 );
}
return $size if $unit eq 'b';
return $size * 1024 if $unit =~ '^k[bo]?$';
return $size * 1024**2 if $unit =~ '^m[bo]?$';
return $size * 1024**3 if $unit =~ '^g[bo]?$';
return $size * 1024**4 if $unit =~ '^t[bo]?$';
return $size * 1024**5 if $unit =~ '^p[bo]?$';
return $size * 1024**6 if $unit =~ '^e[bo]?$';
return $size * 1024**7 if $unit =~ '^z[bo]?$';
die("Unknown size unit: $unit");
}
=head2 CONNECTIONS
check_pgactivity allows two different connection specifications: by service, or
by specifying values for host, user, port, and database.
Some services can run on multiple hosts, or needs to connect to multiple hosts.
You must specify one of the parameters below if the service needs to connect
to your PostgreSQL instance. In other words, check_pgactivity will NOT look for
the C<libpq> environment variables.
The format for connection parameters is:
=over
=item B<Parameter> C<--dbservice SERVICE_NAME>
Define a new host using the given service. Multiple hosts can be defined by
listing multiple services separated by a comma. Eg.
--dbservice service1,service2
=item B<Parameters> C<--host HOST>, C<--port PORT>, C<--user ROLE> or C<--dbname DATABASE>
One parameter is enough to define a new host. Default values are used
for missing parameters.
If multiple values are given, define as many host as maximum given values.
Values are associated by position. Eg.:
--host h1,h2 --port 5432,5433
Means "host=h1 port=5432" and "host=h2 port=5433".
If the number of values is different between parameters, any host missing a
parameter will use the first given value for this parameter. Eg.:
--host h1,h2 --port 5433
Means: "host=h1 port=5433" and "host=h2 port=5433".
=item B<Services are defined first>
For instance:
--dbservice s1 --host h1 --port 5433
means: use "service=s1" and "host=h1 port=5433" in this order. If the service
supports only one host, the second host is ignored.
=item B<Mutual exclusion between both methods>
You can not overwrite services connections variables with parameters C<--host HOST>, C<--port PORT>, C<--user ROLE> or C<--dbname DATABASE>
=back
=cut
sub parse_hosts(\%) {
my %args = %{ shift() };
my @hosts = ();
if (defined $args{'dbservice'}) {
push
@hosts,
{ 'dbservice' => $_,
'name' => "service:$_",
'pgversion' => undef
}
foreach split /,/, $args{'dbservice'};
}
# Add as many hosts than necessary depending on given parameters
# host/port/db/user.
# Any missing parameter will be set to its default value.
if (defined $args{'host'}
or defined $args{'username'}
or defined $args{'port'}
or defined $args{'dbname'}
) {
$args{'host'} = $ENV{'PGHOST'} || 'localhost'
unless defined $args{'host'};
$args{'username'} = $ENV{'PGUSER'} || 'postgres'
unless defined $args{'username'};
$args{'port'} = $ENV{'PGPORT'} || '5432'
unless defined $args{'port'};
$args{'dbname'} = $ENV{'PGDATABASE'} || 'template1'
unless defined $args{'dbname'};
my @dbhosts = split( /,/, $args{'host'} );
my @dbnames = split( /,/, $args{'dbname'} );
my @dbusers = split( /,/, $args{'username'} );
my @dbports = split( /,/, $args{'port'} );
my $nbhosts = max $#dbhosts, $#dbnames, $#dbusers, $#dbports;
# Take the first value for each connection property as default.
# eg. "-h localhost -p 5432,5433" gives two hosts:
# * localhost:5432
# * localhost:5433
for ( my $i = 0; $i <= $nbhosts; $i++ ) {
push(
@hosts,
{ 'host' => $dbhosts[$i] || $dbhosts[0],
'port' => $dbports[$i] || $dbports[0],
'db' => $dbnames[$i] || $dbnames[0],
'user' => $dbusers[$i] || $dbusers[0],
'pgversion' => undef
}
);
$hosts[-1]{'name'} = sprintf('host:%s port:%d db:%s',
$hosts[-1]{'host'}, $hosts[-1]{'port'}, $hosts[-1]{'db'}
);
}
}
dprint ('Hosts: '. Dumper(\@hosts));
return \@hosts;
}
# Execute a query on a host.
# Params:
# * host
# * query
# * (optional) database
# * (optional) get_fields
# The result is an array of arrays:
# [
# [column1, ...] # line1
# ...
# ]
sub query($$;$$) {
my $host = shift;
my $query = shift;
my $db = shift;
my @res = ();
my $res = '';
my $RS = chr(30); # ASCII RS (record separator)
my $FS = chr(3); # ASCII ETX (end of text)
my $get_fields = shift;
my $tmpfile;
my $psqlcmd;
my $rc;
local $/ = undef;
delete $ENV{PGSERVICE};
delete $ENV{PGDATABASE};
delete $ENV{PGHOST};
delete $ENV{PGPORT};
delete $ENV{PGUSER};
delete $ENV{PGOPTIONS};
$ENV{PGDATABASE} = $host->{'db'} if defined $host->{'db'};
$ENV{PGSERVICE} = $host->{'dbservice'} if defined $host->{'dbservice'};
$ENV{PGHOST} = $host->{'host'} if defined $host->{'host'};
$ENV{PGPORT} = $host->{'port'} if defined $host->{'port'};
$ENV{PGUSER} = $host->{'user'} if defined $host->{'user'};
$ENV{PGOPTIONS} = '-c client_encoding=utf8 -c client_min_messages=error -c statement_timeout=' . get_time($args{'timeout'}) * 1000;
dprint ("Query: $query\n");
dprint ("Env. service: $ENV{PGSERVICE} \n") if defined $host->{'dbservice'};
dprint ("Env. host : $ENV{PGHOST} \n") if defined $host->{'host'};
dprint ("Env. port : $ENV{PGPORT} \n") if defined $host->{'port'};
dprint ("Env. user : $ENV{PGUSER} \n") if defined $host->{'user'};
dprint ("Env. db : $ENV{PGDATABASE}\n") if defined $host->{'db'};
$tmpfile = File::Temp->new(
TEMPLATE => 'check_pga-XXXXXXXX',
DIR => $args{'tmpdir'}
) or die "Could not create or write in a temp file!";
print $tmpfile "$query;" or die "Could not create or write in a temp file!";
$psqlcmd = qq{ $args{'psql'} --set "ON_ERROR_STOP=1" }
. qq{ -qXAf $tmpfile -R $RS -F $FS };
$psqlcmd .= qq{ --dbname='$db' } if defined $db;
$psqlcmd .= qq{ -t } unless defined $get_fields;
$res = qx{ $psqlcmd 2>&1 };
$rc = $?;
dprint("Query rc: $rc\n");
dprint( sprintf( " stderr (%u): «%s»\n", length $res, $res ) )
if $rc;
exit unknown('CHECK_PGACTIVITY',
[ "Query fail !\n" . $res ]
) unless $rc == 0;
if (defined $res) {
chop $res;
push @res, [ split(chr(3) => $_, -1) ]
foreach split (chr(30) => $res, -1);
pop @res if defined $get_fields and $res[-1][0] =~ m/^\(\d+ rows?\)$/;
}
dprint( "Query result: ". Dumper( \@res ) );
return \@res;
}
# Select the appropriate query among an hash of queries according to the
# backend version and execute it. Same argument order than in "query" sub.
# Hash of query must be of this form:
# {
# pg_version_num => $query1,
# ...
# }
#
# where pg_version_num is the minimum PostgreSQL version which can run the
# query. This version number is numeric. See "set_pgversion" about
# how to compute a PostgreSQL num version, or globals $PG_VERSION_*.
sub query_ver($\%;$) {
my $host = shift;
my %queries = %{ shift() };
# Shift returns undef if the db is not given. The value is then set in
# "query" sub
my $db = shift;
set_pgversion($host);
foreach my $ver ( sort { $b <=> $a } keys %queries ) {
return query( $host, $queries{$ver}, $db )
if ( $ver <= $host->{'version_num'} );
}
return undef;
}
# Return an (unsorted) array with all databases in given host but
# templates and "postgres".
sub get_all_dbname($) {
my @dbs;
push @dbs => $_->[0] foreach (
@{ query( shift, q{
SELECT datname
FROM pg_database
WHERE NOT datistemplate
AND datallowconn
AND datname <> 'postgres'
ORDER BY 1
})
}
);
return \@dbs;
}
# Query and set the version for the given host
sub set_pgversion($) {
my $host = shift;
unless ( $host->{'version'} ) {
my $rs = query( $host, q{SELECT setting FROM pg_catalog.pg_settings WHERE name IN ('server_version_num', 'server_version') ORDER BY name = 'server_version_num'} );
if ( $? != 0 ) {
dprint("FATAL: psql error, $!\n");
exit 1;
}
$host->{'version'} = $rs->[0][0];
chomp( $host->{'version'} );
if ( scalar(@$rs) > 1 ) {
# only use server_version_num for PostgreSQL 8.2+
$host->{'version_num'} = $rs->[1][0];
chomp( $host->{'version_num'} );
}
elsif ( $host->{'version'} =~ /^(\d+)\.(\d+)(.(\d+))?/ ) {
# get back to the regexp handling for PostgreSQL <8.2
$host->{'version_num'} = int($1) * 10000 + int($2) * 100;
# alpha/beta version have no minor version number
$host->{'version_num'} += int($4) if defined $4;
}
dprint(sprintf ("host %s is version %s/%s\n",
$host->{'name'},
$host->{'version'},
$host->{'version_num'})
);
return;
}
return 1;
}
# Check host compatibility, with warning
sub is_compat($$$;$) {
my $host = shift;
my $service = shift;
my $min = shift;
my $max = shift() || 9999999;;
my $ver;
set_pgversion($host);
$ver = 100*int($host->{'version_num'}/100);
unless (
$ver >= $min
and $ver <= $max
) {
warn sprintf "Service %s is not compatible with host '%s' (v%s).\n",
$service, $host->{'name'}, $host->{'version'};
return 0;
}
return 1;
}
# Check host compatibility, without warning
sub check_compat($$$;$) {
my $host = shift;
my $service = shift;
my $min = shift;
my $max = shift() || 9999999;;
my $ver;
set_pgversion($host);
$ver = 100*int($host->{'version_num'}/100);
return 0 unless (
$ver >= $min
and $ver <= $max
);
return 1;
}
sub dprint {
return unless $args{'debug'};
foreach (@_) {
print "DEBUG: $_";
}
}
sub unknown($;$$$) {
return $output_fmt->( 3, $_[0], $_[1], $_[2], $_[3] );
}
sub critical($;$$$) {
return $output_fmt->( 2, $_[0], $_[1], $_[2], $_[3] );
}
sub warning($;$$$) {
return $output_fmt->( 1, $_[0], $_[1], $_[2], $_[3] );
}
sub ok($;$$$) {
return $output_fmt->( 0, $_[0], $_[1], $_[2], $_[3] );
}
sub bin_output ($$;$$$) {
my $rc = shift;
my $service = shift;
my $all = {};
my @msg;
my @perfdata;
my @longmsg;
@msg = @{ $_[0] } if defined $_[0];
@perfdata = @{ $_[1] } if defined $_[1];
@longmsg = @{ $_[2] } if defined $_[2];
$all = lock_retrieve( $args{'output'} ) if -r $args{'output'};
$all->{ $args{'service'} } = {
'timestamp' => time,
'rc' => $rc,
'service' => $service,
'messages' => \@msg,
'perfdata' => \@perfdata,
'longmsg' => \@longmsg
};
lock_store( $all, $args{'output'} )
or die "Can't store data in '$args{'output'}'!\n";
}
sub debug_output ($$;$$$) {
my $rc = shift;
my $service = shift;
my $ret;
my @msg;
my @perfdata;
my @longmsg;
@msg = @{ $_[0] } if defined $_[0];
@perfdata = @{ $_[1] } if defined $_[1];
@longmsg = @{ $_[2] } if defined $_[2];
$ret = sprintf "%-15s: %s\n", 'Service', $service;
$ret .= sprintf "%-15s: 0 (%s)\n", "Returns", "OK" if $rc == 0;
$ret .= sprintf "%-15s: 1 (%s)\n", "Returns", "WARNING" if $rc == 1;
$ret .= sprintf "%-15s: 2 (%s)\n", "Returns", "CRITICAL" if $rc == 2;
$ret .= sprintf "%-15s: 3 (%s)\n", "Returns", "UNKNOWN" if $rc == 3;
$ret .= sprintf "%-15s: %s\n", "Message", $_ foreach @msg;
$ret .= sprintf "%-15s: %s\n", "Long message", $_ foreach @longmsg;
$ret .= sprintf "%-15s: %s\n", "Perfdata",
Data::Dumper->new([ $_ ])->Indent(0)->Terse(1)->Dump foreach @perfdata;
print $ret;
return $rc;
}
sub human_output ($$;$$$) {
my $rc = shift;
my $service = shift;
my $ret;
my @msg;
my @perfdata;
my @longmsg;
@msg = @{ $_[0] } if defined $_[0];
@perfdata = @{ $_[1] } if defined $_[1];
@longmsg = @{ $_[2] } if defined $_[2];
$ret = sprintf "%-15s: %s\n", 'Service', $service;
$ret .= sprintf "%-15s: 0 (%s)\n", "Returns", "OK" if $rc == 0;
$ret .= sprintf "%-15s: 1 (%s)\n", "Returns", "WARNING" if $rc == 1;
$ret .= sprintf "%-15s: 2 (%s)\n", "Returns", "CRITICAL" if $rc == 2;
$ret .= sprintf "%-15s: 3 (%s)\n", "Returns", "UNKNOWN" if $rc == 3;
$ret .= sprintf "%-15s: %s\n", "Message", $_ foreach @msg;
$ret .= sprintf "%-15s: %s\n", "Long message", $_ foreach @longmsg;
foreach my $perfdata ( @perfdata ) {
map {$_ = undef unless defined $_} @{$perfdata}[2..6];
if ( defined $$perfdata[2] and $$perfdata[2] =~ /B$/ ) {
$ret .= sprintf "%-15s: %s=%s", "Perfdata",
$$perfdata[0], to_size($$perfdata[1]);
$ret .= sprintf " warn=%s", to_size( $$perfdata[3] ) if defined $$perfdata[3];
$ret .= sprintf " crit=%s", to_size( $$perfdata[4] ) if defined $$perfdata[4];
$ret .= sprintf " min=%s", to_size( $$perfdata[5] ) if defined $$perfdata[5];
$ret .= sprintf " max=%s", to_size( $$perfdata[6] ) if defined $$perfdata[6];
$ret .= "\n";
}
elsif ( defined $$perfdata[2] and $$perfdata[2] =~ /\ds$/ ) {
$ret .= sprintf "%-15s: %s=%s", "Perfdata",
$$perfdata[0], to_interval( $$perfdata[1] );
$ret .= sprintf " warn=%s", to_interval( $$perfdata[3] ) if defined $$perfdata[3];
$ret .= sprintf " crit=%s", to_interval( $$perfdata[4] ) if defined $$perfdata[4];
$ret .= sprintf " min=%s", to_interval( $$perfdata[5] ) if defined $$perfdata[5];
$ret .= sprintf " max=%s", to_interval( $$perfdata[6] ) if defined $$perfdata[6];
$ret .= "\n";
}
else {
$ret .= sprintf "%-15s: %s=%s", "Perfdata",
$$perfdata[0], $$perfdata[1];
$ret .= sprintf "%s", $$perfdata[2] if defined $$perfdata[2];
$ret .= sprintf " warn=%s", $$perfdata[3] if defined $$perfdata[3];
$ret .= sprintf " crit=%s", $$perfdata[4] if defined $$perfdata[4];
$ret .= sprintf " min=%s", $$perfdata[5] if defined $$perfdata[5];
$ret .= sprintf " max=%s", $$perfdata[6] if defined $$perfdata[6];
$ret .= "\n";
}
}
print $ret;
return $rc;
}
sub nagios_output ($$;$$$) {
my $rc = shift;
my $ret = shift;
my @msg;
my @perfdata;
my @longmsg;
$ret .= " OK" if $rc == 0;
$ret .= " WARNING" if $rc == 1;
$ret .= " CRITICAL" if $rc == 2;
$ret .= " UNKNOWN" if $rc == 3;
@msg = @{ $_[0] } if defined $_[0];
@perfdata = @{ $_[1] } if defined $_[1];
@longmsg = @{ $_[2] } if defined $_[2];
$ret .= ": ". join( ', ', @msg ) if @msg;
if ( scalar @perfdata ) {
$ret .= " |";
foreach my $perf ( @perfdata ) {
# escape quotes
$$perf[0] =~ s/'/''/g;
# surounding quotes if space in the label
$$perf[0] = "'$$perf[0]'" if $$perf[0] =~ /\s/;
# the perfdata itself and its unit
$ret .= " $$perf[0]=$$perf[1]";
# init and join optional values (unit/warn/crit/min/max)
map {$_ = "" unless defined $_} @{$perf}[2..6];
$ret .= join ';' => @$perf[2..6];
# remove useless semi-colons at end
$ret =~ s/;*$//;
}
}
$ret .= "\n". join( ' ', @longmsg ) if @longmsg;
print $ret;
return $rc;
}
sub nagios_strict_output ($$;$$$) {
my $rc = shift;
my $ret = shift;
my @msg;
my @perfdata;
my @longmsg;
@msg = @{ $_[0] } if defined $_[0];
@perfdata = @{ $_[1] } if defined $_[1];
@longmsg = @{ $_[2] } if defined $_[2];
map { $$_[2] = '' if exists $$_[2] and defined $$_[2]
and $$_[2] !~ /\A[Bcs%]\z/
} @perfdata;
return nagios_output( $rc, $ret, \@msg, \@perfdata, \@longmsg );
}
=head2 SERVICES
Descriptions and parameters of available services.
=over
=item B<archive_folder>
Check if all archived WALs exist between the oldest and the latest WAL in the
archive folder and make sure they are 16MB. The given folder must have archived
files from ONE cluster. The version of PostgreSQL that created the archives is
only checked on the last one, for performance consideration.
This service requires the argument C<--path> on the command line to specify the
archive folder path to check. Obviously, it must have access to this
folder at the filesystem level: you may have to execute it on the archiving
server rather than on the PostgreSQL instance.
The optional argument C<--suffix> defines the suffix of your archived
WALs; this is useful for compressed WALs (eg. .gz, .bz2, ...).
Default is no suffix.
This service needs to read the header of one of the archives to define how many
segments a WAL owns. Check_pgactivity automatically handles files with
extensions .gz, .bz2, .xz, .zip or .7z using the following commands:
gzip -dc
bzip2 -dc
xz -dc
unzip -qqp
7z x -so
If needed, provide your own command that writes the uncompressed file
to standard output with the C<--unarchiver> argument.
Optional argument C<--ignore-wal-size> skips the WAL size check. This is useful
if your archived WALs are compressed and check_pgactivity is unable to guess the
original size. Here are the commands check_pgactivity uses to guess the original
size of .gz, .xz or .zip files:
gzip -ql
xz -ql
unzip -qql
Default behaviour is to check the WALs size.
Perfdata contains the number of archived WALs and the age of the most recent
one.
Critical and Warning define the max age of the latest archived WAL as an
interval (eg. 5m or 300s ).
Required privileges: unprivileged role; the system user needs read access
to archived WAL files.
Sample commands:
check_pgactivity -s archive_folder --path /path/to/archives -w 15m -c 30m
check_pgactivity -s archive_folder --path /path/to/archives --suffix .gz -w 15m -c 30m
check_pgactivity -s archive_folder --path /path/to/archives --ignore-wal-size --suffix .bz2 -w 15m -c 30m
check_pgactivity -s archive_folder --path /path/to/archives --unarchiver "unrar p" --ignore-wal-size --suffix .rar -w 15m -c 30m
=cut
sub check_archive_folder {
my @msg;
my @longmsg;
my @msg_crit;
my @msg_warn;
my @perfdata;
my @history_files;
my @filelist;
my @filelist_sorted;
my @branch_wals;
my $w_limit;
my $c_limit;
my $timeline;
my $start_tl;
my $end_tl;
my $wal;
my $seg;
my $latest_wal_age;
my $dh;
my $fh;
my $wal_version;
my $filename_re;
my $history_re;
my $suffix = $args{'suffix'};
my $check_size = not $args{'ignore-wal-size'};
my $me = 'POSTGRES_ARCHIVES';
my $seg_per_wal = 255; # increased later for pg > 9.2
my %args = %{ $_[0] };
my %unarchive_cmd = (
'.gz' => "gzip -dc",
'.bz2' => "bzip2 -dc",
'.xz' => "xz -dc",
'.zip' => "unzip -qqp",
'.7z' => "7z x -so"
);
my %wal_versions = (
'80' => 53340,
'81' => 53341,
'82' => 53342,
'83' => 53346,
'84' => 53347,
'90' => 53348,
'91' => 53350,
'92' => 53361,
'93' => 53365,
'94' => 53374,
'95' => 53383,
'96' => 53395,
'100' => 53399
);
# "path" argument must be given
pod2usage(
-message => 'FATAL: you must specify the archive folder using "--path <dir>".',
-exitval => 127
) unless defined $args{'path'};
# invalid "path" argument
pod2usage(
-message => "FATAL: \"$args{'path'}\" is not a valid folder.",
-exitval => 127
) unless -d $args{'path'};
# warning and critical are mandatory.
pod2usage(
-message => "FATAL: you must specify critical and warning thresholds.",
-exitval => 127
) unless defined $args{'warning'} and defined $args{'critical'} ;
pod2usage(
-message => "FATAL: critical and warning thresholds only acccepts interval.",
-exitval => 127
) unless ( is_time( $args{'warning'} ) and is_time( $args{'critical'} ) );
opendir( $dh, $args{'path'} )
or die "Cannot opendir $args{'path'} : $!\n";
$filename_re = qr/^[0-9A-F]{24}$suffix$/;
@filelist = map { [ $_ => (stat("$args{'path'}/$_"))[9,7] ] }
grep( /$filename_re/, readdir($dh) );
seekdir( $dh, 0 );
$history_re = qr/^[0-9A-F]{8}.history$suffix$/;
@history_files = grep /$history_re/, readdir($dh) ;
closedir($dh);
return unknown $me, ['No archived WAL found.'] unless @filelist;
$w_limit = get_time($args{'warning'});
$c_limit = get_time($args{'critical'});
# Sort by mtime
@filelist_sorted = sort { ($a->[1] <=> $b->[1]) || ($a->[0] cmp $b->[0]) }
grep{ (defined($_->[0]) and defined($_->[1]))
or die "Cannot read WAL files"
} @filelist;
$latest_wal_age = time() - $filelist_sorted[-1][1];
# Read the XLOG_PAGE_MAGIC header to guess $seg_per_wal
if ( $args{'unarchiver'} eq '' and $suffix =~ /^.(?:gz|bz2|zip|xz|7z)$/ ) {
open $fh, "-|",
qq{ $unarchive_cmd{$suffix} "$args{'path'}/$filelist_sorted[-1][0]" 2>/dev/null }
or
die "could not read first WAL using '$unarchive_cmd{$suffix}': $!";
}
elsif ( $args{'unarchiver'} ne '' ) {
open $fh, "-|",
qq{ $args{'unarchiver'} "$args{'path'}/$filelist_sorted[-1][0]" 2>/dev/null }
or die "could not read first WAL using '$args{'unarchiver'}': $!";
}
else {
# Fallback on raw parsing of first WAL
open $fh, "<", "$args{'path'}/$filelist_sorted[-1][0]"
or die ("Could not read first WAL: $!\n");
}
read( $fh, $wal_version, 2 );
close $fh;
$wal_version = unpack('S', $wal_version);
die ("Could not parse XLOG_PAGE_MAGIC") unless defined $wal_version;
dprint ("wal version: $wal_version");
die "Unknown WAL XLOG_PAGE_MAGIC $wal_version!"
unless grep /^$wal_version$/ => values %wal_versions;
# FIXME: As there is no consensus about XLOG_PAGE_MAGIC algo across PostgreSQL
# versions this piece of code should be checked for compatibility for each new
# PostgreSQL version to confirm the new XLOG_PAGE_MAGIC is still greater
# than the previous one (or at least the 9.2 one).
$seg_per_wal++ if $wal_version >= $wal_versions{'93'};
push @perfdata, [
'latest_archive_age', $latest_wal_age, 's', $w_limit, $c_limit
];
push @perfdata, [ 'num_archives', scalar(@filelist_sorted) ];
dprint ("first wal: $filelist_sorted[0][0]");
dprint ("last wal: $filelist_sorted[-1][0]");
$start_tl = substr($filelist_sorted[0][0], 0, 8);
$end_tl = substr($filelist_sorted[-1][0], 0, 8);
$timeline = $start_tl + 0;
$wal = hex(substr($filelist_sorted[0][0], 8, 8));
$seg = hex(substr($filelist_sorted[0][0], 16, 8));
# look for history files if timeline differs
if ( $start_tl ne $end_tl ) {
if ( -s "$args{'path'}/$end_tl.history" ) {
open my $fd, "<", "$args{'path'}/$end_tl.history";
while ( <$fd> ) {
next unless m{^\s*(\d)\t([0-9A-F]+)/([0-9A-F]+)\t.*$};
push @branch_wals =>
sprintf("%08d%08s%08X", $1, $2, hex($3)>>24);
}
close $fd;
}
}
# Check ALL archives are here.
for ( my $i=0, my $j=0; $i <= $#filelist_sorted ; $i++, $j++ ) {
dprint ("Checking WAL $filelist_sorted[$i][0]\n");
my $curr = sprintf('%08d%08X%08X%s',
$timeline,
$wal + int(($seg + $j)/$seg_per_wal),
($seg + $j)%$seg_per_wal,
$suffix
);
if ( $curr ne $filelist_sorted[$i][0] ) {
push @msg => "Wrong sequence or file missing @ '$curr'";
last;
}
if ( $check_size ) {
if ( $suffix eq '.gz' ) {
my $ans = qx{ gzip -ql "$args{'path'}/$curr" 2>/dev/null };
$filelist_sorted[$i][2] = 16777216
if $ans =~ /^\s*\d+\s+16777216\s/;
}
elsif ( $suffix eq '.xz' ) {
my @ans = qx{ xz -ql --robot "$args{'path'}/$curr" 2>/dev/null };
$filelist_sorted[$i][2] = 16777216
if $ans[-1] =~ /\w+\s+\d+\s+\d+\s+16777216\s+/;
}
elsif ( $suffix eq '.zip' ) {
my $ans;
$ans = qx{ unzip -qql "$args{'path'}/$curr" 2>/dev/null };
$filelist_sorted[$i][2] = 16777216
if $ans =~ /^\s*16777216/;
}
if ( $filelist_sorted[$i][2] != 16777216 ) {
push @msg => "'$curr' is not 16MB";
last;
}
}
if ( grep /$curr/, @branch_wals ) {
dprint( "Found a boundary @ $curr !\n" );
$timeline++;
$j--;
}
}
return critical( $me, \@msg, \@perfdata ) if @msg;
push @msg => scalar(@filelist_sorted)." WAL archived in '$args{'path'}', "
."latest archived since ". to_interval($latest_wal_age);
return critical( $me, \@msg, \@perfdata, \@longmsg )
if $latest_wal_age >= $c_limit;
return warning( $me, \@msg, \@perfdata, \@longmsg )
if $latest_wal_age >= $w_limit;
return ok( $me, \@msg, \@perfdata, \@longmsg );
}
=item B<archiver> (8.1+)
Check if the archiver is working properly and the number of WAL files ready to
archive.
Perfdata returns the number of WAL files waiting to be archived.
Critical and Warning thresholds are optional. They apply on the number of files
waiting to be archived. They only accept a raw number of files.
Whatever the given threshold, a critical alert is raised if the archiver process
did not archive the oldest waiting WAL to be archived since last call.
Required privileges: unprivileged role (10+); superuser (<10).
=cut
sub check_archiver {
my @rs;
my @perfdata;
my @msg;
my @longmsg;
my @hosts;
my $prev_archiving;
my $nb_files;
my %args = %{ $_[0] };
my $me = 'POSTGRES_ARCHIVER';
# warning and critical must be raw
pod2usage(
-message => "FATAL: critical and warning thresholds only accept raw numbers.",
-exitval => 127
) if defined $args{'critical'} and $args{'warning'} !~ m/^([0-9]+)$/
and $args{'critical'} !~ m/^([0-9]+)$/;
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "archiver".',
-exitval => 127
) if @hosts != 1;
is_compat $hosts[0], 'archiver', $PG_VERSION_81 or exit 1;
if (check_compat $hosts[0], 'archiver', $PG_VERSION_81, $PG_VERSION_96) {
# cf. pgarch_readyXlog in src/backend/postmaster/pgarch.c about how the
# archiver process pick the next WAL to archive.
# We try to reproduce the same algo here
my $query = q{
SELECT s.f,
extract(epoch from (pg_stat_file('pg_xlog/archive_status/'||s.f)).modification),
extract(epoch from current_timestamp)
FROM pg_ls_dir('pg_xlog/archive_status') AS s(f)
WHERE f ~ '^[0123456789ABCDEF.history.backup.partial]{16,40}\.ready$'
ORDER BY s.f ASC};
$prev_archiving = load( $hosts[0], 'archiver', $args{'status-file'} ) || '';
@rs = @{ query( $hosts[0], $query ) };
$nb_files = scalar @rs;
push @perfdata => [ 'ready_archive', $nb_files, undef, $args{'warning'}, $args{'critical'}, 0 ];
if ( $nb_files > 0 ) {
push @perfdata => [ 'oldest_ready_wal', int( $rs[0][2] - $rs[0][1] ), 's',
undef, undef, 0 ];
if ( $rs[0][0] ne $prev_archiving ) {
save $hosts[0], 'archiver', $rs[0][0], $args{'status-file'};
}
else {
push @msg => sprintf 'archiver stalling', substr($rs[0][0], 0, 24);
push @longmsg => sprintf '"%s" not archived since last check',
substr($rs[0][0], 0, -6);
}
}
else {
push @perfdata => [ 'oldest_ready_wal', 0, 's', undef, undef, 0 ];
save $hosts[0], 'archiver', '', $args{'status-file'};
}
push @msg => "$nb_files WAL files ready to archive";
}
else {
# Version 10.0 and higher: use pg_stat_archiver as the monitoring
# user may not be super-user.
my $query = q{
SELECT COALESCE( (('x' || substr(current_wal, 9, 8))::bit(32)::int *256 + ('x' || substr(current_wal, 17, 8))::bit(32)::int)
- (('x' || substr(last_failed_wal, 9, 8))::bit(32)::int * 256 + ('x' || substr(last_failed_wal, 17, 8))::bit(32)::int),
0 /* return 0 if NULL */) AS ready_archives,
extract('epoch' from (current_timestamp - last_failed_time)) AS oldest_archive_failure
FROM (SELECT CASE WHEN (last_failed_time >= last_archived_time)
OR (last_archived_time IS NULL AND last_failed_time IS NOT NULL)
THEN last_failed_wal
ELSE NULL
END AS last_failed_wal,
CASE WHEN pg_is_in_recovery() THEN pg_walfile_name(pg_last_wal_receive_lsn())
ELSE pg_walfile_name(pg_current_wal_lsn())
END as current_wal,
last_failed_time
FROM pg_stat_archiver) stats
};
@rs = @{ query( $hosts[0], $query ) };
$nb_files = $rs[0][0];
push @perfdata => [ 'ready_archive', $nb_files, undef, $args{'warning'}, $args{'critical'}, 0 ];
if ( $nb_files > 0 ) {
push @perfdata => [ 'oldest_ready_wal', int( $rs[0][1] ), 's',
undef, undef, 0 ];
}
else {
push @perfdata => [ 'oldest_ready_wal', 0, 's', undef, undef, 0 ];
}
push @msg => "$nb_files WAL files ready to archive";
}
return critical( $me, \@msg, \@perfdata, \@longmsg ) if scalar @msg > 1;
if ( defined $args{'critical'} and $nb_files >= $args{'critical'} ) {
return critical( $me, \@msg, \@perfdata );
}
elsif ( defined $args{'warning'} and $nb_files >= $args{'warning'} ) {
return warning( $me, \@msg, \@perfdata );
}
return ok( $me, \@msg, \@perfdata );
}
=item B<autovacuum> (8.1+)
Check the autovacuum activity on the cluster.
Perfdata contains the age of oldest running autovacuum and the number of workers
by type (VACUUM, VACUUM ANALYZE, ANALYZE, VACUUM FREEZE).
Thresholds, if any, are ignored.
Required privileges: unprivileged role.
=cut
sub check_autovacuum {
my @rs;
my @perfdata;
my @msg;
my @longmsg;
my @hosts;
my %args = %{ $_[0] };
my $me = 'POSTGRES_AUTOVACUUM';
my $oldest = undef;
my $numautovac = 0;
my $max_workers = "NaN";
my %activity = (
'VACUUM' => 0,
'VACUUM_ANALYZE' => 0,
'ANALYZE' => 0,
'VACUUM_FREEZE' => 0,
'BRIN_SUMMARIZE' => 0
);
my %queries = (
# field current_query, not autovacuum_max_workers
$PG_VERSION_81 => q{
SELECT current_query,
extract(EPOCH FROM now()-query_start)::bigint,
'NaN'
FROM pg_stat_activity
WHERE current_query LIKE 'autovacuum: %'
ORDER BY query_start ASC
},
# field current_query, autovacuum_max_workers
$PG_VERSION_83 => q{
SELECT a.current_query,
extract(EPOCH FROM now()-a.query_start)::bigint,
s.setting
FROM
(SELECT current_setting('autovacuum_max_workers') AS setting) AS s
LEFT JOIN (
SELECT * FROM pg_stat_activity
WHERE current_query LIKE 'autovacuum: %'
) AS a ON true
ORDER BY query_start ASC
},
# field query, still autovacuum_max_workers
$PG_VERSION_92 => q{
SELECT a.query,
extract(EPOCH FROM now()-a.query_start)::bigint,
s.setting
FROM
(SELECT current_setting('autovacuum_max_workers') AS setting) AS s
LEFT JOIN (
SELECT * FROM pg_stat_activity
WHERE query LIKE 'autovacuum: %'
) AS a ON true
ORDER BY a.query_start ASC
}
);
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "autovacuum".',
-exitval => 127
) if @hosts != 1;
is_compat $hosts[0], 'autovacuum', $PG_VERSION_81 or exit 1;
if (check_compat $hosts[0], 'archiver', $PG_VERSION_81, $PG_VERSION_96) {
delete $activity{BRIN_SUMMARIZE};
}
@rs = @{ query_ver( $hosts[0], %queries ) };
REC_LOOP: foreach my $r (@rs) {
if ( not defined $oldest ){
$max_workers = $r->[2];
next REC_LOOP if ( $r->[1] eq "" );
$oldest = $r->[1];
}
$numautovac++;
if ( $r->[0] =~ '\(to prevent wraparound\)$' ) {
$activity{'VACUUM_FREEZE'}++;
} else {
if ( $r->[0] =~ '^autovacuum: VACUUM ANALYZE' ) {
$activity{'VACUUM_ANALYZE'}++;
} elsif ( $r->[0] =~ 'autovacuum: VACUUM' ) {
$activity{'VACUUM'}++;
} elsif ( $r->[0] =~ 'autovacuum: BRIN summarize' ) {
$activity{'BRIN_SUMMARIZE'}++;
} else {
$activity{'ANALYZE'}++;
};
}
$r->[0] =~ s/autovacuum: //;
push @longmsg, $r->[0];
}
$oldest = 'NaN' if not defined ( $oldest );
@perfdata = map { [ $_, $activity{$_} ] } keys %activity;
push @perfdata, [ 'oldest_autovacuum', $oldest, 's' ];
push @perfdata, [ 'max_workers', $max_workers ];
push @msg, "Number of autovacuum: $numautovac";
push @msg, "Oldest autovacuum: " . to_interval($oldest) if $oldest ne "NaN";
return ok( $me, \@msg , \@perfdata, \@longmsg );
}
=item B<backends> (all)
Check the total number of connections in the PostgreSQL cluster.
Perfdata contains the number of connections per database.
Critical and Warning thresholds accept either a raw number or a percentage (eg.
80%). When a threshold is a percentage, it is compared to the difference
between the cluster parameters C<max_connections> and
C<superuser_reserved_connections>.
Required privileges: an unprivileged user only sees its own queries;
a pg_monitor (10+) or superuser (<10) role is required to see all queries.
=cut
sub check_backends {
my @rs;
my @perfdata;
my @msg;
my @hosts;
my %args = %{ $_[0] };
my $me = 'POSTGRES_BACKENDS';
my $num_backends = 0;
my %queries = (
$PG_VERSION_MIN => q{
SELECT s.datname, s.numbackends,
current_setting('max_connections')::int
- current_setting('superuser_reserved_connections')::int
FROM pg_catalog.pg_stat_database AS s
JOIN pg_catalog.pg_database d ON d.oid = s.datid
WHERE d.datallowconn },
# Remove autovacuum connections (autovac introduced in 8.1, but exposed
# in pg_stat_activity since 8.2)
$PG_VERSION_82 => q{
SELECT d.datname, count(*),
current_setting('max_connections')::int
- current_setting('superuser_reserved_connections')::int
FROM pg_catalog.pg_stat_activity AS s
JOIN pg_catalog.pg_database AS d ON d.oid = s.datid
WHERE current_query NOT LIKE 'autovacuum: %'
GROUP BY d.datname },
# Add replication connections 9.1
$PG_VERSION_91 => q{
SELECT s.*, current_setting('max_connections')::int
- current_setting('superuser_reserved_connections')::int
FROM (
SELECT d.datname, count(*)
FROM pg_catalog.pg_stat_activity AS s
JOIN pg_catalog.pg_database AS d ON d.oid = s.datid
WHERE current_query NOT LIKE 'autovacuum: %'
GROUP BY d.datname
UNION ALL
SELECT 'replication', count(*)
FROM pg_catalog.pg_stat_replication
) AS s },
# Rename current_query => query
$PG_VERSION_92 => q{
SELECT s.*, current_setting('max_connections')::int
- current_setting('superuser_reserved_connections')::int
FROM (
SELECT d.datname, count(*)
FROM pg_catalog.pg_stat_activity AS s
JOIN pg_catalog.pg_database AS d ON d.oid = s.datid
WHERE query NOT LIKE 'autovacuum: %'
GROUP BY d.datname
UNION ALL
SELECT 'replication', count(*)
FROM pg_catalog.pg_stat_replication
) AS s },
# Only account client backends
$PG_VERSION_100 => q{
SELECT s.*, current_setting('max_connections')::int
- current_setting('superuser_reserved_connections')::int
FROM (
SELECT d.datname, count(*)
FROM pg_catalog.pg_stat_activity AS s
JOIN pg_catalog.pg_database AS d ON d.oid = s.datid
WHERE backend_type IN ('client backend', 'background worker')
GROUP BY d.datname
UNION ALL
SELECT 'replication', count(*)
FROM pg_catalog.pg_stat_replication
) AS s }
);
# Warning and critical are mandatory.
pod2usage(
-message => "FATAL: you must specify critical and warning thresholds.",
-exitval => 127
) unless defined $args{'warning'} and defined $args{'critical'} ;
# Warning and critical must be raw or %.
pod2usage(
-message => "FATAL: critical and warning thresholds only accept raw numbers or %.",
-exitval => 127
) unless $args{'warning'} =~ m/^([0-9.]+)%?$/
and $args{'critical'} =~ m/^([0-9.]+)%?$/;
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "backends".',
-exitval => 127
) if @hosts != 1;
@rs = @{ query_ver( $hosts[0], %queries ) };
$args{'critical'} = int( $rs[0][2] * $1 / 100 )
if $args{'critical'} =~ /^([0-9.]+)%$/;
$args{'warning'} = int( $rs[0][2] * $1 / 100 )
if $args{'warning'} =~ /^([0-9.]+)%$/;
LOOP_DB: foreach my $db (@rs) {
$num_backends += $db->[1];
push @perfdata, [
$db->[0], $db->[1], '', $args{'warning'}, $args{'critical'}, 0, $db->[2]
];
}
push @perfdata, [
'maximum_connections', $rs[0][2], undef, undef, undef, 0, $rs[0][2]
];
push @msg => "$num_backends connections on $rs[0][2]";
return critical( $me, \@msg, \@perfdata )
if $num_backends >= $args{'critical'};
return warning( $me, \@msg, \@perfdata )
if $num_backends >= $args{'warning'};
return ok( $me, \@msg, \@perfdata );
}
=item B<backends_status> (8.2+)
Check the status of all backends. Depending on your PostgreSQL version,
statuses are: C<idle>, C<idle in transaction>, C<idle in transaction (aborted)>
(>=9.0 only), C<fastpath function call>, C<active>, C<waiting for lock>,
C<undefined>, C<disabled> and C<insufficient privilege>.
B<insufficient privilege> appears when you are not allowed to see the statuses
of other connections.
This service supports the argument C<--exclude REGEX> to exclude queries
matching the given regular expression.
You can use multiple C<--exclude REGEX> arguments.
Critical and Warning thresholds are optional. They accept a list of
'status_label=value' separated by a comma. Available labels are C<idle>,
C<idle_xact>, C<aborted_xact>, C<fastpath>, C<active> and C<waiting>. Values
are raw numbers or time units and empty lists are forbidden. Here is an example:
-w 'waiting=5,idle_xact=10' -c 'waiting=20,idle_xact=30,active=1d'
Perfdata contains the number of backends for each status and the oldest one for
each of them, for 8.2+.
Note that the number of backends reported in Nagios message B<includes>
excluded backends.
Required privileges: an unprivileged user only sees its own queries;
a pg_monitor (10+) or superuser (<10) role is required to see all queries.
=cut
sub check_backends_status {
my @rs;
my @hosts;
my @perfdata;
my @msg_warn;
my @msg_crit;
my %warn;
my %crit;
my $max_connections;
my $num_backends = 0;
my $me = 'POSTGRES_BACKENDS_STATUS';
my %status = (
'idle' => [0, 0],
'idle in transaction' => [0, 0],
'idle in transaction (aborted)' => [0, 0],
'fastpath function call' => [0, 0],
'waiting for lock' => [0, 0],
'active' => [0, 0],
'disabled' => [0, 0],
'undefined' => [0, 0],
'insufficient privilege' => [0, 0],
'other wait event' => [0, 0]
);
my %translate = (
'idle' => 'idle',
'idle_xact' => 'idle in transaction',
'aborted_xact' => 'idle in transaction (aborted)',
'fastpath' => 'fastpath function call',
'waiting' => 'waiting for lock',
'active' => 'active'
);
my %queries = (
# Doesn't support "idle in transaction (aborted)" and xact age
$PG_VERSION_82 => q{
SELECT CASE
WHEN s.current_query = '<IDLE>'
THEN 'idle'
WHEN s.current_query = '<IDLE> in transaction'
THEN 'idle in transaction'
WHEN s.current_query = '<FASTPATH> function call'
THEN 'fastpath function call'
WHEN s.current_query = '<command string not enabled>'
THEN 'disabled'
WHEN s.current_query = '<backend information not available>'
THEN 'undefined'
WHEN s.current_query = '<insufficient privilege>'
THEN 'insufficient privilege'
WHEN s.waiting = 't'
THEN 'waiting for lock'
ELSE 'active'
END AS status,
NULL, current_setting('max_connections'), s.current_query
FROM pg_stat_activity AS s
JOIN pg_database d ON d.oid=s.datid
WHERE d.datallowconn
},
# Doesn't support "idle in transaction (aborted)"
$PG_VERSION_83 => q{
SELECT CASE
WHEN s.current_query = '<IDLE>'
THEN 'idle'
WHEN s.current_query = '<IDLE> in transaction'
THEN 'idle in transaction'
WHEN s.current_query = '<FASTPATH> function call'
THEN 'fastpath function call'
WHEN s.current_query = '<command string not enabled>'
THEN 'disabled'
WHEN s.current_query = '<backend information not available>'
THEN 'undefined'
WHEN s.current_query = '<insufficient privilege>'
THEN 'insufficient privilege'
WHEN s.waiting = 't'
THEN 'waiting for lock'
ELSE 'active'
END AS status,
extract('epoch' FROM
date_trunc('milliseconds', current_timestamp-s.xact_start)
),
current_setting('max_connections'), s.current_query
FROM pg_stat_activity AS s
JOIN pg_database d ON d.oid=s.datid
WHERE d.datallowconn
},
# Supports everything
$PG_VERSION_90 => q{
SELECT CASE
WHEN s.current_query = '<IDLE>'
THEN 'idle'
WHEN s.current_query = '<IDLE> in transaction'
THEN 'idle in transaction'
WHEN s.current_query = '<IDLE> in transaction (aborted)'
THEN 'idle in transaction (aborted)'
WHEN s.current_query = '<FASTPATH> function call'
THEN 'fastpath function call'
WHEN s.current_query = '<command string not enabled>'
THEN 'disabled'
WHEN s.current_query = '<backend information not available>'
THEN 'undefined'
WHEN s.current_query = '<insufficient privilege>'
THEN 'insufficient privilege'
WHEN s.waiting = 't'
THEN 'waiting for lock'
ELSE 'active'
END,
extract('epoch' FROM
date_trunc('milliseconds', current_timestamp-s.xact_start)
),
current_setting('max_connections'), s.current_query
FROM pg_stat_activity AS s
JOIN pg_database d ON d.oid=s.datid
WHERE d.datallowconn
},
# pg_stat_activity schema change
$PG_VERSION_92 => q{
SELECT CASE
WHEN s.waiting = 't' THEN 'waiting for lock'
WHEN s.query = '<insufficient privilege>'
THEN 'insufficient privilege'
WHEN s.state IS NULL THEN 'undefined'
ELSE s.state
END,
extract('epoch' FROM
date_trunc('milliseconds', current_timestamp-s.xact_start)
), current_setting('max_connections'), s.query
FROM pg_stat_activity AS s
JOIN pg_database d ON d.oid=s.datid
WHERE d.datallowconn
},
# pg_stat_activity schema change for wait events
$PG_VERSION_96 => q{
SELECT CASE
WHEN s.wait_event_type = 'Lock' THEN 'waiting for lock'
WHEN s.wait_event_type IS NOT NULL THEN 'other wait event'
WHEN s.query = '<insufficient privilege>'
THEN 'insufficient privilege'
WHEN s.state IS NULL THEN 'undefined'
ELSE s.state
END,
extract('epoch' FROM
date_trunc('milliseconds', current_timestamp-s.xact_start)
), current_setting('max_connections'), s.query
FROM pg_stat_activity AS s
JOIN pg_database d ON d.oid=s.datid
WHERE d.datallowconn
},
# pg_stat_activity now displays background processes
$PG_VERSION_100 => q{
SELECT CASE
WHEN s.wait_event_type = 'Lock' THEN 'waiting for lock'
WHEN s.query = '<insufficient privilege>'
THEN 'insufficient privilege'
WHEN s.state IS NULL THEN 'undefined'
WHEN s.wait_event_type IS NOT NULL
AND s.wait_event_type NOT IN ('Client', 'Activity')
THEN 'other wait event'
ELSE s.state
END,
extract('epoch' FROM
date_trunc('milliseconds', current_timestamp-s.xact_start)
), current_setting('max_connections'), s.query
FROM pg_stat_activity AS s
JOIN pg_database d ON d.oid=s.datid
WHERE d.datallowconn
AND backend_type IN ('client backend', 'background worker')
}
);
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "backends_status".',
-exitval => 127
) if @hosts != 1;
is_compat $hosts[0], 'backends_status', $PG_VERSION_82 or exit 1;
if ( defined $args{'warning'} ) {
my $threshods_re
= qr/(idle|idle_xact|aborted_xact|fastpath|active|waiting)\s*=\s*(\d+\s*[smhd]?)/i;
# Warning and critical must be raw
pod2usage(
-message => "FATAL: critical and warning thresholds only accept a list of 'label=value' separated by comma.\n"
. "See documentation for more information.",
-exitval => 127
) unless $args{'warning'} =~ m/^$threshods_re(\s*,\s*$threshods_re)*$/
and $args{'critical'} =~ m/^$threshods_re(\s*,\s*$threshods_re)*$/ ;
while ( $args{'warning'} =~ /$threshods_re/g ) {
my ($threshold, $value) = ($1, $2);
$warn{$translate{$threshold}} = $value if $1 and defined $2;
}
while ( $args{'critical'} =~ /$threshods_re/g ) {
my ($threshold, $value) = ($1, $2);
$crit{$translate{$threshold}} = $value if $1 and defined $2;
}
}
@rs = @{ query_ver( $hosts[0], %queries ) };
delete $status{'idle in transaction (aborted)'}
if $hosts[0]->{'version_num'} < $PG_VERSION_90;
delete $status{'other wait event'}
if $hosts[0]->{'version_num'} < $PG_VERSION_96;
$max_connections = $rs[0][2] if scalar @rs;
REC_LOOP: foreach my $r (@rs) {
$num_backends++;
foreach my $exclude_re ( @{ $args{'exclude'} } ) {
next REC_LOOP if $r->[3] =~ /$exclude_re/;
}
if (exists $status{$r->[0]}) {
$status{$r->[0]}[0]++;
$status{$r->[0]}[1] = $r->[1]
if $r->[1] and $r->[1] > $status{$r->[0]}[1];
}
}
STATUS_LOOP: foreach my $s (sort keys %status) {
my @perf = ( $s, $status{$s}[0], undef );
push @perf, ( $warn{$s}, $crit{$s}, 0, $max_connections )
if ( exists $warn{$s} and exists $crit{$s}
and $warn{$s} =~ /\d+$/ and $crit{$s} =~ /\d+$/ );
push @perfdata => [ @perf ];
if ( $hosts[0]->{'version_num'} >= $PG_VERSION_83
and $s !~ '^(?:disabled|undefined|insufficient)' ) {
my @perf = ("oldest $s", $status{$s}[1], 's' );
push @perf, ( $warn{$s}, $crit{$s}, 0, $max_connections )
if ( exists $warn{$s} and exists $crit{$s}
and $warn{$s} =~ /\d+\s*[smhd]/ and $crit{$s} =~ /\d+\s*[smhd]/ );
push @perfdata => [ @perf ];
}
# Criticals
if ( exists $crit{$s} ) {
if ( $crit{$s} =~ /\d+\s*[smhd]/ ) {
if ( $status{$s}[1] >= get_time($crit{$s}) ) {
push @msg_crit => "$status{$s}[0] $s for $status{$s}[1] seconds";
next STATUS_LOOP;
}
}
elsif ( $status{$s}[0] >= $crit{$s} ) {
push @msg_crit => "$status{$s}[0] $s";
next STATUS_LOOP;
}
}
# Warning
if ( exists $warn{$s} ) {
if ( $warn{$s} =~ /\d+\s*[smhd]/ ) {
if ( $status{$s}[1] >= get_time($warn{$s}) ) {
push @msg_warn => "$status{$s}[0] $s for $status{$s}[1] seconds";
next STATUS_LOOP;
}
}
elsif ( $status{$s}[0] >= $warn{$s} ) {
push @msg_warn => "$status{$s}[0] $s";
next STATUS_LOOP;
}
}
}
return critical( $me, [ @msg_crit, @msg_warn ], \@perfdata )
if scalar @msg_crit > 0;
return warning( $me, \@msg_warn, \@perfdata ) if scalar @msg_warn > 0;
return ok( $me, [ "$num_backends backend connected" ], \@perfdata );
}
=item B<backup_label_age> (8.1+)
Check the age of the backup label file.
Perfdata returns the age of the backup_label file, -1 if not present.
Critical and Warning thresholds only accept an interval (eg. 1h30m25s).
Required privileges: unprivileged role (9.3+); superuser (<9.3)
=cut
sub check_backup_label_age {
my $rs;
my $c_limit;
my $w_limit;
my @perfdata;
my @hosts;
my %args = %{ $_[0] };
my $me = 'POSTGRES_BACKUP_LABEL_AGE';
my %queries = (
$PG_VERSION_81 => q{SELECT max(s.r) AS value FROM (
SELECT CAST(extract(epoch FROM current_timestamp - (pg_stat_file(file)).modification) AS integer) AS r
FROM pg_ls_dir('.') AS ls(file)
WHERE file='backup_label' UNION SELECT 0
) AS s},
$PG_VERSION_93 => q{
SELECT CASE WHEN pg_is_in_backup()
THEN CAST(extract(epoch FROM current_timestamp - pg_backup_start_time()) AS integer)
ELSE 0
END}
);
# warning and critical are mandatory.
pod2usage(
-message => "FATAL: you must specify critical and warning thresholds.",
-exitval => 127
) unless defined $args{'warning'} and defined $args{'critical'} ;
pod2usage(
-message => "FATAL: critical and warning thresholds only acccepts interval.",
-exitval => 127
) unless ( is_time( $args{'warning'} ) and is_time( $args{'critical'} ) );
$c_limit = get_time( $args{'critical'} );
$w_limit = get_time( $args{'warning'} );
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "backup_label_age".',
-exitval => 127
) if @hosts != 1;
is_compat $hosts[0], 'backup_label_age', $PG_VERSION_81 or exit 1;
$rs = @{ query_ver( $hosts[0], %queries )->[0] }[0];
push @perfdata, [ 'age', $rs, 's', $w_limit, $c_limit ];
return critical( $me, [ "age: ".to_interval($rs) ], \@perfdata )
if $rs > $c_limit;
return warning( $me, [ "age: ".to_interval($rs) ], \@perfdata )
if $rs > $w_limit;
return ok( $me, [ "backup_label file ".( $rs == 0 ? "absent":"present (age: ".to_interval($rs).")") ], \@perfdata );
}
=item B<bgwriter> (8.3+)
Check the percentage of pages written by backends since last check.
This service uses the status file (see C<--status-file> parameter).
Perfdata contains the ratio per second for each C<pg_stat_bgwriter> counter
since last execution. Units Nps for checkpoints, max written clean and fsyncs
are the number of "events" per second.
Critical and Warning thresholds are optional. If set, they I<only> accept a
percentage.
Required privileges: unprivileged role.
=cut
sub check_bgwriter {
my @msg;
my @msg_crit;
my @msg_warn;
my @rs;
my @perfdata;
my $delta_ts;
my $delta_buff_total;
my $delta_buff_backend;
my $delta_buff_bgwriter;
my $delta_buff_checkpointer;
my $delta_buff_alloc;
my $delta_checkpoint_timed;
my $delta_checkpoint_req;
my $delta_maxwritten_clean;
my $delta_backend_fsync;
my %new_bgw;
my %bgw;
my @hosts;
my $now = time();
my %args = %{ $_[0] };
my $me = 'POSTGRES_BGWRITER';
my %queries = (
$PG_VERSION_83 => q{SELECT checkpoints_timed, checkpoints_req,
buffers_checkpoint * current_setting('block_size')::numeric,
buffers_clean * current_setting('block_size')::numeric,
maxwritten_clean,
buffers_backend * current_setting('block_size')::numeric,
buffers_alloc * current_setting('block_size')::numeric,
0,
0
FROM pg_stat_bgwriter;
},
$PG_VERSION_91 => q{SELECT checkpoints_timed, checkpoints_req,
buffers_checkpoint * current_setting('block_size')::numeric,
buffers_clean * current_setting('block_size')::numeric,
maxwritten_clean,
buffers_backend * current_setting('block_size')::numeric,
buffers_alloc * current_setting('block_size')::numeric,
buffers_backend_fsync,
extract ('epoch' from stats_reset)
FROM pg_stat_bgwriter;
}
);
# Warning and critical must be %.
pod2usage(
-message => "FATAL: critical and warning thresholds only accept percentages.",
-exitval => 127
) unless not (defined $args{'warning'} and defined $args{'critical'} )
or (
$args{'warning'} =~ m/^([0-9.]+)%$/
and $args{'critical'} =~ m/^([0-9.]+)%$/
);
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "bgwriter".',
-exitval => 127
) if @hosts != 1;
is_compat $hosts[0], 'bgwriter', $PG_VERSION_83 or exit 1;
%bgw = %{ load( $hosts[0], 'bgwriter', $args{'status-file'} ) || {} };
@rs = @{ query_ver( $hosts[0], %queries )->[0] };
$new_bgw{'ts'} = $now;
$new_bgw{'checkpoint_timed'} = $rs[0];
$new_bgw{'checkpoint_req'} = $rs[1];
$new_bgw{'buff_checkpoint'} = $rs[2];
$new_bgw{'buff_clean'} = $rs[3];
$new_bgw{'maxwritten_clean'} = $rs[4];
$new_bgw{'buff_backend'} = $rs[5];
$new_bgw{'buff_alloc'} = $rs[6];
$new_bgw{'backend_fsync'} = $rs[7];
$new_bgw{'stat_reset'} = $rs[8];
save $hosts[0], 'bgwriter', \%new_bgw, $args{'status-file'};
return ok( $me, ['First call'] ) unless keys %bgw
and defined $bgw{'ts'}; # 'ts' was added in 1.25, check for existence
# instead of raising some ugly Perl errors
# when upgrading.
return ok( $me, ['Stats reseted since last call'] )
if $new_bgw{'stat_reset'} > $bgw{'stat_reset'}
or $new_bgw{'checkpoint_timed'} < $bgw{'checkpoint_timed'}
or $new_bgw{'checkpoint_req'} < $bgw{'checkpoint_req'}
or $new_bgw{'buff_checkpoint'} < $bgw{'buff_checkpoint'}
or $new_bgw{'buff_clean'} < $bgw{'buff_clean'}
or $new_bgw{'maxwritten_clean'} < $bgw{'maxwritten_clean'}
or $new_bgw{'buff_backend'} < $bgw{'buff_backend'}
or $new_bgw{'buff_alloc'} < $bgw{'buff_alloc'}
or $new_bgw{'backend_fsync'} < $bgw{'backend_fsync'};
$delta_buff_total = $rs[2] - $bgw{'buff_checkpoint'}
+ $rs[3] - $bgw{'buff_clean'}
+ $rs[5] - $bgw{'buff_backend'};
$delta_ts = $now - $bgw{'ts'};
$delta_buff_backend = ($rs[5] - $bgw{'buff_backend'}) / $delta_ts;
$delta_buff_bgwriter = ($rs[3] - $bgw{'buff_clean'}) / $delta_ts;
$delta_buff_checkpointer = ($rs[2] - $bgw{'buff_checkpoint'}) / $delta_ts;
$delta_buff_alloc = ($rs[6] - $bgw{'buff_alloc'}) / $delta_ts;
$delta_checkpoint_timed = ($rs[0] - $bgw{'checkpoint_timed'}) / $delta_ts;
$delta_checkpoint_req = ($rs[1] - $bgw{'checkpoint_req'}) / $delta_ts;
$delta_maxwritten_clean = ($rs[4] - $bgw{'maxwritten_clean'}) / $delta_ts;
$delta_backend_fsync = ($rs[7] - $bgw{'backend_fsync'}) / $delta_ts;
push @perfdata, (
[ 'buffers_backend', $delta_buff_backend, 'Bps' ],
[ 'checkpoint_timed', $delta_checkpoint_timed, 'Nps' ],
[ 'checkpoint_req', $delta_checkpoint_req, 'Nps' ],
[ 'buffers_checkpoint', $delta_buff_checkpointer, 'Bps' ],
[ 'buffers_clean', $delta_buff_bgwriter, 'Bps' ],
[ 'maxwritten_clean', $delta_maxwritten_clean, 'Nps' ],
[ 'buffers_backend_fsync', $delta_backend_fsync, 'Nps' ],
[ 'buffers_alloc', $delta_buff_alloc, 'Bps' ] );
if ($delta_buff_total) {
push @msg => sprintf(
"%.2f%% from backends, %.2f%% from bgwriter, %.2f%% from checkpointer",
100 * $delta_buff_backend / $delta_buff_total,
100 * $delta_buff_bgwriter / $delta_buff_total,
100 * $delta_buff_checkpointer / $delta_buff_total
);
}
else {
push @msg => "No writes";
}
# Alarm if asked.
# FIXME: threshold should accept a % and a minimal written size
if ( defined $args{'warning'}
and defined $args{'critical'}
and $delta_buff_total
) {
my $w_limit = get_size( $args{'warning'}, $delta_buff_total );
my $c_limit = get_size( $args{'critical'}, $delta_buff_total );
return critical( $me, \@msg, \@perfdata )
if $delta_buff_backend >= $c_limit;
return warning( $me, \@msg, \@perfdata )
if $delta_buff_backend >= $w_limit;
}
return ok( $me, \@msg, \@perfdata );
}
=item B<btree_bloat>
Estimate bloat on B-tree indexes.
Warning and critical thresholds accept a comma-separated list of either
raw number(for a size), size (eg. 125M) or percentage. The thresholds apply to
B<bloat> size, not object size. If a percentage is given, the threshold will
apply to the bloat size compared to the total index size. If multiple threshold
values are passed, check_pgactivity will choose the largest (bloat size) value.
This service supports both C<--dbexclude> and C<--dbinclude> parameters.
The 'postgres' database and templates are always excluded.
It also supports a C<--exclude REGEX> parameter to exclude relations matching
a regular expression. The regular expression applies to
"database.schema_name.relation_name". This enables you to filter either on a
relation name for all schemas and databases, on a qualified named relation
(schema + relation) for all databases or on a qualified named relation in
only one database.
You can use multiple C<--exclude REGEX> parameters.
Perfdata will return the number of indexes of concern, by warning and critical
threshold per database.
A list of the bloated indexes will be returned after the
perfdata. This list contains the fully qualified bloated index name, the
estimated bloat size, the index size and the bloat percentage.
Required privileges: superuser (<10) able to log in all databases, or at least
those in C<--dbinclude>; superuser (<10);
on PostgreSQL 10+, a user with the role pg_monitor suffices,
provided that you grant SELECT on the system table pg_statistic
to the pg_monitor role, in each database of the cluster:
C<GRANT SELECT ON pg_statistic TO pg_monitor;>
=cut
sub check_btree_bloat {
my @perfdata;
my @longmsg;
my @rs;
my @hosts;
my @all_db;
my $total_index; # num of index checked, without excluded ones
my $w_count = 0;
my $c_count = 0;
my %args = %{ $_[0] };
my @dbinclude = @{ $args{'dbinclude'} };
my @dbexclude = @{ $args{'dbexclude'} };
my $me = 'POSTGRES_BTREE_BLOAT';
my %queries = (
$PG_VERSION_74 => q{
SELECT current_database(), nspname AS schemaname, tblname, idxname, bs*(relpages)::bigint AS real_size,
bs*(relpages-est_pages)::bigint AS bloat_size,
100 * (relpages-est_pages)::float / relpages AS bloat_ratio
FROM (
SELECT coalesce(
1+ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0
) AS est_pages,
bs, nspname, tblname, idxname, relpages, is_na
FROM (
SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam,
( index_tuple_hdr_bm +
maxalign - CASE
WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
ELSE index_tuple_hdr_bm%maxalign
END
+ nulldatawidth + maxalign - CASE
WHEN nulldatawidth = 0 THEN 0
WHEN nulldatawidth::numeric%maxalign = 0 THEN maxalign
ELSE nulldatawidth::numeric%maxalign
END
)::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
FROM (
SELECT n.nspname, sub.tblname, sub.idxname, sub.reltuples, sub.relpages, sub.relam,
8192::numeric AS bs,
CASE
WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
ELSE 4
END AS maxalign,
20 AS pagehdr,
16 AS pageopqdata,
CASE WHEN max(coalesce(sub.stanullfrac,0)) = 0
THEN 2
ELSE 2 + (( 32 + 8 - 1 ) / 8)
END AS index_tuple_hdr_bm,
sum( (1-coalesce(sub.stanullfrac, 0)) * coalesce(sub.stawidth, 1024)) AS nulldatawidth,
max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0
OR count(1) <> sub.indnatts AS is_na
FROM (
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum, i.indnatts
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON ct.oid = i.indrelid
JOIN pg_catalog.pg_statistic AS s ON s.starelid = i.indrelid
AND s.staattnum = ANY (
string_to_array(pg_catalog.textin(pg_catalog.int2vectorout(i.indkey)), ' ')::smallint[]
)
WHERE ci.relpages > 0
) AS sub
JOIN pg_catalog.pg_attribute AS a ON sub.starelid = a.attrelid
AND sub.staattnum = a.attnum
JOIN pg_catalog.pg_type AS t ON a.atttypid = t.oid
JOIN pg_catalog.pg_namespace AS n ON sub.relnamespace = n.oid
WHERE a.attnum > 0
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, sub.indnatts
) AS sub2
) AS sub3
JOIN pg_am am ON sub3.relam = am.oid
WHERE am.amname = 'btree'
) AS sub4
WHERE NOT is_na
ORDER BY 2,3,4 },
# Page header is 24 and block_size GUC, support index on expression
$PG_VERSION_80 => q{
SELECT current_database(), nspname AS schemaname, tblname, idxname, bs*(relpages)::bigint AS real_size,
bs*(relpages-est_pages)::bigint AS bloat_size,
100 * (relpages-est_pages)::float / relpages AS bloat_ratio
FROM (
SELECT coalesce(1 +
ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0
) AS est_pages,
bs, nspname, tblname, idxname, relpages, is_na
FROM (
SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam,
( index_tuple_hdr_bm +
maxalign - CASE
WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
ELSE index_tuple_hdr_bm%maxalign
END
+ nulldatawidth + maxalign - CASE
WHEN nulldatawidth = 0 THEN 0
WHEN nulldatawidth::numeric%maxalign = 0 THEN maxalign
ELSE nulldatawidth::numeric%maxalign
END
)::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
FROM (
SELECT n.nspname, sub.tblname, sub.idxname, sub.reltuples, sub.relpages, sub.relam,
current_setting('block_size')::numeric AS bs,
CASE
WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
ELSE 4
END AS maxalign,
24 AS pagehdr,
16 AS pageopqdata,
CASE WHEN max(coalesce(sub.stanullfrac,0)) = 0
THEN 2
ELSE 2 + (( 32 + 8 - 1 ) / 8)
END AS index_tuple_hdr_bm,
sum( (1-coalesce(sub.stanullfrac, 0)) * coalesce(sub.stawidth, 1024)) AS nulldatawidth,
max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0 AS is_na
FROM (
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON i.indrelid = ct.oid
JOIN pg_catalog.pg_statistic AS s ON i.indexrelid = s.starelid
WHERE ci.relpages > 0
UNION
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON ct.oid = i.indrelid
JOIN pg_catalog.pg_statistic AS s ON s.starelid = i.indrelid
AND s.staattnum = ANY (
string_to_array(pg_catalog.textin(pg_catalog.int2vectorout(i.indkey)), ' ')::smallint[]
)
WHERE ci.relpages > 0
) AS sub
JOIN pg_catalog.pg_attribute AS a ON sub.starelid = a.attrelid
AND sub.staattnum = a.attnum
JOIN pg_catalog.pg_type AS t ON a.atttypid = t.oid
JOIN pg_catalog.pg_namespace AS n ON sub.relnamespace = n.oid
WHERE a.attnum > 0
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
) AS sub2
) AS sub3
JOIN pg_am am ON sub3.relam = am.oid
WHERE am.amname = 'btree'
) AS sub4
WHERE NOT is_na
ORDER BY 2,3,4 },
# Use ANY (i.indkey) w/o function call to cast from vector to array
$PG_VERSION_81 => q{
SELECT current_database(), nspname AS schemaname, tblname, idxname, bs*(relpages)::bigint AS real_size,
bs*(relpages-est_pages)::bigint AS bloat_size,
100 * (relpages-est_pages)::float / relpages AS bloat_ratio
FROM (
SELECT coalesce(1 +
ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0
) AS est_pages,
bs, nspname, tblname, idxname, relpages, is_na
FROM (
SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam,
( index_tuple_hdr_bm +
maxalign - CASE
WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
ELSE index_tuple_hdr_bm%maxalign
END
+ nulldatawidth + maxalign - CASE
WHEN nulldatawidth = 0 THEN 0
WHEN nulldatawidth::numeric%maxalign = 0 THEN maxalign
ELSE nulldatawidth::numeric%maxalign
END
)::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
FROM (
SELECT n.nspname, sub.tblname, sub.idxname, sub.reltuples, sub.relpages, sub.relam,
current_setting('block_size')::numeric AS bs,
CASE
WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
ELSE 4
END AS maxalign,
24 AS pagehdr,
16 AS pageopqdata,
CASE WHEN max(coalesce(sub.stanullfrac,0)) = 0
THEN 2
ELSE 2 + (( 32 + 8 - 1 ) / 8)
END AS index_tuple_hdr_bm,
sum( (1-coalesce(sub.stanullfrac, 0)) * coalesce(sub.stawidth, 1024)) AS nulldatawidth,
max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0 AS is_na
FROM (
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON i.indrelid = ct.oid
JOIN pg_catalog.pg_statistic AS s ON i.indexrelid = s.starelid
WHERE ci.relpages > 0
UNION
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON ct.oid = i.indrelid
JOIN pg_catalog.pg_statistic AS s ON s.starelid = i.indrelid
AND s.staattnum = ANY ( i.indkey )
WHERE ci.relpages > 0
) AS sub
JOIN pg_catalog.pg_attribute AS a ON sub.starelid = a.attrelid
AND sub.staattnum = a.attnum
JOIN pg_catalog.pg_type AS t ON a.atttypid = t.oid
JOIN pg_catalog.pg_namespace AS n ON sub.relnamespace = n.oid
WHERE a.attnum > 0
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
) AS sub2
) AS sub3
JOIN pg_am am ON sub3.relam = am.oid
WHERE am.amname = 'btree'
) AS sub4
WHERE NOT is_na
ORDER BY 2,3,4 },
# New column pg_index.indisvalid
$PG_VERSION_82 => q{
SELECT current_database(), nspname AS schemaname, tblname, idxname, bs*(relpages)::bigint AS real_size,
bs*(relpages-est_pages_ff) AS bloat_size,
100 * (relpages-est_pages_ff)::float / relpages AS bloat_ratio
FROM (
SELECT coalesce(1 +
ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0
) AS est_pages,
coalesce(1 +
ceil(reltuples/floor((bs-pageopqdata-pagehdr)*fillfactor/(100*(4+nulldatahdrwidth)::float))), 0
) AS est_pages_ff,
bs, nspname, tblname, idxname, relpages, fillfactor, is_na
FROM (
SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam, fillfactor,
( index_tuple_hdr_bm +
maxalign - CASE
WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
ELSE index_tuple_hdr_bm%maxalign
END
+ nulldatawidth + maxalign - CASE
WHEN nulldatawidth = 0 THEN 0
WHEN nulldatawidth::numeric%maxalign = 0 THEN maxalign
ELSE nulldatawidth::numeric%maxalign
END
)::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
FROM (
SELECT n.nspname, sub.tblname, sub.idxname, sub.reltuples, sub.relpages, sub.relam, sub.fillfactor,
current_setting('block_size')::numeric AS bs,
CASE -- MAXALIGN: 4 on 32bits, 8 on 64bits (and mingw32 ?)
WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
ELSE 4
END AS maxalign,
24 AS pagehdr,
16 AS pageopqdata,
CASE WHEN max(coalesce(sub.stanullfrac,0)) = 0
THEN 2 -- IndexTupleData size
ELSE 2 + (( 32 + 8 - 1 ) / 8)
END AS index_tuple_hdr_bm,
sum( (1-coalesce(sub.stanullfrac, 0)) * coalesce(sub.stawidth, 1024)) AS nulldatawidth,
max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0 AS is_na
FROM (
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum,
coalesce(substring(
array_to_string(ci.reloptions, ' ')
from 'fillfactor=([0-9]+)')::smallint, 90) AS fillfactor
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON i.indrelid = ct.oid
JOIN pg_catalog.pg_statistic AS s ON i.indexrelid = s.starelid
WHERE ci.relpages > 0
UNION
SELECT ct.relnamespace, ct.relname AS tblname,
ci.relname AS idxname, ci.reltuples, ci.relpages, ci.relam,
s.stawidth, s.stanullfrac, s.starelid, s.staattnum,
coalesce(substring(
array_to_string(ci.reloptions, ' ')
from 'fillfactor=([0-9]+)')::smallint, 90) AS fillfactor
FROM pg_catalog.pg_index AS i
JOIN pg_catalog.pg_class AS ci ON ci.oid = i.indexrelid
JOIN pg_catalog.pg_class AS ct ON ct.oid = i.indrelid
JOIN pg_catalog.pg_statistic AS s ON s.starelid = i.indrelid
AND s.staattnum = ANY ( i.indkey )
WHERE ci.relpages > 0
) AS sub
JOIN pg_catalog.pg_attribute AS a ON sub.starelid = a.attrelid
AND sub.staattnum = a.attnum
JOIN pg_catalog.pg_type AS t ON a.atttypid = t.oid
JOIN pg_catalog.pg_namespace AS n ON sub.relnamespace = n.oid
WHERE a.attnum > 0
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
) AS sub2
) AS sub3
JOIN pg_am am ON sub3.relam = am.oid
WHERE am.amname = 'btree'
) AS sub4
WHERE NOT is_na
ORDER BY 2,3,4 }
);
# Warning and critical are mandatory.
pod2usage(
-message => "FATAL: you must specify critical and warning thresholds.",
-exitval => 127
) unless defined $args{'warning'} and defined $args{'critical'} ;
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "btree_bloat".',
-exitval => 127
) if @hosts != 1;
@all_db = @{ get_all_dbname( $hosts[0] ) };
# Iterate over all db
ALLDB_LOOP: foreach my $db (sort @all_db) {
my @rc;
# handle max, avg and count for size and percentage, per relkind
my $nb_ind = 0;
my $idx_bloated = 0;
next ALLDB_LOOP if grep { $db =~ /$_/ } @dbexclude;
next ALLDB_LOOP if @dbinclude and not grep { $db =~ /$_/ } @dbinclude;
@rc = @{ query_ver( $hosts[0], %queries, $db ) };
BLOAT_LOOP: foreach my $bloat (@rc) {
foreach my $exclude_re ( @{ $args{'exclude'} } ) {
next BLOAT_LOOP if "$bloat->[0].$bloat->[1].$bloat->[3]" =~ m/$exclude_re/;
}
if ( defined $args{'warning'} ) {
my $w_limit = 0;
my $c_limit = 0;
# We need to compute effective thresholds on each object,
# as the value can be given in percentage
# The biggest calculated size will be used.
foreach my $cur_warning (split /,/, $args{'warning'}) {
my $size = get_size( $cur_warning, $bloat->[4] );
$w_limit = $size if $size > $w_limit;
}
foreach my $cur_critical (split /,/, $args{'critical'}) {
my $size = get_size( $cur_critical, $bloat->[4] );
$c_limit = $size if $size > $c_limit;
}
if ( $bloat->[5] > $w_limit ) {
$idx_bloated++;
$w_count++;
$c_count++ if $bloat->[5] > $c_limit;
push @longmsg => sprintf "%s.%s.%s %s/%s (%.2f%%);",
$bloat->[0], $bloat->[1], $bloat->[3],
to_size($bloat->[5]), to_size($bloat->[4]), $bloat->[6];
}
}
$nb_ind++;
}
$total_index += $nb_ind;
push @perfdata => [ "idx bloated in $db", $idx_bloated ];
}
# We use the warning count for the **total** number of bloated indexes
return critical $me,
[ "$w_count/$total_index index(es) bloated" ],
[ @perfdata ], [ @longmsg ]
if $c_count > 0;
return warning $me,
[ "$w_count/$total_index index(es) bloated" ],
[ @perfdata ], [ @longmsg ]
if $w_count > 0;
return ok $me, [ "Btree bloat ok" ], \@perfdata;
}
=item B<commit_ratio> (all)
Check the commit and rollback rate per second since last call.
This service uses the status file (see --status-file parameter).
Perfdata contains the commit rate, rollback rate, transaction rate and rollback
ratio for each database since last call.
Critical and Warning thresholds are optional. They accept a list of comma
separated 'label=value'. Available labels are B<rollbacks>, B<rollback_rate>
and B<rollback_ratio>, which will be compared to the number of rollbacks, the
rollback rate and the rollback ratio of each database. Warning or critical will
be raised if the reported value is greater than B<rollbacks>, B<rollback_rate> or
B<rollback_ratio>.
Required privileges: unprivileged role.
=cut
sub check_commit_ratio {
my @rs;
my @msg_warn;
my @msg_crit;
my @perfdata;
my @hosts;
my %xacts;
my %new_xacts;
my $global_commits;
my $global_rollbacks;
my %warn;
my %crit;
my %args = %{ $_[0] };
my $me = 'POSTGRES_COMMIT_RATIO';
my $sql = q{
SELECT floor(extract(EPOCH from now())), s.datname,
s.xact_commit, s.xact_rollback
FROM pg_stat_database s
JOIN pg_database d ON s.datid = d.oid
WHERE d.datallowconn
};
if ( defined $args{'warning'} ) {
my $thresholds_re = qr/(rollbacks|rollback_rate|rollback_ratio)\s*=\s*(\d+)/i;
# warning and critical must be list of status=value
pod2usage(
-message => "FATAL: critical and warning thresholds only accept a list of 'label=value' separated by comma.\n"
. "See documentation for more information about accepted labels.",
-exitval => 127
) unless $args{'warning'} =~ m/^$thresholds_re(\s*,\s*$thresholds_re)*$/
and $args{'critical'} =~ m/^$thresholds_re(\s*,\s*$thresholds_re)*$/ ;
while ( $args{'warning'} =~ /$thresholds_re/g ) {
my ($threshold, $value) = ($1, $2);
$warn{$threshold} = $value if $1 and defined $2;
}
while ( $args{'critical'} =~ /$thresholds_re/g ) {
my ($threshold, $value) = ($1, $2);
$crit{$threshold} = $value if $1 and defined $2;
}
}
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "commit_ratio".',
-exitval => 127
) if @hosts != 1;
%xacts = %{ load( $hosts[0], 'commit_ratio', $args{'status-file'} ) || {} };
@rs = @{ query( $hosts[0], $sql ) };
$new_xacts{$_->[1]} = {
'ts' => $_->[0],
'commit' => $_->[2],
'rollback' => $_->[3]
} foreach @rs;
save $hosts[0], 'commit_ratio', \%new_xacts, $args{'status-file'};
return ok( $me, ['First call'] ) unless keys %xacts;
foreach my $db ( keys %new_xacts ) {
my $ratio = 0;
my $commits = $new_xacts{$db}{'commit'} - $xacts{$db}{'commit'};
my $rollbacks = $new_xacts{$db}{'rollback'} - $xacts{$db}{'rollback'};
# default to 1 sec if called twice in the same second
my $sec = ( $new_xacts{$db}{'ts'} - $xacts{$db}{'ts'} ) || 1;
my $commit_rate = $commits / $sec;
my $rollback_rate = $rollbacks / $sec;
my $xact_rate = ($commits + $rollbacks ) / $sec;
$global_commits += $commits;
$global_rollbacks += $rollbacks;
$ratio = $rollbacks * 100 / ( $commits + $rollbacks )
unless $rollbacks == 0;
push @perfdata => (
[ "${db}_commit_rate", sprintf( "%.2f", $commit_rate ), 'tps' ],
[ "${db}_rollback_rate", sprintf( "%.2f", $rollback_rate ), 'tps' ],
[ "${db}_xact_rate", sprintf( "%.2f", $xact_rate ), 'tps' ],
[ "${db}_rollback_ratio", sprintf( "%.2f", $ratio ), '%' ]
);
THRESHOLD_LOOP: foreach my $val ( ('rollbacks', 'rollback_rate', 'rollback_ratio') ) {
my $prefix = "${db}_${val}";
# Criticals
if ( exists $crit{$val} ) {
if ( $val eq "rollbacks" and $crit{$val} < $rollbacks ) {
push @msg_crit => "'$prefix'=$rollbacks";
next THRESHOLD_LOOP;
}
if ( $val eq "rollback_rate" and $crit{$val} < $rollback_rate ) {
push @msg_crit => sprintf "'%s'=%.2ftps", $prefix, $rollback_rate;
next THRESHOLD_LOOP;
}
if ( $val eq "rollback_ratio" and $crit{$val} < $ratio ) {
push @msg_crit => sprintf "'%s'=%.2f%%", $prefix, $ratio;
next THRESHOLD_LOOP;
}
}
# Warnings
if ( exists $warn{$val} ) {
if ( $val eq "rollbacks" and $warn{$val} < $rollbacks ) {
push @msg_warn => "'$prefix'=$rollbacks";
next THRESHOLD_LOOP;
}
if ( $val eq "rollback_rate" and $warn{$val} < $rollback_rate ) {
push @msg_warn => sprintf("'%s'=%.2ftps", $prefix, $rollback_rate );
next THRESHOLD_LOOP;
}
if ( $val eq "rollback_ratio" and $warn{$val} < $ratio ) {
push @msg_warn => sprintf("'%s'=%.2f%%", $prefix, $ratio );
next THRESHOLD_LOOP;
}
}
}
}
return critical( $me, [ "Commits: $global_commits - Rollbacks: $global_rollbacks", @msg_crit, @msg_warn ], \@perfdata )
if scalar @msg_crit > 0;
return warning( $me, [ "Commits: $global_commits - Rollbacks: $global_rollbacks", @msg_warn ], \@perfdata )
if scalar @msg_warn > 0;
return ok( $me, ["Commits: $global_commits - Rollbacks: $global_rollbacks"], \@perfdata );
}
=item B<configuration> (8.0+)
Check the most important settings.
Warning and Critical thresholds are ignored.
Specific parameters are :
C<--work_mem>, C<--maintenance_work_mem>, C<--shared_buffers>,C<--wal_buffers>,
C<--checkpoint_segments>, C<--effective_cache_size>, C<--no_check_autovacuum>,
C<--no_check_fsync>, C<--no_check_enable>, C<--no_check_track_counts>.
Required privileges: unprivileged role.
=cut
sub check_configuration {
my @hosts;
my @msg_crit;
my %args = %{ $_[0] };
my $me = 'POSTGRES_CONFIGURATION';
# This service is based on a probe by Marc Cousin (cousinmarc@gmail.com)
# Limit parameters. Have defaut values
my $work_mem = $args{'work_mem'} || 4096; # At least 4MB
my $maintenance_work_mem = $args{'maintenance_work_mem'} || 65536; # At least 64MB
my $shared_buffers = $args{'shared_buffers'} || 16384; # At least 128MB
my $wal_buffers = $args{'wal_buffers'} || 64; # At least 512k. Or -1 for 9.1
my $checkpoint_segments = $args{'checkpoint_segments'} || 10;
my $effective_cache_size = $args{'effective_cache_size'} || 131072; # At least 1GB. No way a modern server has less than 2GB of ram
# These will be checked to verify they are still the default values (no parameter, for now)
# autovacuum, fsync, enable*,track_counts/stats_row_level
my $no_check_autovacuum = $args{'no_check_autovacuum'} || 0;
my $no_check_fsync = $args{'no_check_fsync'} || 0;
my $no_check_enable = $args{'no_check_enable'} || 0;
my $no_check_track_counts = $args{'no_check_track_counts'} || 0;
my $sql = "SELECT name,setting FROM pg_settings
WHERE ( ( name='work_mem' and setting::bigint < $work_mem )
or ( name='maintenance_work_mem' and setting::bigint < $maintenance_work_mem )
or ( name='shared_buffers' and setting::bigint < $shared_buffers )
or ( name='wal_buffers' and ( setting::bigint < $wal_buffers or setting = '-1') )
or ( name='checkpoint_segments' and setting::bigint < $checkpoint_segments )
or ( name='effective_cache_size' and setting::bigint < $effective_cache_size )
or ( name='autovacuum' and setting='off' and $no_check_autovacuum = 0)
or ( name='fsync' and setting='off' and $no_check_fsync=0 )
or ( name~'^enable.*' and setting='off' and $no_check_enable=0)
or (name='stats_row_level' and setting='off' and $no_check_track_counts=0)
or (name='track_counts' and setting='off' and $no_check_track_counts=0)
)";
# FIXME make one parameter --ignore to rules 'em all.
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "configuration".',
-exitval => 127
) if @hosts != 1;
is_compat $hosts[0], 'configuration', $PG_VERSION_80 or exit 1;
my @rc = @{ query( $hosts[0], $sql ) };
DB_LOOP: foreach my $setting (@rc) {
push @msg_crit => ( $setting->[0] . "=" . $setting->[1] );
}
# All the entries in $result are an error. If the array isn't empty, we return ERROR, and the list of errors
return critical( $me, \@msg_crit )
if ( @msg_crit > 0 );
return ok( $me, [ "PostgreSQL configuration ok" ] );
}
=item B<connection> (all)
Perform a simple connection test.
No perfdata is returned.
This service ignores critical and warning arguments.
Required privileges: unprivileged role.
=cut
sub check_connection {
my @rs;
my @hosts;
my %args = %{ $_[0] };
my $me = 'POSTGRES_CONNECTION';
my $sql = q{SELECT now(), version()};
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "connection".',
-exitval => 127
) if @hosts != 1;
@rs = @{ query( $hosts[0], $sql ) };
return ok( $me, [ "Connection successful at $rs[0][0], on $rs[0][1]" ] );
}
=item B<custom_query> (all)
Perform the given user query.
Specify the query with C<--query>. The first column will be
used to perform the test for the status if warning and critical are provided.
The warning and critical arguments are optional. They can be of format integer
(default), size or time depending on the C<--type> argument.
Warning and Critical will be raised if they are greater than the first column,
or less if the C<--reverse> option is used.
All other columns will be used to generate the perfdata. Each field name is used
as the name of the perfdata. The field value must contain your perfdata value
and its unit appended to it. You can add as many fields as needed. Eg.:
SELECT pg_database_size('postgres'),
pg_database_size('postgres')||'B' AS db_size
Required privileges: unprivileged role (depends on the query).
=cut
sub check_custom_query {
my %args = %{ $_[0] };
my $me = 'POSTGRES_CUSTOM_QUERY';
my $sql = $args{'query'};
my $type = $args{'type'} || 'integer';
my $reverse = $args{'reverse'};
my $bounded = undef;
my @rs;
my @fields;
my @perfdata;
my @hosts;
my @msg_crit;
my @msg_warn;
my $c_limit;
my $w_limit;
my $perf;
my $value;
# FIXME: add warn/crit threshold in perfdata
# Query must be given
pod2usage(
-message => 'FATAL: you must set parameter "--query" with "custom_query" service.',
-exitval => 127
) unless defined $args{'query'} ;
# Critical and Warning must be given with --type argument
pod2usage(
-message => 'FATAL: you must specify critical and warning thresholds with "--type" parameter.',
-exitval => 127
) unless ( not defined $args{'type'} ) or
( defined $args{'type'} and $args{'warning'} and $args{'critical'} );
@hosts = @{ parse_hosts %args };
pod2usage(
-message => 'FATAL: you must give only one host with service "custom_query".',
-exitval => 127
) if @hosts != 1;
# Handle warning and critical type
if ( $type eq 'size' ) {
$w_limit = get_size( $args{'warning'} );
$c_limit = get_size( $args{'critical'} );
}
elsif ( $type eq 'time' ) {
pod2usage(
-message => "FATAL: critical and warning thresholds only acccepts interval with --type time.",
-exitval => 127
) unless ( is_time( $args{'warning'} ) and is_time( $args{'critical'} ) );
$w_limit = get_time( $args{'warning'} );
$c_limit = get_time( $args{'critical'} );
}
elsif (defined $args{'warning'} ) {
pod2usage(
-message => 'FATAL: given critical and/or warning are not numeric. Please, set "--type" parameter if needed.',
-exitval => 127
) if $args{'warning'} !~ m/^[0-9.]+$/
or $args{'critical'} !~ m/^[0-9.]+$/;
$w_limit = $args{'warning'};
$c_limit = $args{'critical'};
}
@rs = @{ query( $hosts[0], $sql, undef, 1 ) };
@fields = @{ shift @rs };
return unknown( $me, [ 'No row returned by the query!' ] )
unless defined $rs[0];
pod2usage(
-message => 'FATAL: First column of your query is not numeric!',
-exitval => 127
) unless looks_like_number($rs[0][0]);
DB_LOOP: foreach my $rec ( @rs ) {
$bounded = $rec->[0] unless $bounded;
$bounded = $rec->[0] if ( !$reverse and $rec->[0] > $bounded )
or ( $reverse and $rec->[0] < $bounded );
$value = shift( @{$rec} );
shift <