Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #8809: Copy check-rudder-agent to rudder-agent repo #83

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 259 additions & 6 deletions share/commands/agent-check
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,264 @@ while getopts "qc" opt; do
esac
done

/opt/rudder/bin/check-rudder-agent
# Source /etc/profile to gather environment variables from the system and the user
. /etc/profile

if [ $? -eq 0 ]; then
[ "$QUIET" = false ] && printf "${GREEN}ok${NORMAL}: Rudder agent check ran without errors.\n"
else
printf "${RED}error${NORMAL}: Rudder agent check failed.\n" 1>&2
exit 1
# /etc/profile may contain some errors and sourcing it would lead to abort
# the execution of check-rudder-agent. Then, the use of "set -e" should be
# after the sourcing.
set -e

# Ensure our PATH includes Rudder's bin dir (for uuidgen on AIX in particular)
PATH=/opt/rudder/bin/:$PATH
export PATH

# Variables
BACKUP_DIR=/var/backups/rudder/

## Which database system is used in CFEngine ?
## TokyoCabinet (tcdb), Lightning MDB (lmdb)
CFENGINE_DB_EXT=lmdb

# Default variables for CFEngine binaries and disable files
RUDDER_CFE_DIR=/var/rudder/cfengine-community
ENTERPRISE_CFE_DIR=/var/cfengine
CFE_DISABLE_FILE=/opt/rudder/etc/disable-agent
UUID_FILE=/opt/rudder/etc/uuid.hive

# Get common commands from common.sh
init_commands

clean_cf_lock_files() {
rm -f ${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}
rm -f ${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}.lock
}

check_and_fix_cfengine_keys() {
# Making sure that the permissions on the CFEngine key directory are correct...
if [ -d ${CFE_DIR}/ppkeys ]; then
chmod 700 ${CFE_DIR}/ppkeys
if [ `ls ${CFE_DIR}/ppkeys | wc -l` -gt 0 ]; then
chmod 600 ${CFE_DIR}/ppkeys/*
fi
fi
}

check_and_fix_cfengine_processes() {
# If there are more than on cf-execd process, we must kill them
# A standard kill won't kill them, so the -9 is necessary to make sure they are stopped
# They will be restarted by the next check, if the disable file is not set
# List the cf-execd processes running (without the path, they can be run manually)
CF_EXECD_RUNNING=`${PS_COMMAND} | grep "${CFE_BIN_DIR}/cf-execd" | sed -e '/grep/d' | cat`
NB_CF_EXECD_RUNNING=`echo "${CF_EXECD_RUNNING}" | sed -e '/^$/d' | wc -l`
if [ ${NB_CF_EXECD_RUNNING} -gt 1 ]; then
printf "WARNING: Too many instance of CFEngine cf-execd processes running. Killing them..."
echo "${CF_EXECD_RUNNING}" | awk 'BEGIN { OFS=" "} {print $2 }' | xargs kill -9 || true
echo " Done"
fi

# List the CFEngine processes running
CF_PROCESS_RUNNING=`${PS_COMMAND} | sed -ne "\\%${CFE_BIN_DIR}/\\(cf-execd\\|cf-agent\\)%p" | cat`
# Count the number of processes running, filtering empty lines
NB_CF_PROCESS_RUNNING=`echo "${CF_PROCESS_RUNNING}" | sed -e '/^$/d' | wc -l`

# If no disable file AND no process of CFEngine from Rudder, then relaunch cf-agent with a failsafe first
# But this is applied only on servers or nodes already initialized (policy server set)
if [ ! -f "${CFE_DISABLE_FILE}" ] && [ "${NB_CF_PROCESS_RUNNING}" -eq 0 ] && [ -f "${CFE_DIR}/policy_server.dat" ]; then
printf "WARNING: No disable file detected and no CFEngine process either. Relaunching CFEngine processes..."
${CFE_BIN_DIR}/cf-agent -f failsafe.cf >/dev/null 2>&1
${CFE_BIN_DIR}/cf-agent >/dev/null 2>&1
echo " Done"
fi

# Get the value of rudder-agent run interval from file /var/rudder/cfengine-community/inputs/run_interval
if [ -f "${CFE_DIR}/inputs/run_interval" ]; then
RUN_INTERVAL=`cat "${CFE_DIR}/inputs/run_interval"`
else
# File does not exists, use default value 5
RUN_INTERVAL=5
fi

CHECK_INTERVAL=`expr ${RUN_INTERVAL} \* 2`

# Check for the age of the last update file
# if it is older than CHECK_INTERVAL minutes, and the disable file is not defined, we
# need to kill the lock database
if [ ! -f "${LAST_UPDATE_FILE}" ] || [ -f "${CFE_DISABLE_FILE}" ]; then
# Either the file ${LAST_UPDATE_FILE} is not yet present, and this node is
# probably not accepted yet, either the file ${CFE_DISABLE_FILE} is present, so
# the agent won't update the ${LAST_UPDATE_FILE}.
# In both case, do nothing
:
elif test `find ${LAST_UPDATE_FILE} -mmin +${CHECK_INTERVAL}`; then
printf "WARNING: The file ${LAST_UPDATE_FILE} is older than ${CHECK_INTERVAL} minutes, the agent is probably stuck. Purging the CFEngine lock database..."
clean_cf_lock_files
echo " Done";
fi


# Check for anomalous number of CFEngine processes
# If there are more than 8 agent/executor processes, we should kill them, and purge the lock database
if [ "${NB_CF_PROCESS_RUNNING}" -gt 8 ]; then
printf "WARNING: Too many instance of CFEngine processes running. Killing them, and purging the CFEngine lock database..."
echo "${CF_PROCESS_RUNNING}" | awk 'BEGIN { OFS=" "} {print $2 }' | xargs kill -9 || true
if [ -e /etc/init.d/rudder-agent ] && [ "${OS_FAMILY}" != "AIX" ] || [ "${OS_FAMILY}" = "SunOS" ]; then
/etc/init.d/rudder-agent forcestop || true
fi
clean_cf_lock_files
echo " Done"
fi
}

# Check the size of the cf_lock file
check_and_fix_cf_lock() {
MAX_CF_LOCK_SIZE=10485760
if [ -f "${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}" ]; then

if [ "${OS_FAMILY}" = "AIX" ]; then
CF_LOCK_SIZE=`istat "${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}" | grep Length | sed "s%^.*Length \([0-9]*\) bytes.*$%\1%"`
else
CF_LOCK_SIZE=`stat -c%s "${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}"`
fi

if [ "${CF_LOCK_SIZE}" -ge "${MAX_CF_LOCK_SIZE}" ]; then
printf "WARNING: The file ${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT} is too big (${CF_LOCK_SIZE} bytes), purging it..."
clean_cf_lock_files
echo " Done"
fi
fi
}

check_and_fix_rudder_uuid() {

# Default variable about UUID backup
LATEST_BACKUPED_UUID=""

# Generate a UUID if we don't have one yet
if [ ! -f "${UUID_FILE}" ]; then
if [ -d "${BACKUP_DIR}" ]; then
LATEST_BACKUPED_UUID=`ls -v1 ${BACKUP_DIR}uuid-*.hive | tail -n1`
fi
if [ "${LATEST_BACKUPED_UUID}" != "" ]; then
printf "WARNING: The UUID of the node does not exist. The lastest backup (${LATEST_BACKUPED_UUID}) will be recovered..."
${CP_A} ${LATEST_BACKUPED_UUID} ${UUID_FILE} >/dev/null 2>&1
echo " Done"
else
printf "WARNING: The UUID of the node does not exist and no backup exist. A new one will be generated..."
uuidgen > ${UUID_FILE}
echo " Done"
fi
else
# UUID is valid only if it has been generetaed by uuidgen or if it is set to 'root' for policy server
REGEX=`x="[a-f0-9][a-f0-9][a-f0-9][a-f0-9]" && echo "$x$x-$x-$x-$x-$x$x$x"`
CHECK_UUID=`cat /opt/rudder/etc/uuid.hive | egrep "^$REGEX|^root" | wc -l`
# If the UUID is not valid, regenerate it
if [ ${CHECK_UUID} -ne 1 ]; then
printf "INFO: Creating a new UUID for Rudder as the existing one is invalid..."
# Keep a backup of UUID even if it is not valid
mkdir -p "${BACKUP_DIR}"
cp -f /opt/rudder/etc/uuid.hive ${BACKUP_DIR}/uuid-`date +%Y%m%d`.hive
# Generate a new one
uuidgen > ${UUID_FILE}
echo " Done."
fi
fi
}


check_and_fix_specific_rudder_agent_file() {
FILE_TO_RESTORE=$1
FILE_BACKUP_NAME=$2
RESOLUTION=$3
MANDATORY=$4

LATEST_BACKUPED_FILES=""

if [ ! -f ${FILE_TO_RESTORE} ]; then
if [ -d ${BACKUP_DIR} ]; then
LATEST_BACKUPED_FILES=`ls -v1 ${BACKUP_DIR}${FILE_BACKUP_NAME}-* 2> /dev/null | tail -n1`
fi
if [ "${LATEST_BACKUPED_FILES}" != "" ]; then
printf "WARNING: The file ${FILE_TO_RESTORE} does not exist. The lastest backup (${LATEST_BACKUPED_FILES}) will be recovered..."
${CP_A} ${LATEST_BACKUPED_FILES} ${FILE_TO_RESTORE} >/dev/null 2>&1
echo " Done"
else
if [ "${MANDATORY}" = "mandatory" ]; then
echo "ERROR: The file ${FILE_TO_RESTORE} does not exist and no backup exists. ${RESOLUTION}"
fi
fi
fi
}

empty() {
[ ! -f "$1" ] || [ $(du "$1" | awk '{print $1}') = "0" ]
}

check_and_fix_inputs() {
# if file is absent or empty there have been a problem with update
if empty "${CFE_DIR}/inputs/common/1.0/update.cf" || empty "${CFE_DIR}/inputs/failsafe.cf" || empty "${CFE_DIR}/inputs/promises.cf" || ! ${CFE_BIN_DIR}/cf-promises -f failsafe.cf
then
printf "ERROR: There was an error during promises update. Reseting to initial promises and updating..."
rm -rf ${CFE_DIR}/inputs/*
if [ -d "/opt/rudder/share/initial-promises" ] # rudder package has been properly installed
then
cp -r /opt/rudder/share/initial-promises/* ${CFE_DIR}/inputs/
${CFE_BIN_DIR}/cf-agent -K -f failsafe.cf && ${CFE_BIN_DIR}/cf-agent
else # this is an enterprise only agent
if [ -e "${CFE_DIR}/policy_server.dat" ]
then
${CFE_BIN_DIR}/cf-agent --bootstrap `cat "${CFE_DIR}/policy_server.dat"`
fi
fi
echo " Done"
fi

# Ensure main promises are not broken, and force a new download if so
if ! ${CFE_BIN_DIR}/cf-promises
then
rm -f ${CFE_DIR}/inputs/rudder_promises_updated
fi
}

# Ensure script is executed by root
MYUID=`id | cut -d\( -f2 | cut -d\) -f1`
if [ "${MYUID}" != 'root' ];then echo "You must be root"; exit; fi

# Launch each check with a certain order
check_and_fix_rudder_uuid
check_and_fix_specific_rudder_agent_file /var/rudder/cfengine-community/policy_server.dat policy_server.dat "Please configure your Rudder server in this file." "optional"

# Once for rudder's cfengine
if [ -e "${RUDDER_CFE_DIR}/bin/cf-agent" ]
then
CFE_DIR="${RUDDER_CFE_DIR}"
CFE_BIN_DIR="${CFE_DIR}/bin"
LAST_UPDATE_FILE="${CFE_DIR}/last_successful_inputs_update"
NEED_INIT_FILES="true"
check_and_fix_cfengine_keys
check_and_fix_cfengine_processes
check_and_fix_cf_lock
check_and_fix_inputs
fi

# Once for enterprise cfengine
if [ -e "${ENTERPRISE_CFE_DIR}/bin/cf-agent" ]
then
CFE_DIR="${ENTERPRISE_CFE_DIR}"
CFE_BIN_DIR="${CFE_DIR}/bin"
LAST_UPDATE_FILE="${CFE_DIR}/last_successful_inputs_update"
check_and_fix_cfengine_keys
check_and_fix_cfengine_processes
check_and_fix_cf_lock
check_and_fix_inputs
fi

# The following files are not present on AIX systems
if [ "${NEED_INIT_FILES}" = "true" ] && [ "${OS_FAMILY}" != "AIX" ] || [ "${OS_FAMILY}" = "SunOS" ]; then
check_and_fix_specific_rudder_agent_file /etc/init.d/rudder-agent rudder-agent.init "Please reinstall the rudder-agent package." "mandatory"
check_and_fix_specific_rudder_agent_file /etc/default/rudder-agent rudder-agent.default "Please reinstall the rudder-agent package." "mandatory"
check_and_fix_specific_rudder_agent_file /etc/cron.d/rudder-agent rudder-agent.cron "Please reinstall the rudder-agent package." "mandatory"
fi

# Since we are in set -e there was no error here
[ "$QUIET" = false ] && printf "${GREEN}ok${NORMAL}: Rudder agent check ran without errors.\n"
28 changes: 28 additions & 0 deletions share/lib/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ clear_colors() {
NORMAL=""
}

# Command used to start/stop/restart a service
service_action() {
service="$1"
action="$2"
Expand Down Expand Up @@ -47,6 +48,33 @@ service_action() {
fi
}

init_commands() {
## If we are on AIX, use alternative commands and options
# detect OS family
OS_FAMILY=`uname -s`

if [ "${OS_FAMILY}" = "AIX" ] || [ "${OS_FAMILY}" = "SunOS" ]; then
CP_A="cp -hpPr"
else
CP_A="cp -a"
fi

# Detect the correct ps tool to use
ns=$(ps --no-header -o utsns --pid $$ 2>/dev/null || true)
if [ -d "/proc/bc" ] && [ -e "/proc/bc/0" ]; then # we have openvz
if [ -e /bin/vzps ]; then # we have vzps
PS_COMMAND="/bin/vzps -E 0"
else # use rudder provided vzps
PS_COMMAND="/opt/rudder/bin/vzps.py -E 0"
fi
elif [ -n "${ns}" ]; then # we have namespaces
# the sed is here to prepend a fake user field that is removed by the -o option (it is never used)
PS_COMMAND="eval ps --no-header -e -O utsns | grep -E '^[[:space:]]*[[:digit:]]*[[:space:]]+${ns}' | sed 's/^/user /'"
else # standard unix
PS_COMMAND="ps -ef"
fi
}

# Colors configuration (enable colors only if stdout is a terminal)
if [ -t 1 ]; then
COLOR="-Calways"
Expand Down