Skip to content

Commit

Permalink
Fixes #7409: Support cfengine enterprise in check-rudder-agent
Browse files Browse the repository at this point in the history
  • Loading branch information
peckpeck committed Nov 20, 2015
1 parent c5b1b85 commit 7a1ee9e
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 78 deletions.
179 changes: 103 additions & 76 deletions rudder-agent/SOURCES/check-rudder-agent
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,12 @@ echo_n() {
}

# Default variables for CFEngine binaries and disable files
CFE_DIR=/var/rudder/cfengine-community
CFE_BIN_DIR=${CFE_DIR}/bin
RUDDER_CFE_DIR=/var/rudder/cfengine-community
ENTERPRISE_CFE_DIR=/var/cfengine
CFE_DISABLE_FILE=/opt/rudder/etc/disable-agent

LAST_UPDATE_FILE=${CFE_DIR}/last_successful_inputs_update

UUID_FILE=/opt/rudder/etc/uuid.hive


clean_cf_lock_files() {
rm -f ${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}
rm -f ${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}.lock
Expand All @@ -83,32 +81,31 @@ check_and_fix_cfengine_processes() {
# A standard kill won't kill them, so the -9 is necessary to make sure they are stopped
# They will be restarted by the next check, if the disable file is not set
# List the cf-execd processes running (without the path, they can be run manually)
CF_EXECD_RUNNING=`${PS_COMMAND} | grep "cf-execd" | sed -e '/grep/d' | cat`
CF_EXECD_RUNNING=`${PS_COMMAND} | grep "${CFE_BIN_DIR}/cf-execd" | sed -e '/grep/d' | cat`
NB_CF_EXECD_RUNNING=`echo "${CF_EXECD_RUNNING}" | sed -e '/^$/d' | wc -l`
if [ ${NB_CF_EXECD_RUNNING} -gt 1 ]; then
echo_n "WARNING: Too many instance of CFEngine cf-execd processes running. Killing them..."
echo "${CF_EXECD_RUNNING}" | awk 'BEGIN { OFS=" "} {print $2 }' | xargs kill -9 || true
echo " Done"
fi


# List the CFEngine processes running
CF_PROCESS_RUNNING=`${PS_COMMAND} | egrep "${CFE_BIN_DIR}/(cf-execd|cf-agent)" | sed -e '/grep/d' | cat`
CF_PROCESS_RUNNING=`${PS_COMMAND} | sed -ne "\\%${CFE_BIN_DIR}/\\(cf-execd\\|cf-agent\\)%p" | cat`
# Count the number of processes running, filtering empty lines
NB_CF_PROCESS_RUNNING=`echo "${CF_PROCESS_RUNNING}" | sed -e '/^$/d' | wc -l`

# If no disable file AND no process of CFEngine from Rudder, then relaunch cf-agent with a failsafe first
# But this is applied only on servers or nodes already initialized (policy server set)
if [ ! -f ${CFE_DISABLE_FILE} ] && [ ${NB_CF_PROCESS_RUNNING} -eq 0 ] && [ -f ${CFE_DIR}/policy_server.dat ]; then
if [ ! -f "${CFE_DISABLE_FILE}" ] && [ "${NB_CF_PROCESS_RUNNING}" -eq 0 ] && [ -f "${CFE_DIR}/policy_server.dat" ]; then
echo_n "WARNING: No disable file detected and no CFEngine process neither. Relaunching CFEngine processes..."
${CFE_BIN_DIR}/cf-agent -f failsafe.cf >/dev/null 2>&1
${CFE_BIN_DIR}/cf-agent >/dev/null 2>&1
echo " Done"
fi

# Get the value of rudder-agent run interval from file /var/rudder/cfengine-community/inputs/run_interval
if [ -f /var/rudder/cfengine-community/inputs/run_interval ]; then
RUN_INTERVAL=`cat /var/rudder/cfengine-community/inputs/run_interval`
if [ -f "${CFE_DIR}/inputs/run_interval" ]; then
RUN_INTERVAL=`cat "${CFE_DIR}/inputs/run_interval"`
else
# File does not exist, use the default value of 5
RUN_INTERVAL=5
Expand All @@ -119,7 +116,7 @@ check_and_fix_cfengine_processes() {
# Check for the age of the last update file
# if it is older than CHECK_INTERVAL minutes, and the disable file is not defined, we
# need to kill the lock database
if [ ! -f ${LAST_UPDATE_FILE} ] || [ -f ${CFE_DISABLE_FILE} ]; then
if [ ! -f "${LAST_UPDATE_FILE}" ] || [ -f "${CFE_DISABLE_FILE}" ]; then
# Either the file ${LAST_UPDATE_FILE} is not yet present, and this node is
# probably not accepted yet, or the file ${CFE_DISABLE_FILE} is present, so
# the agent won't update the ${LAST_UPDATE_FILE}.
Expand All @@ -134,10 +131,10 @@ check_and_fix_cfengine_processes() {

# Check for anomalous number of CFEngine processes
# If there are more than 8 agent/executor processes, we should kill them, and purge the lock database
if [ ${NB_CF_PROCESS_RUNNING} -gt 8 ]; then
if [ "${NB_CF_PROCESS_RUNNING}" -gt 8 ]; then
echo_n "WARNING: Too many instance of CFEngine processes running. Killing them, and purging the CFEngine lock database..."
echo "${CF_PROCESS_RUNNING}" | awk 'BEGIN { OFS=" "} {print $2 }' | xargs kill -9 || true
if [ "z${OS_FAMILY}" != "zAIX" ]; then
if [ -e /etc/init.d/rudder-agent ] && [ "${OS_FAMILY}" != "AIX" ] || [ "${OS_FAMILY}" = "SunOS" ]; then
/etc/init.d/rudder-agent forcestop || true
fi
clean_cf_lock_files
Expand All @@ -150,13 +147,13 @@ check_and_fix_cf_lock() {
MAX_CF_LOCK_SIZE=10485760
if [ -f "${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}" ]; then

if [ "z${OS_FAMILY}" = "zAIX" ]; then
if [ "${OS_FAMILY}" = "AIX" ]; then
CF_LOCK_SIZE=`istat "${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}" | grep Length | sed "s%^.*Length \([0-9]*\) bytes.*$%\1%"`
else
CF_LOCK_SIZE=`stat -c%s "${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT}"`
fi

if [ ${CF_LOCK_SIZE} -ge ${MAX_CF_LOCK_SIZE} ]; then
if [ "${CF_LOCK_SIZE}" -ge "${MAX_CF_LOCK_SIZE}" ]; then
echo_n "WARNING: The file ${CFE_DIR}/state/cf_lock.${CFENGINE_DB_EXT} is too big (${CF_LOCK_SIZE} bytes), purging it..."
clean_cf_lock_files
echo " Done"
Expand All @@ -166,88 +163,118 @@ check_and_fix_cf_lock() {

check_and_fix_rudder_uuid() {

# Default variable about UUID backup
LATEST_BACKUPED_UUID=""

# Generate a UUID if we don't have one yet
if [ ! -f ${UUID_FILE} ]; then
if [ -d ${BACKUP_DIR} ]; then
LATEST_BACKUPED_UUID=`ls -v1 ${BACKUP_DIR}uuid-*.hive | tail -n1`
fi
if [ "z${LATEST_BACKUPED_UUID}" != "z" ]; then
echo_n "WARNING: The UUID of the node does not exist. The lastest backup (${LATEST_BACKUPED_UUID}) will be recovered..."
${CP_A} ${LATEST_BACKUPED_UUID} ${UUID_FILE} >/dev/null 2>&1
echo " Done"
# Default variable about UUID backup
LATEST_BACKUPED_UUID=""

# Generate a UUID if we don't have one yet
if [ ! -f "${UUID_FILE}" ]; then
if [ -d "${BACKUP_DIR}" ]; then
LATEST_BACKUPED_UUID=`ls -v1 ${BACKUP_DIR}uuid-*.hive | tail -n1`
fi
if [ "${LATEST_BACKUPED_UUID}" != "" ]; then
echo_n "WARNING: The UUID of the node does not exist. The lastest backup (${LATEST_BACKUPED_UUID}) will be recovered..."
${CP_A} ${LATEST_BACKUPED_UUID} ${UUID_FILE} >/dev/null 2>&1
echo " Done"
else
echo_n "WARNING: The UUID of the node does not exist and no backup exist. A new one will be generated..."
uuidgen > ${UUID_FILE}
echo " Done"
fi
else
echo_n "WARNING: The UUID of the node does not exist and no backup exist. A new one will be generated..."
uuidgen > ${UUID_FILE}
echo " Done"
fi
else
# UUID is valid only if it has been generated by uuidgen or if it is set to 'root' for policy server
REGEX=`x="[a-f0-9][a-f0-9][a-f0-9][a-f0-9]" && echo "$x$x-$x-$x-$x-$x$x$x"`
CHECK_UUID=`cat /opt/rudder/etc/uuid.hive | egrep "^$REGEX|^root" | wc -l`
# If the UUID is not valid, regenerate it
if [ ${CHECK_UUID} -ne 1 ]; then
echo_n "INFO: Creating a new UUID for Rudder as the existing one is invalid..."
# Keep a backup of UUID even if it is not valid
mkdir -p /var/backups/rudder
cp -f /opt/rudder/etc/uuid.hive /var/backups/rudder/uuid-`date +%Y%m%d`.hive
# Generate a new one
uuidgen > ${UUID_FILE}
echo " Done."
# UUID is valid only if it has been generated by uuidgen or if it is set to 'root' for policy server
REGEX=`x="[a-f0-9][a-f0-9][a-f0-9][a-f0-9]" && echo "$x$x-$x-$x-$x-$x$x$x"`
CHECK_UUID=`cat /opt/rudder/etc/uuid.hive | egrep "^$REGEX|^root" | wc -l`
# If the UUID is not valid, regenerate it
if [ ${CHECK_UUID} -ne 1 ]; then
echo_n "INFO: Creating a new UUID for Rudder as the existing one is invalid..."
# Keep a backup of UUID even if it is not valid
mkdir -p "${BACKUP_DIR}"
cp -f /opt/rudder/etc/uuid.hive ${BACKUP_DIR}/uuid-`date +%Y%m%d`.hive
# Generate a new one
uuidgen > ${UUID_FILE}
echo " Done."
fi
fi
fi
}


# Restore a missing rudder-agent system file from its most recent backup.
#
# Arguments:
#   $1 - path of the file that must exist
#   $2 - backup type tag; candidates are looked up as
#        ${BACKUP_DIR}rudder-agent.<type>-*
# Globals read:
#   BACKUP_DIR - concatenated directly with the backup file name, so it is
#                presumably expected to end with a "/" (defined outside
#                this view - TODO confirm)
#   CP_A       - copy command, presumably with its options (defined outside
#                this view - TODO confirm)
# Globals written: FILE_TO_RESTORE, FILE_TYPE, LATEST_BACKUPED_FILES
# Outputs: a WARNING line when a backup is restored, an ERROR line when the
#          file is missing and no backup could be found.
check_and_fix_specific_rudder_agent_file() {
  FILE_TO_RESTORE=$1
  FILE_TYPE=$2
  LATEST_BACKUPED_FILES=""

  if [ ! -f "${FILE_TO_RESTORE}" ]; then
    if [ -d "${BACKUP_DIR}" ]; then
      # Only the trailing "*" is left unquoted so that it still globs while
      # the directory and type survive whitespace. "ls -v" sorts names in
      # version order, so the last line is the most recent backup.
      LATEST_BACKUPED_FILES=`ls -v1 "${BACKUP_DIR}rudder-agent.${FILE_TYPE}-"* | tail -n1`
    fi
    if [ "${LATEST_BACKUPED_FILES}" != "" ]; then
      echo_n "WARNING: The file ${FILE_TO_RESTORE} does not exist. The lastest backup (${LATEST_BACKUPED_FILES}) will be recovered..."
      # CP_A stays unquoted on purpose: it must word-split into the command
      # and its options. The source and destination paths are quoted.
      ${CP_A} "${LATEST_BACKUPED_FILES}" "${FILE_TO_RESTORE}" >/dev/null 2>&1
      echo " Done"
    else
      echo "ERROR: The file ${FILE_TO_RESTORE} does not exist and no backup exist. Please reinstall the rudder-agent package"
    fi
  fi
}

FILE_TO_RESTORE=$1
FILE_TYPE=$2
LATEST_BACKUPED_FILES=""
# Tell whether a file is missing or has no content.
#
# Arguments: $1 - path to check
# Returns:   0 (true) when $1 is not a regular file or is zero bytes long,
#            1 (false) when it is a regular file holding at least one byte.
empty() {
  # "-s" checks the byte size directly. Counting blocks with "du" can report
  # 0 for a small non-empty file on filesystems that store data inline,
  # which would wrongly flag valid promises as empty.
  [ ! -f "$1" ] || [ ! -s "$1" ]
}

if [ ! -f ${FILE_TO_RESTORE} ]; then
if [ -d ${BACKUP_DIR} ]; then
LATEST_BACKUPED_FILES=`ls -v1 ${BACKUP_DIR}rudder-agent.${FILE_TYPE}-* | tail -n1`
fi
if [ "z${LATEST_BACKUPED_FILES}" != "z" ]; then
echo_n "WARNING: The file ${FILE_TO_RESTORE} does not exist. The lastest backup (${LATEST_BACKUPED_FILES}) will be recovered..."
${CP_A} ${LATEST_BACKUPED_FILES} ${FILE_TO_RESTORE} >/dev/null 2>&1
check_and_fix_inputs() {
# If a file is absent or empty, there has been a problem with the update
if empty "${CFE_DIR}/inputs/common/1.0/update.cf" || empty "${CFE_DIR}/inputs/failsafe.cf" || empty "${CFE_DIR}/inputs/promises.cf"
then
echo_n "ERROR: There was an error during promises update. Reseting to initial promises and updating..."
rm -rf ${CFE_DIR}/inputs/*
if [ -d "/opt/rudder/share/initial-promises" ] # rudder package has been properly installed
then
cp -r /opt/rudder/share/initial-promises/* ${CFE_DIR}/inputs/
${CFE_BIN_DIR}/cf-agent -K -f failsafe.cf && ${CFE_BIN_DIR}/cf-agent
else # this is an enterprise only agent
if [ -e "${CFE_DIR}/policy_server.dat" ]
then
${CFE_BIN_DIR}/cf-agent --bootstrap `cat "${CFE_DIR}/policy_server.dat"`
fi
fi
echo " Done"
else
echo "ERROR: The file ${FILE_TO_RESTORE} does not exist and no backup exist. Please reinstall the rudder-agent package"
fi
fi
}


# Ensure script is executed by root
MYUID=`id | cut -d\( -f2 | cut -d\) -f1`
if [ "${MYUID}" != 'root' ];then echo "You must be root"; exit; fi

# Launch each check with a certain order
check_and_fix_rudder_uuid
check_and_fix_cfengine_processes
check_and_fix_cf_lock

# Once for rudder's cfengine
if [ -e "${RUDDER_CFE_DIR}/bin/cf-agent" ]
then
CFE_DIR="${RUDDER_CFE_DIR}"
CFE_BIN_DIR="${CFE_DIR}/bin"
LAST_UPDATE_FILE="${CFE_DIR}/last_successful_inputs_update"
NEED_INIT_FILES="true"
check_and_fix_cfengine_processes
check_and_fix_cf_lock
check_and_fix_inputs
fi

# Once for enterprise cfengine
if [ -e "${ENTERPRISE_CFE_DIR}/bin/cf-agent" ]
then
CFE_DIR="${ENTERPRISE_CFE_DIR}"
CFE_BIN_DIR="${CFE_DIR}/bin"
LAST_UPDATE_FILE="${CFE_DIR}/last_successful_inputs_update"
check_and_fix_cfengine_processes
check_and_fix_cf_lock
check_and_fix_inputs
fi

# The following files are not present on AIX systems
if [ "z${OS_FAMILY}" != "zAIX" ]; then
if [ "${NEED_INIT_FILES}" = "true" ] && [ "${OS_FAMILY}" != "AIX" ] || [ "${OS_FAMILY}" = "SunOS" ]; then
check_and_fix_specific_rudder_agent_file /etc/init.d/rudder-agent init
check_and_fix_specific_rudder_agent_file /etc/default/rudder-agent default
check_and_fix_specific_rudder_agent_file /etc/cron.d/rudder-agent cron
fi

# If a file is absent or empty, there has been a problem with the update
base="/var/rudder/cfengine-community/inputs"
# Tell whether a file is missing or has no content.
#
# Arguments: $1 - path to check
# Returns:   0 (true) when $1 is not a regular file or is zero bytes long,
#            1 (false) when it is a regular file holding at least one byte.
empty() {
  # "-s" checks the byte size directly. Counting blocks with "du" can report
  # 0 for a small non-empty file on filesystems that store data inline,
  # which would wrongly flag valid promises as empty.
  [ ! -f "$1" ] || [ ! -s "$1" ]
}
if empty "${base}/common/1.0/update.cf" || empty "${base}/failsafe.cf" || empty "${base}/promises.cf"
then
echo_n "ERROR: There was an error during promises update. Reseting to initial promises and updating..."
rm -rf /var/rudder/cfengine-community/inputs/*
cp -r /opt/rudder/share/initial-promises/* /var/rudder/cfengine-community/inputs
/var/rudder/cfengine-community/bin/cf-agent -K -f failsafe.cf && /var/rudder/cfengine-community/bin/cf-agent
echo " Done"
fi
3 changes: 1 addition & 2 deletions rudder-agent/SOURCES/rudder-agent.init
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ fi


# CFEngine Community directory and files
CFENGINE_COMMUNITY_PATH="/opt/rudder"
CFENGINE_COMMUNITY_VAR_PATH="/var/rudder/cfengine-community"
CFENGINE_COMMUNITY_RUN_0="1"
CFENGINE_COMMUNITY_RUN_1="1"
Expand Down Expand Up @@ -196,7 +195,7 @@ start_daemons() {
if [ $i -eq $TIMEOUT -o $i -eq `expr $TIMEOUT / 2` ]
then
eval message "alert" "\"[ALERT] no PID file for \${CFENGINE_COMMUNITY_NAME_$daemon} after $i seconds. Trying again...\""
eval "\${CFENGINE_COMMUNITY_BIN_\$daemon}" "\${CFENGINE_COMMUNITY_PARAMS_$daemon}"
eval "\${CFENGINE_COMMUNITY_BIN_$daemon}" "\${CFENGINE_COMMUNITY_PARAMS_$daemon}"
fi

if [ $i -eq `expr $TIMEOUT + 10` ]
Expand Down

0 comments on commit 7a1ee9e

Please sign in to comment.