Permalink
Browse files

PT43289461: Raise alarms when issues with Mongo

- instruct sec to watch mongodb logs
- added 4 new alarms:
MONGO_FATAL_REPLICATION_STOP raised when a node cannot replicate
MONGO_FAILED_ELECTION raised when an election fails
MONGO_MEMBER_DOWN raised by a node when it cannot see another node in the replica set
MONGO_NODE_STATE_CHANGED raised whenever node state is changed
  • Loading branch information...
1 parent 3d8d6ca commit 83365d8105d847edf66a5300bd5058fdacac64d9 @dizzy dizzy committed Jan 30, 2013
@@ -17,6 +17,7 @@
package org.sipfoundry.sipxconfig.mongo;
import org.sipfoundry.sipxconfig.address.AddressType;
+import org.sipfoundry.sipxconfig.alarm.AlarmDefinition;
import org.sipfoundry.sipxconfig.feature.LocationFeature;
public interface MongoManager {
@@ -26,6 +27,11 @@
public static final AddressType ARBITOR_ADDRESS_ID = new AddressType(ARBITOR, MongoSettings.ARBITER_PORT);
public static final LocationFeature FEATURE_ID = new LocationFeature(MONGO);
public static final LocationFeature ARBITER_FEATURE = new LocationFeature(ARBITOR);
+ public static final AlarmDefinition MONGO_FATAL_REPLICATION_STOP =
+ new AlarmDefinition("MONGO_FATAL_REPLICATION_STOP");
+ public static final AlarmDefinition MONGO_FAILED_ELECTION = new AlarmDefinition("MONGO_FAILED_ELECTION", 1);
+ public static final AlarmDefinition MONGO_MEMBER_DOWN = new AlarmDefinition("MONGO_MEMBER_DOWN", 2);
+ public static final AlarmDefinition MONGO_NODE_STATE_CHANGED = new AlarmDefinition("MONGO_NODE_STATE_CHANGED");
public MongoSettings getSettings();
@@ -25,6 +25,9 @@
import org.sipfoundry.sipxconfig.address.AddressManager;
import org.sipfoundry.sipxconfig.address.AddressProvider;
import org.sipfoundry.sipxconfig.address.AddressType;
+import org.sipfoundry.sipxconfig.alarm.AlarmDefinition;
+import org.sipfoundry.sipxconfig.alarm.AlarmProvider;
+import org.sipfoundry.sipxconfig.alarm.AlarmServerManager;
import org.sipfoundry.sipxconfig.commserver.Location;
import org.sipfoundry.sipxconfig.feature.Bundle;
import org.sipfoundry.sipxconfig.feature.FeatureChangeRequest;
@@ -46,7 +49,7 @@
import org.sipfoundry.sipxconfig.snmp.SnmpManager;
public class MongoManagerImpl implements AddressProvider, FeatureProvider, MongoManager, ProcessProvider,
- SetupListener, FirewallProvider {
+ SetupListener, FirewallProvider, AlarmProvider {
private BeanWithSettingsDao<MongoSettings> m_settingsDao;
public MongoSettings getSettings() {
@@ -155,4 +158,16 @@ public void featureChangePrecommit(FeatureManager manager, FeatureChangeValidato
@Override
public void featureChangePostcommit(FeatureManager manager, FeatureChangeRequest request) {
}
+
+ /**
+  * Lists the alarm definitions this provider contributes for MongoDB.
+  *
+  * @param manager alarm server manager; its feature manager is asked whether
+  *        {@link MongoManager#FEATURE_ID} is enabled
+  * @return {@code null} when {@code isFeatureEnabled} reports {@code false} for
+  *         {@link MongoManager#FEATURE_ID}; otherwise a fixed-size collection holding
+  *         MONGO_FATAL_REPLICATION_STOP, MONGO_FAILED_ELECTION, MONGO_MEMBER_DOWN and
+  *         MONGO_NODE_STATE_CHANGED
+  */
+ @Override
+ public Collection<AlarmDefinition> getAvailableAlarms(AlarmServerManager manager) {
+     // NOTE(review): the previous guard tested FEATURE_ID twice joined by "||", so the
+     // second operand could never change the outcome. It may have been meant to test a
+     // different feature (possibly ARBITER_FEATURE) - confirm the intent before widening
+     // the condition. Collapsed to a single check; the result is unchanged.
+     if (!manager.getFeatureManager().isFeatureEnabled(MongoManager.FEATURE_ID)) {
+         // null (not an empty collection) is kept because callers are outside this view.
+         return null;
+     }
+     return Arrays.asList(MONGO_FATAL_REPLICATION_STOP, MONGO_FAILED_ELECTION, MONGO_MEMBER_DOWN,
+             MONGO_NODE_STATE_CHANGED);
+ }
}
@@ -56,5 +56,11 @@
<property name="replicationManager" ref="replicationManager"/>
<property name="configManager" ref="configManager" />
</bean>
+
+ <bean id="mongoAlarms" class="org.springframework.context.support.ResourceBundleMessageSource">
+ <property name="basename">
+ <value>org.sipfoundry.sipxconfig.mongo.mongo</value>
+ </property>
+ </bean>
</beans>
@@ -0,0 +1,9 @@
+alarm.MONGO_FATAL_REPLICATION_STOP.label=Database replication stopped
+alarm.MONGO_FATAL_REPLICATION_STOP.resolution=Remove node from replica set, restart node and add it back to replica set.
+alarm.MONGO_FAILED_ELECTION.label=Repeated, failed database election
+alarm.MONGO_FAILED_ELECTION.resolution=The number of votes within the database replica set is even. Check that the configuration is correct and check the database states.
+alarm.MONGO_MEMBER_DOWN.label=Replica set member down (or slow to respond)
+alarm.MONGO_MEMBER_DOWN.resolution=Check state of replica set member or network connectivity between nodes.
+alarm.MONGO_NODE_STATE_CHANGED.label=Replica set member state changed
+alarm.MONGO_NODE_STATE_CHANGED.resolution=Check whether this is the expected state for this node.
+
View
@@ -1,5 +1,18 @@
include $(top_srcdir)/config/utility.am
+EXTRA_DIST = \
+ $(conf_DATA:=.in)
+
cfpluginsdir = $(SIPX_CFINPUTS)/plugin.d
dist_cfplugins_DATA = \
mongodb.cf
+
+confdir = $(SIPX_CONFDIR)/mongo
+
+conf_DATA = \
+ mongo.sec.erb
+
+$(conf_DATA) : % : %.in Makefile
+ @$(call SearchAndReplace,$<,$@)
+
+CLEANFILES = $(conf_DATA)
@@ -0,0 +1,44 @@
+<%
+ if MONGO_FAILED_ELECTION[:groupName] != 'disabled'
+%>
+type=SingleWithThreshold
+ptype=RegExp
+pattern=total number of votes is even
+desc=<%= MONGO_FAILED_ELECTION[:minThreshold] %> Failed election within a 60 second interval.
+action=shellcmd @SIPX_BINDIR@/sipxtrap MONGO_FAILED_ELECTION '%s'
+window=60
+thresh=<%= MONGO_FAILED_ELECTION[:minThreshold] %>
+<%
+ end
+ if MONGO_MEMBER_DOWN[:groupName] != 'disabled'
+%>
+type=SingleWithThreshold
+ptype=RegExp
+pattern=couldn't connect to (\S+)
+desc=<%= MONGO_MEMBER_DOWN[:minThreshold] %> attempts to connect to replica set node $1 within a 60 second interval failed.
+action=shellcmd @SIPX_BINDIR@/sipxtrap MONGO_MEMBER_DOWN '%s'
+window=60
+thresh=<%= MONGO_MEMBER_DOWN[:minThreshold] %>
+<%
+ end
+ if MONGO_FATAL_REPLICATION_STOP[:groupName] != 'disabled'
+%>
+type=Single
+ptype=RegExp
+pattern=replSet error fatal, stopping replication
+desc=Fatal replica set error, stopping replication
+action=shellcmd @SIPX_BINDIR@/sipxtrap MONGO_FATAL_REPLICATION_STOP '%s'
+<%
+ end
+ if MONGO_NODE_STATE_CHANGED[:groupName] != 'disabled'
+%>
+type=Single
+ptype=RegExp
+pattern=replSet member (\S+) is now in state (\S+)
+desc=Replica set node $1 is now in state $2
+action=shellcmd @SIPX_BINDIR@/sipxtrap MONGO_NODE_STATE_CHANGED '%s'
+<%
+ end
+%>
+
+
View
@@ -59,6 +59,19 @@ bundle agent mongodb {
any::
# at this moment, all setups get mongo client config
"any" usebundle => "mongodb_client";
+
+ sipxlogwatcher::
+ "any" usebundle => "mongo_logwatcher";
+}
+
+bundle agent mongo_logwatcher {
+ files:
+ "$(sipx.SIPX_CONFDIR)/sipxlogwatcher.d/mongo.sec"
+ comment => "mongo log watcher",
+ create => "true",
+ perms => m(644),
+ transformer => "$(sipx.SIPX_LIBEXECDIR)/sipxlogwatcher-maker --in $(sipx.SIPX_CFDATA)/$(sipx.location_id)/alarms.yaml --template $(sipx.SIPX_CONFDIR)/mongo/mongo.sec.erb --out $(this.promiser)",
+ classes => if_repaired("reload_sipxlogwatcher");
}
bundle agent mongodb_arbiter_config {
@@ -62,6 +62,7 @@ rm -rf $RPM_BUILD_ROOT
%files server
%defattr(644,root,root,755)
+%attr(755,root,root) %{_sysconfdir}/sipxpbx/mongo/mongo.sec.erb
%attr(755,root,root) %{_sysconfdir}/init.d/mongod-arbiter
%attr(755,root,root) %{_bindir}/mongodb-repair
%{_datadir}/sipxecs/cfinputs/plugin.d/mongodb.cf
@@ -38,7 +38,7 @@ bundle agent sipxlogwatcher_config {
bundle edit_line sipxlogwatcher_config_contents {
insert_lines:
any::
- "SEC_ARGS=\"--log=$(sipx.SIPX_LOGDIR)/sipxlogwatcher.log --debug=$(sipx.logwatcher_debug) --detach --conf=$(sipx.SIPX_CONFDIR)/sipxlogwatcher.d/*.sec --input=$(sipx.SIPX_LOGDIR)/*.log --input=/var/log/messages --pid=$(sipx.SIPX_RUNDIR)/sipxlogwatcher.pid\"";
+ "SEC_ARGS=\"--log=$(sipx.SIPX_LOGDIR)/sipxlogwatcher.log --debug=$(sipx.logwatcher_debug) --detach --conf=$(sipx.SIPX_CONFDIR)/sipxlogwatcher.d/*.sec --input=$(sipx.SIPX_LOGDIR)/*.log --input=/var/log/messages --input=/var/log/mongodb/mongodb.log --pid=$(sipx.SIPX_RUNDIR)/sipxlogwatcher.pid\"";
delete_lines:
any::

0 comments on commit 83365d8

Please sign in to comment.