Skip to content

Commit

Permalink
Support of WorkflowExecutorListener.handlePotentiallyStuck (#391)
Browse files Browse the repository at this point in the history
* make test more robust

* Support of WorkflowExecutorListener.handlePotentiallyStuck

* refactor

* Unwrap the real exception from UndeclaredThrowableException

* Test reliably that the interruption of a workflow worked correctly

* method ref and assertFalse

* add entry to changelog

* fix changelog

* remove unnecessary lenient

* Update CHANGELOG.md

* pass ListenerContext to handlePotentiallyStuck

Co-authored-by: Edvard Fonsell <edvard.fonsell@nitorcreations.com>
Co-authored-by: Mikko Tiihonen <gmokki@gmail.com>
  • Loading branch information
3 people committed Apr 18, 2020
1 parent 0785be9 commit a6bce76
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 47 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
**Highlights**
- `nflow-engine`
- Expedited clean shutdown for workflows that run many steps without delays.
- Add support for custom logic when nFlow engine thinks the workflow state processing may be stuck.

**Details**
- `nflow-engine`
- When shutdown is requested, stop processing workflows immediately after the current state has been executed.
- Add `WorkflowExecutorLister.handlePotentiallyStuck(Duration processingTime)` to support custom handling when nFlow engine thinks the workflow state processing may be stuck. If any registered listener implementation returns true from this method, nFlow will interrupt the processing thread. The default implementation returns false.
- Dependency updates:
- spring 5.2.5
- jackson 2.10.3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.commons.lang3.exception.ExceptionUtils.getStackTrace;
import static org.joda.time.DateTime.now;
import static org.joda.time.DateTimeUtils.currentTimeMillis;
import static org.joda.time.Duration.standardMinutes;
import static org.slf4j.LoggerFactory.getLogger;
import static org.springframework.util.ReflectionUtils.invokeMethod;

import java.lang.reflect.UndeclaredThrowableException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
Expand Down Expand Up @@ -80,8 +80,9 @@ class WorkflowStateProcessor implements Runnable {
private final int stateSaveRetryDelay;
private final int stateVariableValueTooLongRetryDelay;
private final Map<Long, WorkflowStateProcessor> processingInstances;
private long startTimeSeconds;
private DateTime startTime;
private Thread thread;
private ListenerContext listenerContext;

WorkflowStateProcessor(long instanceId, Supplier<Boolean> shutdownRequested, ObjectStringMapper objectMapper, WorkflowDefinitionService workflowDefinitions,
WorkflowInstanceService workflowInstances, WorkflowInstanceDao workflowInstanceDao, MaintenanceDao maintenanceDao,
Expand Down Expand Up @@ -109,7 +110,7 @@ class WorkflowStateProcessor implements Runnable {
@Override
public void run() {
MDC.put(MDC_KEY, String.valueOf(instanceId));
startTimeSeconds = currentTimeMillis() / 1000;
startTime = now();
thread = currentThread();
processingInstances.put(instanceId, this);
while (true) {
Expand Down Expand Up @@ -141,10 +142,10 @@ private void runImpl() {
WorkflowSettings settings = definition.getSettings();
int subsequentStateExecutions = 0;
while (instance.status == executing && !shutdownRequested.get()) {
startTimeSeconds = currentTimeMillis() / 1000;
startTime = now();
StateExecutionImpl execution = new StateExecutionImpl(instance, objectMapper, workflowInstanceDao,
workflowInstancePreProcessor, workflowInstances);
ListenerContext listenerContext = new ListenerContext(definition, instance, execution);
listenerContext = new ListenerContext(definition, instance, execution);
WorkflowInstanceAction.Builder actionBuilder = new WorkflowInstanceAction.Builder(instance);
WorkflowState state;
try {
Expand All @@ -155,12 +156,15 @@ private void runImpl() {
}
boolean saveInstanceState = true;
try {
processBeforeListeners(listenerContext);
listenerContext.nextAction = processWithListeners(listenerContext, instance, definition, execution, state);
processBeforeListeners();
listenerContext.nextAction = processWithListeners(instance, definition, execution, state);
} catch (StateVariableValueTooLongException e) {
instance = rescheduleStateVariableValueTooLong(e, instance);
saveInstanceState = false;
} catch (Throwable t) {
if (t instanceof UndeclaredThrowableException) {
t = t.getCause();
}
execution.setFailed(t);
if (state.isRetryAllowed(t)) {
logger.error("Handler threw a retryable exception, trying again later.", t);
Expand All @@ -175,9 +179,9 @@ private void runImpl() {
} finally {
if (saveInstanceState) {
if (execution.isFailed()) {
processAfterFailureListeners(listenerContext, execution.getThrown());
processAfterFailureListeners(execution.getThrown());
} else {
processAfterListeners(listenerContext);
processAfterListeners();
optionallyCleanupWorkflowInstanceHistory(definition.getSettings(), execution);
}
subsequentStateExecutions = busyLoopPrevention(state, settings, subsequentStateExecutions, execution);
Expand Down Expand Up @@ -337,7 +341,7 @@ private boolean isNextActivationImmediately(StateExecutionImpl execution) {
&& !execution.getNextActivation().isAfterNow();
}

private NextAction processWithListeners(ListenerContext listenerContext, WorkflowInstance instance,
private NextAction processWithListeners(WorkflowInstance instance,
AbstractWorkflowDefinition<? extends WorkflowState> definition, StateExecutionImpl execution, WorkflowState state) {
ProcessingExecutorListener processingListener = new ProcessingExecutorListener(instance, definition, execution, state);
List<WorkflowExecutorListener> chain = new ArrayList<>(executorListeners.size() + 1);
Expand Down Expand Up @@ -365,7 +369,7 @@ private void optionallyCleanupWorkflowInstanceHistory(WorkflowSettings settings,

private void sleepIgnoreInterrupted(int seconds) {
try {
Thread.sleep(SECONDS.toMillis(seconds));
SECONDS.sleep(seconds);
} catch (@SuppressWarnings("unused") InterruptedException ok) {
}
}
Expand Down Expand Up @@ -400,7 +404,7 @@ public ProcessingExecutorListener(WorkflowInstance instance, AbstractWorkflowDef
}

@Override
public NextAction process(ListenerContext listenerContext, ListenerChain chain) {
public NextAction process(ListenerContext context, ListenerChain chain) {
return new NormalStateHandler(instance, definition, execution, state).processState();
}
}
Expand Down Expand Up @@ -511,7 +515,7 @@ public NextAction processState() {

}

private void processBeforeListeners(ListenerContext listenerContext) {
private void processBeforeListeners() {
for (WorkflowExecutorListener listener : executorListeners) {
try {
listener.beforeProcessing(listenerContext);
Expand All @@ -521,7 +525,7 @@ private void processBeforeListeners(ListenerContext listenerContext) {
}
}

private void processAfterListeners(ListenerContext listenerContext) {
private void processAfterListeners() {
for (WorkflowExecutorListener listener : executorListeners) {
try {
listener.afterProcessing(listenerContext);
Expand All @@ -531,7 +535,7 @@ private void processAfterListeners(ListenerContext listenerContext) {
}
}

private void processAfterFailureListeners(ListenerContext listenerContext, Throwable ex) {
private void processAfterFailureListeners(Throwable ex) {
for (WorkflowExecutorListener listener : executorListeners) {
try {
listener.afterFailure(listenerContext, ex);
Expand All @@ -541,8 +545,8 @@ private void processAfterFailureListeners(ListenerContext listenerContext, Throw
}
}

public long getStartTimeSeconds() {
return startTimeSeconds;
public DateTime getStartTime() {
return startTime;
}

public void logPotentiallyStuck(long processingTimeSeconds) {
Expand All @@ -558,4 +562,19 @@ private StringBuilder getStackTraceAsString() {
return sb;
}

public void handlePotentiallyStuck(Duration processingTime) {
boolean interrupt = false;
for (WorkflowExecutorListener listener : executorListeners) {
try {
if (listener.handlePotentiallyStuck(listenerContext, processingTime)) {
interrupt = true;
}
} catch (Throwable t) {
logger.error("Error in " + listener.getClass().getName() + ".handleStuck (" + t.getMessage() + ")", t);
}
}
if (interrupt) {
thread.interrupt();
}
}
}
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package io.nflow.engine.internal.executor;

import static org.joda.time.DateTimeUtils.currentTimeMillis;
import static org.joda.time.DateTime.now;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Supplier;

import javax.inject.Inject;

import org.joda.time.DateTime;
import org.joda.time.Duration;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.env.Environment;
import org.springframework.stereotype.Component;
Expand Down Expand Up @@ -54,13 +56,14 @@ public WorkflowStateProcessor createProcessor(long instanceId, Supplier<Boolean>
}

public int getPotentiallyStuckProcessors() {
long currentTimeSeconds = currentTimeMillis() / 1000;
DateTime currentTime = now();
int potentiallyStuck = 0;
for (WorkflowStateProcessor processor : processingInstances.values()) {
long processingTimeSeconds = currentTimeSeconds - processor.getStartTimeSeconds();
if (processingTimeSeconds > stuckThreadThresholdSeconds) {
Duration processingTime = new Duration(processor.getStartTime(), currentTime);
if (processingTime.getStandardSeconds() > stuckThreadThresholdSeconds) {
potentiallyStuck++;
processor.logPotentiallyStuck(processingTimeSeconds);
processor.logPotentiallyStuck(processingTime.getStandardSeconds());
processor.handlePotentiallyStuck(processingTime);
}
}
return potentiallyStuck;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.Map;

import org.joda.time.DateTime;
import org.joda.time.Duration;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import io.nflow.engine.model.ModelObject;
Expand Down Expand Up @@ -129,4 +130,14 @@ default void afterFailure(ListenerContext listenerContext, Throwable throwable)
// no-op
}

/**
* Called when instance processing is potentially stuck. Return true to interrupt the processing thread. Default implementation
* returns false.
* @param listenerContext The listener context.
* @param processingTime How long the instances has been processed.
* @return True if processing should be interruped, false otherwise.
*/
default boolean handlePotentiallyStuck(ListenerContext listenerContext, Duration processingTime) {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
import static java.lang.Boolean.FALSE;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;
import static org.joda.time.DateTimeUtils.currentTimeMillis;
import static org.joda.time.DateTime.now;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyLong;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import org.joda.time.Duration;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mock;
Expand Down Expand Up @@ -77,15 +79,18 @@ public void factoryCreatesExecutorsWithListeners() {
public void checkIfStateProcessorsAreStuckLogsLongRunningInstance() {
WorkflowStateProcessor executor1 = mock(WorkflowStateProcessor.class);
WorkflowStateProcessor executor2 = mock(WorkflowStateProcessor.class);
when(executor1.getStartTimeSeconds()).thenReturn(currentTimeMillis() / 1000 - STUCK_THREAD_THRESHOLD - 1);
when(executor2.getStartTimeSeconds()).thenReturn(currentTimeMillis() / 1000 - STUCK_THREAD_THRESHOLD);
when(executor1.getStartTime()).thenReturn(now().minusSeconds(STUCK_THREAD_THRESHOLD + 1));
when(executor2.getStartTime()).thenReturn(now().minusSeconds(STUCK_THREAD_THRESHOLD));
factory.processingInstances.put(111L, executor1);
factory.processingInstances.put(222L, executor2);
factory.listeners = listeners;

int potentiallyStuckProcessors = factory.getPotentiallyStuckProcessors();

assertThat(potentiallyStuckProcessors, is(1));
verify(executor1).logPotentiallyStuck(anyLong());
verify(executor2, never()).logPotentiallyStuck(anyLong());
verify(executor1).handlePotentiallyStuck(any(Duration.class));
verify(executor2, never()).handlePotentiallyStuck(any(Duration.class));
}
}
Loading

0 comments on commit a6bce76

Please sign in to comment.