From bec7c38a9334df69ad067337fab44222177c13ed Mon Sep 17 00:00:00 2001 From: Joseph Phillips Date: Wed, 16 Jun 2021 13:14:58 +0200 Subject: [PATCH] Ensures that when the executor returns a mutex wait cancellation error, the resolver loop returns ErrRestart. This ensures that when the migration-inactive flag drops upon migration commencement, we do not cause the unit to go into an error state, in turn causing migration validation to fail. Instead we mimic the behaviour for errors determined to be due to fortress lockdown. --- worker/uniter/operation/executor.go | 2 +- worker/uniter/resolver/loop.go | 15 ++++++++++++++ worker/uniter/resolver/loop_test.go | 31 ++++++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/worker/uniter/operation/executor.go b/worker/uniter/operation/executor.go index f4d8ca2d174..b22a292fa2b 100644 --- a/worker/uniter/operation/executor.go +++ b/worker/uniter/operation/executor.go @@ -87,7 +87,7 @@ func (x *executor) Run(op Operation, remoteStateChange <-chan remotestate.Snapsh if op.NeedsGlobalMachineLock() { releaser, err := x.acquireMachineLock(op.String()) if err != nil { - return errors.Annotatef(err, "could not acquire %q lock for %s", op, x.unitName) + return errors.Annotatef(err, "acquiring %q lock for %s", op, x.unitName) } defer x.logger.Debugf("lock released for %s", x.unitName) defer releaser() diff --git a/worker/uniter/resolver/loop.go b/worker/uniter/resolver/loop.go index 09c157bb0c9..4ef927d83c1 100644 --- a/worker/uniter/resolver/loop.go +++ b/worker/uniter/resolver/loop.go @@ -6,6 +6,7 @@ package resolver import ( "github.com/juju/charm/v7/hooks" "github.com/juju/errors" + "github.com/juju/mutex" "github.com/juju/juju/core/lxdprofile" "github.com/juju/juju/worker/fortress" @@ -20,6 +21,7 @@ var logger interface{} // Logger represents the logging methods used in this package. type Logger interface { Errorf(string, ...interface{}) + Warningf(string, ...interface{}) Debugf(string, ...interface{}) Tracef(string, ...interface{}) } @@ -114,6 +116,19 @@ func Loop(cfg LoopConfig, localState *LocalState) error { cfg.Logger.Tracef("running op: %v", op) if err := cfg.Executor.Run(op, remoteStateChanged); err != nil { close(done) + + if errors.Cause(err) == mutex.ErrCancelled { + // If the lock acquisition was cancelled (such as when the + // migration-inactive flag drops, we do not want the + // resolver to surface that error. This puts the agent into + // the "failed" state, which causes the initial migration + // validation phase to fail. + // The safest thing to do is to bounce the loop and + // reevaluate our state, which is what happens upon a + // fortress error anyway (uniter.TranslateFortressErrors). + cfg.Logger.Warningf("executor lock acquisition cancelled") + return ErrRestart + } return errors.Trace(err) } close(done) diff --git a/worker/uniter/resolver/loop_test.go b/worker/uniter/resolver/loop_test.go index bd282c724df..348f685d1df 100644 --- a/worker/uniter/resolver/loop_test.go +++ b/worker/uniter/resolver/loop_test.go @@ -10,7 +10,7 @@ import ( "github.com/juju/charm/v7" "github.com/juju/charm/v7/hooks" "github.com/juju/loggo" - + "github.com/juju/mutex" envtesting "github.com/juju/testing" jc "github.com/juju/testing/checkers" gc "gopkg.in/check.v1" @@ -259,7 +259,7 @@ func (s *LoopSuite) TestLoopWithChange(c *gc.C) { } func (s *LoopSuite) TestRunFails(c *gc.C) { - s.executor.SetErrors(errors.New("Run fails")) + s.executor.SetErrors(errors.New("run fails")) s.resolver = resolver.ResolverFunc(func( _ resolver.LocalState, _ remotestate.Snapshot, @@ -268,7 +268,7 @@ func (s *LoopSuite) TestRunFails(c *gc.C) { return mockOp{}, nil }) _, err := s.loop() - c.Assert(err, gc.ErrorMatches, "Run fails") + c.Assert(err, gc.ErrorMatches, "run fails") } func (s *LoopSuite) TestNextOpFails(c *gc.C) { @@ -412,6 +412,31 @@ func (s *LoopSuite) testCheckCharmUpgradeCallsRun(c *gc.C) { s.executor.CheckCallNames(c, "State", "State", "Run", "State") } +func (s *LoopSuite) TestCancelledLockAcquisitionCausesRestart(c *gc.C) { + s.executor = &mockOpExecutor{ + Executor: nil, + Stub: envtesting.Stub{}, + st: operation.State{ + Started: true, + Kind: operation.Continue, + }, + run: func(operation.Operation, <-chan remotestate.Snapshot) error { + return mutex.ErrCancelled + }, + } + + s.resolver = resolver.ResolverFunc(func( + _ resolver.LocalState, + _ remotestate.Snapshot, + _ operation.Factory, + ) (operation.Operation, error) { + return &mockOp{}, nil + }) + + _, err := s.loop() + c.Assert(err, gc.Equals, resolver.ErrRestart) +} + func waitChannel(c *gc.C, ch <-chan interface{}, activity string) interface{} { select { case v := <-ch: