From 35015d4f78325b638ca63492d3555705fdd7df56 Mon Sep 17 00:00:00 2001 From: Joseph Phillips Date: Fri, 9 Apr 2021 11:14:15 +0200 Subject: [PATCH 1/2] Fixes typo in machine-lock error message. --- core/machinelock/machinelock.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/machinelock/machinelock.go b/core/machinelock/machinelock.go index 36dbad87eba..f5fa8b7bbae 100644 --- a/core/machinelock/machinelock.go +++ b/core/machinelock/machinelock.go @@ -131,7 +131,7 @@ func (s Spec) Validate() error { } } if s.Worker == "" { - return errors.NotValidf("mssing Worker") + return errors.NotValidf("missing Worker") } return nil } From bec7c38a9334df69ad067337fab44222177c13ed Mon Sep 17 00:00:00 2001 From: Joseph Phillips Date: Wed, 16 Jun 2021 13:14:58 +0200 Subject: [PATCH 2/2] Ensures that when the executor returns a mutex wait cancellation error, the resolver loop returns ErrRestart. This ensures that when the migration-inactive flag drops upon migration commencement, we do not cause the unit to go into an error state, in turn causing migration validation to fail. Instead we mimic the behaviour for errors determined to be due to fortress lockdown.
--- worker/uniter/operation/executor.go | 2 +- worker/uniter/resolver/loop.go | 15 ++++++++++++++ worker/uniter/resolver/loop_test.go | 31 ++++++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/worker/uniter/operation/executor.go b/worker/uniter/operation/executor.go index f4d8ca2d174..b22a292fa2b 100644 --- a/worker/uniter/operation/executor.go +++ b/worker/uniter/operation/executor.go @@ -87,7 +87,7 @@ func (x *executor) Run(op Operation, remoteStateChange <-chan remotestate.Snapsh if op.NeedsGlobalMachineLock() { releaser, err := x.acquireMachineLock(op.String()) if err != nil { - return errors.Annotatef(err, "could not acquire %q lock for %s", op, x.unitName) + return errors.Annotatef(err, "acquiring %q lock for %s", op, x.unitName) } defer x.logger.Debugf("lock released for %s", x.unitName) defer releaser() diff --git a/worker/uniter/resolver/loop.go b/worker/uniter/resolver/loop.go index 09c157bb0c9..4ef927d83c1 100644 --- a/worker/uniter/resolver/loop.go +++ b/worker/uniter/resolver/loop.go @@ -6,6 +6,7 @@ package resolver import ( "github.com/juju/charm/v7/hooks" "github.com/juju/errors" + "github.com/juju/mutex" "github.com/juju/juju/core/lxdprofile" "github.com/juju/juju/worker/fortress" @@ -20,6 +21,7 @@ var logger interface{} // Logger represents the logging methods used in this package. type Logger interface { Errorf(string, ...interface{}) + Warningf(string, ...interface{}) Debugf(string, ...interface{}) Tracef(string, ...interface{}) } @@ -114,6 +116,19 @@ func Loop(cfg LoopConfig, localState *LocalState) error { cfg.Logger.Tracef("running op: %v", op) if err := cfg.Executor.Run(op, remoteStateChanged); err != nil { close(done) + + if errors.Cause(err) == mutex.ErrCancelled { + // If the lock acquisition was cancelled (such as when the + // migration-inactive flag drops), we do not want the + // resolver to surface that error.
This puts the agent into + the "failed" state, which causes the initial migration + validation phase to fail. + The safest thing to do is to bounce the loop and + reevaluate our state, which is what happens upon a + fortress error anyway (uniter.TranslateFortressErrors). + cfg.Logger.Warningf("executor lock acquisition cancelled") + return ErrRestart + } return errors.Trace(err) } close(done) diff --git a/worker/uniter/resolver/loop_test.go b/worker/uniter/resolver/loop_test.go index bd282c724df..348f685d1df 100644 --- a/worker/uniter/resolver/loop_test.go +++ b/worker/uniter/resolver/loop_test.go @@ -10,7 +10,7 @@ import ( "github.com/juju/charm/v7" "github.com/juju/charm/v7/hooks" "github.com/juju/loggo" - + "github.com/juju/mutex" envtesting "github.com/juju/testing" jc "github.com/juju/testing/checkers" gc "gopkg.in/check.v1" @@ -259,7 +259,7 @@ func (s *LoopSuite) TestLoopWithChange(c *gc.C) { } func (s *LoopSuite) TestRunFails(c *gc.C) { - s.executor.SetErrors(errors.New("Run fails")) + s.executor.SetErrors(errors.New("run fails")) s.resolver = resolver.ResolverFunc(func( _ resolver.LocalState, _ remotestate.Snapshot, @@ -268,7 +268,7 @@ return mockOp{}, nil }) _, err := s.loop() - c.Assert(err, gc.ErrorMatches, "Run fails") + c.Assert(err, gc.ErrorMatches, "run fails") } func (s *LoopSuite) TestNextOpFails(c *gc.C) { @@ -412,6 +412,31 @@ func (s *LoopSuite) testCheckCharmUpgradeCallsRun(c *gc.C) { s.executor.CheckCallNames(c, "State", "State", "Run", "State") } +func (s *LoopSuite) TestCancelledLockAcquisitionCausesRestart(c *gc.C) { + s.executor = &mockOpExecutor{ + Executor: nil, + Stub: envtesting.Stub{}, + st: operation.State{ + Started: true, + Kind: operation.Continue, + }, + run: func(operation.Operation, <-chan remotestate.Snapshot) error { + return mutex.ErrCancelled + }, + } + + s.resolver = resolver.ResolverFunc(func( + _ resolver.LocalState, + _ remotestate.Snapshot, + _
operation.Factory, + ) (operation.Operation, error) { + return &mockOp{}, nil + }) + + _, err := s.loop() + c.Assert(err, gc.Equals, resolver.ErrRestart) +} + func waitChannel(c *gc.C, ch <-chan interface{}, activity string) interface{} { select { case v := <-ch: