-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
app/retry: implement async retryer (#346)
Package retry provides a generic async slot function executor with retries for robustness against network failures. Functions are linked to a slot and executed asynchronously; network or context errors are retried with backoff until duties related to the slot have elapsed (5 slots later). category: feature ticket: #354
- Loading branch information
1 parent
a2d294d
commit 07a2e54
Showing
11 changed files
with
483 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
00:00 [34mINFO[0m see source {"source": "source", "caller": "log_test.go:91"} | ||
00:00 [34mINFO[0m also source {"source": "source", "caller": "log_test.go:92"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
00:00 [31mERRO[0m err1: first {"1": 1, "caller": "log_test.go:59"} | ||
app/log/log_test.go:54 .TestErrorWrap | ||
00:00 [31mERRO[0m err2: second: first {"2": 2, "1": 1, "caller": "log_test.go:60"} | ||
app/log/log_test.go:54 .TestErrorWrap | ||
00:00 [31mERRO[0m err3: third: second: first {"3": 3, "2": 2, "1": 1, "caller": "log_test.go:61"} | ||
app/log/log_test.go:54 .TestErrorWrap | ||
00:00 [31mERRO[0m err1: first {"1": 1, "caller": "log_test.go:60"} | ||
app/log/log_test.go:55 .TestErrorWrap | ||
00:00 [31mERRO[0m err2: second: first {"2": 2, "1": 1, "caller": "log_test.go:61"} | ||
app/log/log_test.go:55 .TestErrorWrap | ||
00:00 [31mERRO[0m err3: third: second: first {"3": 3, "2": 2, "1": 1, "caller": "log_test.go:62"} | ||
app/log/log_test.go:55 .TestErrorWrap |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
00:00 [31mERRO[0m err1: EOF {"caller": "log_test.go:73"} | ||
app/log/log_test.go:73 .TestErrorWrapOther | ||
00:00 [31mERRO[0m err2: wrap: EOF {"caller": "log_test.go:74"} | ||
app/log/log_test.go:70 .TestErrorWrapOther | ||
00:00 [31mERRO[0m err1: EOF {"caller": "log_test.go:74"} | ||
app/log/log_test.go:74 .TestErrorWrapOther | ||
00:00 [31mERRO[0m err2: wrap: EOF {"caller": "log_test.go:75"} | ||
app/log/log_test.go:71 .TestErrorWrapOther |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
00:00 [35mDEBG[0m msg1 {"ctx1": 1, "caller": "log_test.go:43"} | ||
00:00 [34mINFO[0m msg2 {"ctx2": 2, "wrap2": 2, "caller": "log_test.go:44"} | ||
00:00 [33mWARN[0m msg3a {"wrap3": "a", "wrap2": 2, "caller": "log_test.go:45"} | ||
00:00 [33mWARN[0m msg3b {"wrap3": "b", "wrap2": 2, "caller": "log_test.go:46"} | ||
00:00 [35mDEBG[0m msg1 {"ctx1": 1, "caller": "log_test.go:44"} | ||
00:00 [34mINFO[0m msg2 {"ctx2": 2, "wrap2": 2, "caller": "log_test.go:45"} | ||
00:00 [33mWARN[0m msg3a {"wrap3": "a", "wrap2": 2, "caller": "log_test.go:46"} | ||
00:00 [33mWARN[0m msg3b {"wrap3": "b", "wrap2": 2, "caller": "log_test.go:47"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
// Copyright © 2021 Obol Technologies Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
// Package retry provides a generic async slot function executor with retries for robustness against network failures. | ||
// Functions are linked to a slot, executed asynchronously and network or context errors retried with backoff | ||
// until duties related to a slot have elapsed (5 slots later). | ||
package retry | ||
|
||
import ( | ||
"context" | ||
"net" | ||
"strings" | ||
"sync" | ||
"testing" | ||
"time" | ||
|
||
eth2client "github.com/attestantio/go-eth2-client" | ||
"go.opentelemetry.io/otel/attribute" | ||
"go.opentelemetry.io/otel/trace" | ||
|
||
"github.com/obolnetwork/charon/app/errors" | ||
"github.com/obolnetwork/charon/app/log" | ||
"github.com/obolnetwork/charon/app/tracer" | ||
"github.com/obolnetwork/charon/app/z" | ||
) | ||
|
||
// lateFactor is the number of slot durations, counted from the start of a slot,
// after which duties for that slot are considered elapsed and are no longer retried.
// See https://pintail.xyz/posts/modelling-the-impact-of-altair/#proposer-and-delay-rewards.
const lateFactor = 5
|
||
// slotTimeProvider defines eth2client interface for resolving slot start times. | ||
type slotTimeProvider interface { | ||
eth2client.GenesisTimeProvider | ||
eth2client.SlotDurationProvider | ||
} | ||
|
||
// New returns a new Retryer instance. | ||
func New(ctx context.Context, eth2Svc eth2client.Service) (*Retryer, error) { | ||
eth2Cl, ok := eth2Svc.(slotTimeProvider) | ||
if !ok { | ||
return nil, errors.New("invalid eth2 service") | ||
} | ||
|
||
genesis, err := eth2Cl.GenesisTime(ctx) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
duration, err := eth2Cl.SlotDuration(ctx) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// ctxTimeoutFunc returns a context that is cancelled when duties for a slot have elapsed. | ||
ctxTimeoutFunc := func(ctx context.Context, slot int64) (context.Context, context.CancelFunc) { | ||
start := genesis.Add(duration * time.Duration(slot)) | ||
end := start.Add(duration * time.Duration(lateFactor)) | ||
|
||
return context.WithTimeout(ctx, time.Until(end)) | ||
} | ||
|
||
// backoffProvider is a naive constant 1s backoff function. | ||
backoffProvider := func() func() <-chan time.Time { | ||
return func() <-chan time.Time { | ||
const backoff = time.Second | ||
return time.After(backoff) | ||
} | ||
} | ||
|
||
return &Retryer{ | ||
shutdown: make(chan struct{}), | ||
ctxTimeoutFunc: ctxTimeoutFunc, | ||
backoffProvider: backoffProvider, | ||
}, nil | ||
} | ||
|
||
// NewForT returns a new Retryer instance for testing supporting a custom clock. | ||
func NewForT( | ||
_ *testing.T, | ||
ctxTimeoutFunc func(ctx context.Context, slot int64) (context.Context, context.CancelFunc), | ||
backoffProvider func() func() <-chan time.Time, | ||
) (*Retryer, error) { | ||
return &Retryer{ | ||
shutdown: make(chan struct{}), | ||
ctxTimeoutFunc: ctxTimeoutFunc, | ||
backoffProvider: backoffProvider, | ||
}, nil | ||
} | ||
|
||
// Retryer executes functions asynchronously and retries them on failure,
// adding robustness against network errors.
type Retryer struct {
	// wg counts DoAsync calls that are currently running; Shutdown waits on it.
	wg sync.WaitGroup

	// shutdown is closed by Shutdown to signal that no further work should be done.
	shutdown chan struct{}

	// ctxTimeoutFunc derives the context that bounds all retries for a given slot.
	ctxTimeoutFunc func(ctx context.Context, slot int64) (context.Context, context.CancelFunc)

	// backoffProvider returns, per DoAsync call, the function that yields the
	// channel to wait on between attempts.
	backoffProvider func() func() <-chan time.Time
}
|
||
// DoAsync will execute the function including retries on network or context errors. | ||
// It is intended to be used asynchronously: | ||
// go retryer.DoAsync(ctx, duty.Slot, "foo", fn) | ||
func (r *Retryer) DoAsync(parent context.Context, slot int64, name string, fn func(context.Context) error) { | ||
if r.isShutdown() { | ||
return | ||
} | ||
|
||
r.wg.Add(1) | ||
defer r.wg.Done() | ||
|
||
backoffFunc := r.backoffProvider() | ||
|
||
// Switch to a new context since this is async and parent context may be closed. | ||
ctx := log.CopyFields(context.Background(), parent) | ||
ctx = log.WithTopic(ctx, "retry") | ||
ctx = trace.ContextWithSpan(ctx, trace.SpanFromContext(parent)) | ||
ctx, cancel := r.ctxTimeoutFunc(ctx, slot) | ||
defer cancel() | ||
|
||
ctx, span := tracer.Start(ctx, "app/retry.DoAsync") | ||
defer span.End() | ||
span.SetAttributes(attribute.String("name", name)) | ||
|
||
for i := 0; ; i++ { | ||
span.AddEvent("retry.attempt.start", trace.WithAttributes(attribute.Int("i", i))) | ||
|
||
err := fn(ctx) | ||
if err == nil { | ||
return | ||
} | ||
|
||
var nerr net.Error | ||
isNetErr := errors.As(err, &nerr) | ||
isTempErr := isTemporaryBeaconErr(err) | ||
isCtxErr := errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) | ||
// Note that the local context is not checked, since we care about downstream timeouts. | ||
|
||
if !isCtxErr && !isNetErr && !isTempErr { | ||
log.Error(ctx, "Permanent failure calling "+name, err) | ||
return | ||
} | ||
|
||
if ctx.Err() == nil { | ||
log.Warn(ctx, "Temporary failure (will retry) calling "+name, z.Err(err)) | ||
span.AddEvent("retry.backoff.start") | ||
select { | ||
case <-backoffFunc(): | ||
case <-ctx.Done(): | ||
case <-r.shutdown: | ||
return | ||
} | ||
span.AddEvent("retry.backoff.done") | ||
} | ||
|
||
if ctx.Err() != nil { | ||
log.Error(ctx, "Timeout retrying "+name, ctx.Err()) | ||
return | ||
} | ||
} | ||
} | ||
|
||
// isTemporaryBeaconErr returns true if the error is a temporary beacon node error.
// eth2http doesn't return structured errors or error sentinels, so detection relies on
// matching the error message, which is brittle.
// A nil error is never temporary.
func isTemporaryBeaconErr(err error) bool {
	// Guard against nil so that err.Error() below cannot panic.
	if err == nil {
		return false
	}

	// Check for timing errors like:
	//  - Proposer duties were requested for a future epoch.
	//  - Cannot create attestation for future slot.
	if strings.Contains(err.Error(), "future") { //nolint:gosimple // More checks will be added below.
		return true
	}

	// TODO(corver): Add more checks here.

	return false
}
|
||
// isShutdown returns true if Shutdown has been called. | ||
func (r *Retryer) isShutdown() bool { | ||
select { | ||
case <-r.shutdown: | ||
return true | ||
default: | ||
return false | ||
} | ||
} | ||
|
||
// Shutdown triggers graceful shutdown and waits for all active function to complete or timeout. | ||
func (r *Retryer) Shutdown(ctx context.Context) { | ||
close(r.shutdown) | ||
|
||
done := make(chan struct{}) | ||
go func() { | ||
r.wg.Wait() | ||
done <- struct{}{} | ||
}() | ||
|
||
select { | ||
case <-ctx.Done(): | ||
case <-done: | ||
} | ||
} |
Oops, something went wrong.