Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core/tracker: improve failed duty message #972

Merged
merged 5 commits into from
Aug 16, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 55 additions & 11 deletions core/tracker/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func (t *Tracker) Run(ctx context.Context) error {
ctx := log.WithCtx(ctx, z.Any("duty", duty))

// Analyse failed duties
failed, failedComponent, failedMsg := analyseDutyFailed(duty, t.events[duty])
failed, failedComponent, failedMsg := analyseDutyFailed(duty, t.events)
t.failedDutyReporter(ctx, duty, failed, failedComponent, failedMsg)

// Analyse peer participation
Expand All @@ -120,9 +120,9 @@ func (t *Tracker) Run(ctx context.Context) error {
}
}

// analyseDutyFailed detects if a duty failed. It returns true if the duty didn't complete the sigagg component.
// If it failed, it also returns the component where it failed and a human friendly error message explaining why.
func analyseDutyFailed(duty core.Duty, es []event) (bool, component, string) {
// dutyFailedComponent returns true if the duty failed. It also returns the component where the duty got stuck. If the duty didn't get stuck, it
// returns the sigAgg component. It assumes that all the input events are for a single duty.
func dutyFailedComponent(es []event) (bool, component) {
events := make([]event, len(es))
copy(events, es)

Expand All @@ -132,17 +132,61 @@ func analyseDutyFailed(duty core.Duty, es []event) (bool, component, string) {
})

if len(events) == 0 {
return false, sentinel, "No events to analyse"
return false, sentinel
}

if events[0].component == sigAgg {
// Duty completed successfully
c := events[0].component
if c == sigAgg {
return false, sigAgg
}

return true, c + 1
}

// analyseDutyFailed detects if the given duty failed. It returns false if the duty didn't fail, i.e., the duty didn't get stuck and completes the sigAgg component.
// It returns true if the duty failed. It also returns the component where the duty got stuck and a human friendly error message explaining why.
func analyseDutyFailed(duty core.Duty, allEvents map[core.Duty][]event) (bool, component, string) {
var (
failed bool
comp component
msg string
)

failed, comp = dutyFailedComponent(allEvents[duty])
if !failed {
return false, sigAgg, ""
}

// TODO(xenowits): Improve message to have more specific details.
// Ex: If the duty got stuck during validatorAPI, we can say "the VC didn't successfully submit a signed duty").
return true, events[0].component + 1, fmt.Sprintf("%s failed in %s component", duty.String(), (events[0].component + 1).String())
switch comp {
case fetcher:
msg = "couldn't fetch duty data from the beacon node"

if duty.Type == core.DutyProposer || duty.Type == core.DutyBuilderProposer {
// Proposer duties may fail if core.DutyRandao fails
randaoFailed, randaoComp := dutyFailedComponent(allEvents[core.NewRandaoDuty(duty.Slot)])
if randaoFailed {
if randaoComp == parSigDBThreshold {
msg = "couldn't propose block due to insufficient partial randao signatures"
} else {
msg = "couldn't propose block since randao duty failed"
}
}
}
case consensus:
msg = "consensus algorithm didn't complete"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe make these constants, so comparison and tests and updates are trivial.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed, great idea

case validatorAPI:
msg = "signed duty not submitted by local validator client"
case parSigDBInternal:
msg = "partial signature database didn't trigger partial signature exchange"
case parSigEx:
msg = "no partial signatures received from peers"
case parSigDBThreshold:
msg = "insufficient partial signatures received, minimum required threshold not reached"
default:
msg = fmt.Sprintf("%s duty failed at %s", duty.String(), comp.String())
}

return true, comp, msg
}

// failedDutyReporter instruments failed duties.
Expand Down Expand Up @@ -224,7 +268,7 @@ func isParSigEventExpected(duty core.Duty, pubkey core.PubKey, allEvents map[cor
// newParticipationReporter returns a new participation reporter function which logs and instruments peer participation
// and unexpectedPeers.
func newParticipationReporter(peers []p2p.Peer) func(context.Context, core.Duty, map[int]bool, map[int]bool) {
// prevAbsent is the set of peers who didn't participated in the last duty.
// prevAbsent is the set of peers who didn't participate in the last duty.
var prevAbsent []string

return func(ctx context.Context, duty core.Duty, participatedShares map[int]bool, unexpectedShares map[int]bool) {
Expand Down
168 changes: 168 additions & 0 deletions core/tracker/tracker_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ func TestTrackerFailedDuty(t *testing.T) {
require.Equal(t, testData[0].duty, failedDuty)
require.True(t, isFailed)
require.Equal(t, consensus, component)
require.Equal(t, msg, "consensus algorithm didn't complete")
count++

if count == len(testData) {
Expand Down Expand Up @@ -79,6 +80,7 @@ func TestTrackerFailedDuty(t *testing.T) {
require.Equal(t, testData[0].duty, failedDuty)
require.False(t, isFailed)
require.Equal(t, sigAgg, component)
require.Equal(t, msg, "")
count++

if count == len(testData) {
Expand Down Expand Up @@ -111,6 +113,172 @@ func TestTrackerFailedDuty(t *testing.T) {
})
}

func TestAnalyseDutyFailed(t *testing.T) {
slot := 1
attDuty := core.NewAttesterDuty(int64(slot))
proposerDuty := core.NewProposerDuty(int64(slot))
randaoDuty := core.NewRandaoDuty(int64(slot))

t.Run("Failed", func(t *testing.T) {
// Failed at fetcher
events := map[core.Duty][]event{
attDuty: {
{
duty: attDuty,
component: scheduler,
},
},
}

failed, comp, msg := analyseDutyFailed(attDuty, events)
require.True(t, failed)
require.Equal(t, comp, fetcher)
require.Equal(t, msg, "couldn't fetch duty data from the beacon node")

// Failed at consensus
events[attDuty] = append(events[attDuty], event{
duty: attDuty,
component: fetcher,
})

failed, comp, msg = analyseDutyFailed(attDuty, events)
require.True(t, failed)
require.Equal(t, comp, consensus)
require.Equal(t, msg, "consensus algorithm didn't complete")

// Failed at validatorAPI
events[attDuty] = append(events[attDuty], event{
duty: attDuty,
component: consensus,
})

failed, comp, msg = analyseDutyFailed(attDuty, events)
require.True(t, failed)
require.Equal(t, comp, validatorAPI)
require.Equal(t, msg, "signed duty not submitted by local validator client")

// Failed at parsigDBInternal
events[attDuty] = append(events[attDuty], event{
duty: attDuty,
component: validatorAPI,
})

failed, comp, msg = analyseDutyFailed(attDuty, events)
require.True(t, failed)
require.Equal(t, comp, parSigDBInternal)
require.Equal(t, msg, "partial signature database didn't trigger partial signature exchange")

// Failed at parsigEx
events[attDuty] = append(events[attDuty], event{
duty: attDuty,
component: parSigDBInternal,
})

failed, comp, msg = analyseDutyFailed(attDuty, events)
require.True(t, failed)
require.Equal(t, comp, parSigEx)
require.Equal(t, msg, "no partial signatures received from peers")

// Failed at parsigDBThreshold
events[attDuty] = append(events[attDuty], event{
duty: attDuty,
component: parSigEx,
})

failed, comp, msg = analyseDutyFailed(attDuty, events)
require.True(t, failed)
require.Equal(t, comp, parSigDBThreshold)
require.Equal(t, msg, "insufficient partial signatures received, minimum required threshold not reached")
})

t.Run("FailedAtFetcherAsRandaoFailed", func(t *testing.T) {
// Randao failed at parSigEx
events := map[core.Duty][]event{
proposerDuty: {
{
duty: proposerDuty,
component: scheduler,
},
},
randaoDuty: {
{
duty: proposerDuty,
component: validatorAPI,
},
{
duty: proposerDuty,
component: parSigDBInternal,
},
},
}

failed, comp, msg := analyseDutyFailed(proposerDuty, events)
require.True(t, failed)
require.Equal(t, comp, fetcher)
require.Equal(t, msg, "couldn't propose block since randao duty failed")

// Randao failed at parSigDBThreshold
events[randaoDuty] = append(events[randaoDuty], event{
duty: proposerDuty,
component: parSigEx,
})

failed, comp, msg = analyseDutyFailed(proposerDuty, events)
require.True(t, failed)
require.Equal(t, comp, fetcher)
require.Equal(t, msg, "couldn't propose block due to insufficient partial randao signatures")
})

t.Run("DutySuccess", func(t *testing.T) {
var (
events = make(map[core.Duty][]event)
attDuty = core.NewAttesterDuty(int64(1))
)

for comp := scheduler; comp < sentinel; comp++ {
events[attDuty] = append(events[attDuty], event{component: comp})
}

failed, comp, msg := analyseDutyFailed(proposerDuty, events)
require.False(t, failed)
require.Equal(t, comp, sigAgg)
require.Equal(t, msg, "")
})
}

func TestDutyFailedComponent(t *testing.T) {
var events []event
for comp := scheduler; comp < sentinel; comp++ {
events = append(events, event{component: comp})
}

t.Run("DutySuccess", func(t *testing.T) {
failed, comp := dutyFailedComponent(events)
require.False(t, failed)
require.Equal(t, comp, sigAgg)
})

t.Run("EmptyEvents", func(t *testing.T) {
f, comp := dutyFailedComponent([]event{})
require.False(t, f)
require.Equal(t, comp, sentinel)
})

t.Run("DutyFailed", func(t *testing.T) {
// Remove the last component (sigAgg) from the events array.
events = events[:len(events)-1]
f, comp := dutyFailedComponent(events)
require.True(t, f)
require.Equal(t, comp, sigAgg)

// Remove the second-last component (parsigDBThreshold) from the events array.
events = events[:len(events)-1]
f, comp = dutyFailedComponent(events)
require.True(t, f)
require.Equal(t, comp, parSigDBThreshold)
})
}

func TestTrackerParticipation(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
slots := []int{1, 2, 3}
Expand Down