Skip to content

Commit

Permalink
db: add CanDeterministicallySingleDelete
Browse files Browse the repository at this point in the history
Add a new specialized CanDeterministicallySingleDelete function that allows a
user to inspect hidden internal keys within the current user key to determine
whether the behavior of a SingleDelete of the current Key would be
deterministic, assuming no writes are made to the key between Iterator creation
and commit of the SingleDelete. The CanDeterministicallySingleDelete func may
only be called on an Iterator oriented in the forward direction and does not
change the external Iterator position.

During intent resolution, CockroachDB will write a SINGLEDEL tombstone to
delete the intent if the intent's value indicates that it must have only been
written once. Subtle logic that cuts across the stack makes this optimization
possible, and one possible explanation for a recent instance of replica
divergence (cockroachdb/cockroach#114421) is a bug in this logic. We anticipate
using CanDeterministicallySingleDelete within CockroachDB's intent resolution
to validate this logic at runtime.

There is some potential for this function to drive the decision to use
SingleDelete when creating batches for local Engine application only.

Informs cockroachdb/cockroach#114492.
  • Loading branch information
jbowens committed Nov 21, 2023
1 parent fc555fe commit 32e8ed5
Show file tree
Hide file tree
Showing 8 changed files with 613 additions and 1 deletion.
23 changes: 22 additions & 1 deletion data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,27 @@ func runIterCmd(d *datadriven.TestData, iter *Iterator, closeIter bool) string {
}
validityState = iter.NextWithLimit([]byte(parts[1]))
printValidityState = true
case "internal-next":
validity, keyKind := iter.internalNext()
switch validity {
case internalNextError:
fmt.Fprintf(&b, "err: %s\n", iter.Error())
case internalNextExhausted:
fmt.Fprint(&b, ".\n")
case internalNextValid:
fmt.Fprintf(&b, "%s\n", keyKind)
default:
panic("unreachable")
}
continue
case "can-deterministically-single-delete":
ok, err := CanDeterministicallySingleDelete(iter)
if err != nil {
fmt.Fprintf(&b, "err: %s\n", err)
} else {
fmt.Fprintf(&b, "%t\n", ok)
}
continue
case "prev-limit":
if len(parts) != 2 {
return "prev-limit <limit>\n"
Expand Down Expand Up @@ -531,7 +552,7 @@ func runBatchDefineCmd(d *datadriven.TestData, b *Batch) error {
err = b.Delete([]byte(parts[1]), nil)
case "del-sized":
if len(parts) != 3 {
return errors.Errorf("%s expects 1 argument", parts[0])
return errors.Errorf("%s expects 2 arguments", parts[0])
}
var valSize uint64
valSize, err = strconv.ParseUint(parts[2], 10, 32)
Expand Down
171 changes: 171 additions & 0 deletions iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,11 @@ const (
seekPrefixGELastPositioningOp
seekGELastPositioningOp
seekLTLastPositioningOp
// internalNextOp is a special internal iterator positioning operation used
// by CanDeterministicallySingleDelete. It exists for enforcing requirements
// around calling CanDeterministicallySingleDelete at most once per external
// iterator position.
internalNextOp
// invalidatedLastPositionOp is similar to unknownLastPositionOp and the
// only reason to distinguish this is for the wider set of SeekGE
// optimizations we permit for the external iterator Iterator.forwardOnly
Expand Down Expand Up @@ -2823,3 +2828,169 @@ func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
stats.RangeKeyStats.SkippedPoints)
}
}

// CanDeterministicallySingleDelete takes a valid iterator and examines internal
// state to determine if a SingleDelete deleting Iterator.Key() would
// deterministically delete the key. CanDeterministicallySingleDelete requires
// the iterator to be oriented in the forward direction (eg, the last
// positioning operation must've been a First, a Seek[Prefix]GE, or a
// Next[Prefix][WithLimit]).
//
// This function does not change the external position of the iterator, and all
// positioning methods should behave the same as if it was never called. This
// function will only return a meaningful result the first time it's invoked at
// an iterator position. This function invalidates the iterator Value's memory,
// and the caller must not rely on the memory safety of the previous Iterator
// position.
//
// If CanDeterministicallySingleDelete returns true AND the key at the iterator
// position is not modified between the creation of the Iterator and the commit
// of a batch containing a SingleDelete over the key, then the caller can be
// assured that SingleDelete is equivalent to Delete on the local engine, but it
// may not be true on another engine that received the same writes and with
// logically equivalent state since this engine may have collapsed multiple SETs
// into one.
func CanDeterministicallySingleDelete(it *Iterator) (bool, error) {
// This function may only be called once per external iterator position. We
// can validate this by checking the last positioning operation.
if it.lastPositioningOp == internalNextOp {
return false, errors.New("pebble: CanDeterministicallySingleDelete called twice")
}
validity, kind := it.internalNext()
var shadowedBySingleDelete bool
for validity == internalNextValid {
switch kind {
case InternalKeyKindDelete, InternalKeyKindDeleteSized:
// A DEL or DELSIZED tombstone is okay. An internal key
// sequence like SINGLEDEL; SET; DEL; SET can be handled
// deterministically. If there are SETs further down, we
// don't care about them.
return true, nil
case InternalKeyKindSingleDelete:
// A SingleDelete is okay as long as when that SingleDelete was
// written, it was written deterministically (eg, with its own
// CanDeterministicallySingleDelete check). Validate that it was
// written deterministically. We'll allow one set to appear after
// the SingleDelete.
shadowedBySingleDelete = true
validity, kind = it.internalNext()
continue
case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
// If we observed a single delete, it's allowed to delete 1 key.
// We'll keep looping to validate that the internal keys beneath the
// already-written single delete are copacetic.
if shadowedBySingleDelete {
shadowedBySingleDelete = false
validity, kind = it.internalNext()
continue
}
// We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL
// that deleted the KV at the original iterator position could
// result in this key becoming visible.
return false, nil
case InternalKeyKindRangeDelete:
// RangeDeletes are handled by the merging iterator and should never
// be observed by the top-level Iterator.
panic(errors.AssertionFailedf("pebble: unexpected range delete"))
case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
// Range keys are interleaved at the maximal sequence number and
// should never be observed within a user key.
panic(errors.AssertionFailedf("pebble: unexpected range key"))
default:
panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind)))
}
}
if validity == internalNextError {
return false, it.Error()
}
return true, nil
}

// internalNextValidity enumerates the potential outcomes of a call to
// internalNext.
type internalNextValidity int8

const (
// internalNextError is returned by internalNext when an error occurred and
// the caller is responsible for checking iter.Error().
internalNextError internalNextValidity = iota
// internalNextExhausted is returned by internalNext when the next internal
// key is an internal key with a different user key than Iterator.Key().
internalNextExhausted
// internalNextValid is returned by internalNext when the internal next
// found a shadowed internal key with a user key equal to Iterator.Key().
internalNextValid
)

// internalNext advances internal Iterator state forward to expose the
// InternalKeyKind of the next internal key with a user key equal to Key().
//
// internalNext is a highly specialized operation and is unlikely to be
// generally useful. See Iterator.Next for how to reposition the iterator to the
// next key. internalNext requires the Iterator to be at a valid position in the
// forward direction (the last positioning operation must've been a First, a
// Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true).
//
// internalNext, unlike all other Iterator methods, exposes internal LSM state.
// internalNext advances the Iterator's internal iterator to the next shadowed
// key with a user key equal to Key(). When a key is overwritten or deleted, its
// removal from the LSM occurs lazily as a part of compactions. internalNext
// allows the caller to see whether an obsolete internal key exists with the
// current Key(), and what it's key kind is. Note that the existence of an
// internal key is nondeterministic and dependent on internal LSM state. These
// semantics are unlikely to be applicable to almost all use cases.
//
// If internalNext finds a key that shares the same user key as Key(), it
// returns internalNextValid and the internal key's kind. If internalNext
// encounters an error, it returns internalNextError and the caller is expected
// to call Iterator.Error() to retrieve it. In all other circumstances,
// internalNext returns internalNextExhausted, indicating that there are no more
// additional internal keys with the user key Key().
//
// internalNext does not change the external position of the iterator, and a
// Next operation should behave the same as if internalNext was never called.
// internalNext does invalidate the iterator Value's memory, and the caller must
// not rely on the memory safety of the previous Iterator position.
func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) {
i.stats.ForwardStepCount[InterfaceCall]++
if i.err != nil {
return internalNextError, base.InternalKeyKindInvalid
} else if i.iterValidityState != IterValid {
return internalNextExhausted, base.InternalKeyKindInvalid
}
i.lastPositioningOp = internalNextOp

switch i.pos {
case iterPosCurForward:
i.iterKey, i.iterValue = i.iter.Next()
if i.iterKey == nil {
// We check i.iter.Error() here and return an internalNextError enum
// variant so that the caller does not need to check i.iter.Error()
// in the common case that the next internal key has a new user key.
if i.err = i.iter.Error(); i.err != nil {
return internalNextError, base.InternalKeyKindInvalid
}
i.pos = iterPosNext
return internalNextExhausted, base.InternalKeyKindInvalid
} else if i.comparer.Equal(i.iterKey.UserKey, i.key) {
return internalNextValid, i.iterKey.Kind()
}
i.pos = iterPosNext
return internalNextExhausted, base.InternalKeyKindInvalid
case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev:
i.err = errors.New("switching from reverse to forward via internalNext is prohibited")
i.iterValidityState = IterExhausted
return internalNextError, base.InternalKeyKindInvalid
case iterPosNext, iterPosCurForwardPaused:
// The previous method already moved onto the next user key. This is
// only possible if
// - the last positioning method was a call to internalNext, and we
// advanced to a new user key.
// - the previous non-internalNext iterator operation encountered a
// range key or merge, forcing an internal Next that found a new
// user key that's not equal to i.Iterator.Key().
return internalNextExhausted, base.InternalKeyKindInvalid
default:
panic("unreachable")
}
}
2 changes: 2 additions & 0 deletions metamorphic/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const (
iterNext
iterNextWithLimit
iterNextPrefix
iterCanSingleDelete
iterPrev
iterPrevWithLimit
iterSeekGE
Expand Down Expand Up @@ -124,6 +125,7 @@ func defaultConfig() config {
iterNext: 100,
iterNextWithLimit: 20,
iterNextPrefix: 20,
iterCanSingleDelete: 20,
iterPrev: 100,
iterPrevWithLimit: 20,
iterSeekGE: 100,
Expand Down
8 changes: 8 additions & 0 deletions metamorphic/generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ func generate(rng *rand.Rand, count uint64, cfg config, km *keyManager) []op {
iterNext: g.randIter(g.iterNext),
iterNextWithLimit: g.randIter(g.iterNextWithLimit),
iterNextPrefix: g.randIter(g.iterNextPrefix),
iterCanSingleDelete: g.randIter(g.iterCanSingleDelete),
iterPrev: g.randIter(g.iterPrev),
iterPrevWithLimit: g.randIter(g.iterPrevWithLimit),
iterSeekGE: g.randIter(g.iterSeekGE),
Expand Down Expand Up @@ -1077,6 +1078,13 @@ func (g *generator) iterNextPrefix(iterID objID) {
})
}

func (g *generator) iterCanSingleDelete(iterID objID) {
g.add(&iterCanSingleDelOp{
iterID: iterID,
derivedReaderID: g.iterReaderID[iterID],
})
}

func (g *generator) iterPrevWithLimit(iterID objID) {
g.add(&iterPrevOp{
iterID: iterID,
Expand Down
1 change: 1 addition & 0 deletions metamorphic/key_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,7 @@ func opWrittenKeys(untypedOp op) [][]byte {
case *iterLastOp:
case *iterNextOp:
case *iterNextPrefixOp:
case *iterCanSingleDelOp:
case *iterPrevOp:
case *iterSeekGEOp:
case *iterSeekLTOp:
Expand Down
30 changes: 30 additions & 0 deletions metamorphic/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -1154,6 +1154,36 @@ func (o *iterNextPrefixOp) String() string { return fmt.Sprintf("%s.NextPr
func (o *iterNextPrefixOp) receiver() objID { return o.iterID }
func (o *iterNextPrefixOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterCanSingleDelOp models a call to CanDeterministicallySingleDelete with an
// Iterator.
type iterCanSingleDelOp struct {
iterID objID

derivedReaderID objID
}

func (o *iterCanSingleDelOp) run(t *test, h historyRecorder) {
// TODO(jackson): When we perform error injection, we'll need to rethink
// this.
_, err := pebble.CanDeterministicallySingleDelete(t.getIter(o.iterID).iter)
// The return value of CanDeterministicallySingleDelete is dependent on
// internal LSM state and non-deterministic, so we don't record it.
// Including the operation within the metamorphic test at all helps ensure
// that it does not change the result of any other Iterator operation that
// should be deterministic, regardless of its own outcome.
//
// We still record the value of the error because it's deterministic, at
// least for now. The possible error cases are:
// - The iterator was already in an error state when the operation ran.
// - The operation is deterministically invalid (like using an InternalNext
// to change directions.)
h.Recordf("%s // %v", o, err)
}

func (o *iterCanSingleDelOp) String() string { return fmt.Sprintf("%s.InternalNext()", o.iterID) }
func (o *iterCanSingleDelOp) receiver() objID { return o.iterID }
func (o *iterCanSingleDelOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterPrevOp models an Iterator.Prev[WithLimit] operation.
type iterPrevOp struct {
iterID objID
Expand Down
5 changes: 5 additions & 0 deletions metamorphic/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ func opArgs(op op) (receiverID *objID, targetID *objID, args []interface{}) {
return &t.iterID, nil, []interface{}{&t.limit}
case *iterNextPrefixOp:
return &t.iterID, nil, nil
case *iterCanSingleDelOp:
return &t.iterID, nil, []interface{}{}
case *iterPrevOp:
return &t.iterID, nil, []interface{}{&t.limit}
case *iterSeekLTOp:
Expand Down Expand Up @@ -137,6 +139,7 @@ var methods = map[string]*methodInfo{
"NewSnapshot": makeMethod(newSnapshotOp{}, dbTag),
"Next": makeMethod(iterNextOp{}, iterTag),
"NextPrefix": makeMethod(iterNextPrefixOp{}, iterTag),
"InternalNext": makeMethod(iterCanSingleDelOp{}, iterTag),
"Prev": makeMethod(iterPrevOp{}, iterTag),
"RangeKeyDelete": makeMethod(rangeKeyDeleteOp{}, dbTag, batchTag),
"RangeKeySet": makeMethod(rangeKeySetOp{}, dbTag, batchTag),
Expand Down Expand Up @@ -520,6 +523,8 @@ func computeDerivedFields(ops []op) {
v.derivedReaderID = iterToReader[v.iterID]
case *iterNextPrefixOp:
v.derivedReaderID = iterToReader[v.iterID]
case *iterCanSingleDelOp:
v.derivedReaderID = iterToReader[v.iterID]
case *iterPrevOp:
v.derivedReaderID = iterToReader[v.iterID]
}
Expand Down
Loading

0 comments on commit 32e8ed5

Please sign in to comment.