diff --git a/arraycontainer.go b/arraycontainer.go index 80fa676e..92036992 100644 --- a/arraycontainer.go +++ b/arraycontainer.go @@ -1,6 +1,7 @@ package roaring import ( + "errors" "fmt" ) @@ -8,6 +9,11 @@ type arrayContainer struct { content []uint16 } +var ( + ErrArrayIncorrectSort = errors.New("incorrectly sorted array") + ErrArrayInvalidSize = errors.New("invalid array size") +) + func (ac *arrayContainer) String() string { s := "{" for it := ac.getShortIterator(); it.hasNext(); { @@ -26,8 +32,7 @@ func (ac *arrayContainer) fillLeastSignificant16bits(x []uint32, i int, mask uin _ = x[len(ac.content)-1+i] _ = ac.content[len(ac.content)-1] for k := 0; k < len(ac.content); k++ { - x[k+i] = - uint32(ac.content[k]) | mask + x[k+i] = uint32(ac.content[k]) | mask } return i + len(ac.content) } @@ -168,7 +173,7 @@ func (ac *arrayContainer) notClose(firstOfRange, lastOfRange int) container { return ac.toBitmapContainer().not(firstOfRange, lastOfRange+1) } answer := newArrayContainer() - answer.content = make([]uint16, newCardinality, newCardinality) //a hack for sure + answer.content = make([]uint16, newCardinality, newCardinality) // a hack for sure copy(answer.content, ac.content[:startIndex]) outPos := startIndex @@ -194,11 +199,9 @@ func (ac *arrayContainer) notClose(firstOfRange, lastOfRange int) container { } answer.content = answer.content[:newCardinality] return answer - } func (ac *arrayContainer) equals(o container) bool { - srb, ok := o.(*arrayContainer) if ok { // Check if the containers are the same object. @@ -239,8 +242,8 @@ func (ac *arrayContainer) toBitmapContainer() *bitmapContainer { bc := newBitmapContainer() bc.loadData(ac) return bc - } + func (ac *arrayContainer) iadd(x uint16) (wasNew bool) { // Special case adding to the end of the container. l := len(ac.content) @@ -352,7 +355,7 @@ func (ac *arrayContainer) ior(a container) container { return ac.iorArray(x) case *bitmapContainer: return a.(*bitmapContainer).orArray(ac) - //return ac.iorBitmap(x) // note: this does not make sense + // return ac.iorBitmap(x) // note: this does not make sense case *runContainer16: if x.isFull() { return x.clone() @@ -589,7 +592,6 @@ func (ac *arrayContainer) iandBitmap(bc *bitmapContainer) container { } ac.content = ac.content[:pos] return ac - } func (ac *arrayContainer) xor(a container) container { @@ -630,7 +632,6 @@ func (ac *arrayContainer) xorArray(value2 *arrayContainer) container { length := exclusiveUnion2by2(value1.content, value2.content, answer.content) answer.content = answer.content[:length] return answer - } func (ac *arrayContainer) andNot(a container) container { @@ -822,7 +823,6 @@ func (ac *arrayContainer) inotClose(firstOfRange, lastOfRange int) container { } else { // no expansion needed ac.negateRange(buffer, startIndex, lastIndex, firstOfRange, lastOfRange+1) if cardinalityChange < 0 { - for i := startIndex + newValuesInRange; i < newCardinality; i++ { ac.content[i] = ac.content[i-cardinalityChange] } @@ -915,7 +915,6 @@ func (ac *arrayContainer) rank(x uint16) int { return answer + 1 } return -answer - 1 - } func (ac *arrayContainer) selectInt(x uint16) int { @@ -1039,7 +1038,6 @@ func (ac *arrayContainer) numberOfRuns() (nr int) { // convert to run or array *if needed* func (ac *arrayContainer) toEfficientContainer() container { - numRuns := ac.numberOfRuns() sizeAsRunContainer := runContainer16SerializedSizeInBytes(numRuns) @@ -1099,3 +1097,28 @@ func (ac *arrayContainer) addOffset(x uint16) (container, container) { return low, high } + +// validate checks cardinality and sort order of the array container +func (ac *arrayContainer) validate() error { + cardinality := ac.getCardinality() + + if cardinality <= 0 { + return ErrArrayInvalidSize + } + + if cardinality > arrayDefaultMaxSize { + return ErrArrayInvalidSize + } + + previous := ac.content[0] + for i := 1; i < len(ac.content); i++ { + next := ac.content[i] + if previous > next { + return ErrArrayIncorrectSort + } + previous = next + + } + + return nil +} diff --git a/arraycontainer_test.go b/arraycontainer_test.go index fd360bec..af9a8647 100644 --- a/arraycontainer_test.go +++ b/arraycontainer_test.go @@ -241,9 +241,11 @@ func TestArrayContainerNumberOfRuns025(t *testing.T) { } func TestArrayContainerIaddRangeNearMax068(t *testing.T) { - iv := []interval16{newInterval16Range(65525, 65527), + iv := []interval16{ + newInterval16Range(65525, 65527), newInterval16Range(65530, 65530), - newInterval16Range(65534, 65535)} + newInterval16Range(65534, 65535), + } rc := newRunContainer16TakeOwnership(iv) ac2 := rc.toArrayContainer() @@ -262,9 +264,11 @@ func TestArrayContainerIaddRangeNearMax068(t *testing.T) { } func TestArrayContainerEtc070(t *testing.T) { - iv := []interval16{newInterval16Range(65525, 65527), + iv := []interval16{ + newInterval16Range(65525, 65527), newInterval16Range(65530, 65530), - newInterval16Range(65534, 65535)} + newInterval16Range(65534, 65535), + } rc := newRunContainer16TakeOwnership(iv) ac := rc.toArrayContainer() @@ -431,6 +435,44 @@ func TestArrayContainerResetTo(t *testing.T) { }) } +func TestArrayContainerValidation(t *testing.T) { + array := newArrayContainer() + upperBound := arrayDefaultMaxSize + + err := array.validate() + assert.Error(t, err) + + for i := 0; i < upperBound; i++ { + array.iadd(uint16(i)) + } + err = array.validate() + assert.NoError(t, err) + + // Introduce a sort error + // We know that upperbound is unsorted because we populated up to upperbound + array.content[500] = uint16(upperBound + upperBound) + + err = array.validate() + assert.Error(t, err) + + array = newArrayContainer() + + // Technically a run, but make sure the incorrect sort detection handles equal elements + for i := 0; i < upperBound; i++ { + array.iadd(uint16(1)) + } + err = array.validate() + assert.NoError(t, err) + + array = newArrayContainer() + + for i := 0; i < 2*upperBound; i++ { + array.iadd(uint16(i)) + } + err = array.validate() + assert.Error(t, err) +} + // go test -bench BenchmarkShortIteratorAdvance -run - func BenchmarkShortIteratorAdvanceArray(b *testing.B) { benchmarkContainerIteratorAdvance(b, newArrayContainer()) diff --git a/bitmapcontainer.go b/bitmapcontainer.go index bf08bfca..d59578d6 100644 --- a/bitmapcontainer.go +++ b/bitmapcontainer.go @@ -116,6 +116,7 @@ func (bcsi *bitmapContainerShortIterator) next() uint16 { bcsi.i = bcsi.ptr.NextSetBit(uint(bcsi.i) + 1) return uint16(j) } + func (bcsi *bitmapContainerShortIterator) hasNext() bool { return bcsi.i >= 0 } @@ -241,7 +242,7 @@ func (bc *bitmapContainer) getSizeInBytes() int { } func (bc *bitmapContainer) serializedSizeInBytes() int { - //return bc.Msgsize()// NOO! This breaks GetSerializedSizeInBytes + // return bc.Msgsize()// NOO! This breaks GetSerializedSizeInBytes return len(bc.bitmap) * 8 } @@ -441,7 +442,7 @@ func (bc *bitmapContainer) ior(a container) container { if bc.isFull() { return newRunContainer16Range(0, MaxUint16) } - //bc.computeCardinality() + // bc.computeCardinality() return bc } panic(fmt.Errorf("unsupported container type %T", a)) @@ -819,9 +820,8 @@ func (bc *bitmapContainer) andBitmap(value2 *bitmapContainer) container { } ac := newArrayContainerSize(newcardinality) fillArrayAND(ac.content, bc.bitmap, value2.bitmap) - ac.content = ac.content[:newcardinality] //not sure why i need this + ac.content = ac.content[:newcardinality] // not sure why i need this return ac - } func (bc *bitmapContainer) intersectsArray(value2 *arrayContainer) bool { @@ -842,7 +842,6 @@ func (bc *bitmapContainer) intersectsBitmap(value2 *bitmapContainer) bool { } } return false - } func (bc *bitmapContainer) iandBitmap(value2 *bitmapContainer) container { @@ -995,7 +994,7 @@ func (bc *bitmapContainer) iandNotBitmapSurely(value2 *bitmapContainer) containe return bc } -func (bc *bitmapContainer) contains(i uint16) bool { //testbit +func (bc *bitmapContainer) contains(i uint16) bool { // testbit x := uint(i) w := bc.bitmap[x>>6] mask := uint64(1) << (x & 63) @@ -1051,7 +1050,7 @@ func (bc *bitmapContainer) toArrayContainer() *arrayContainer { } func (bc *bitmapContainer) fillArray(container []uint16) { - //TODO: rewrite in assembly + // TODO: rewrite in assembly pos := 0 base := 0 for k := 0; k < len(bc.bitmap); k++ { @@ -1141,7 +1140,6 @@ func (bc *bitmapContainer) numberOfRuns() int { // convert to run or array *if needed* func (bc *bitmapContainer) toEfficientContainer() container { - numRuns := bc.numberOfRuns() sizeAsRunContainer := runContainer16SerializedSizeInBytes(numRuns) @@ -1159,7 +1157,6 @@ func (bc *bitmapContainer) toEfficientContainer() container { } func newBitmapContainerFromRun(rc *runContainer16) *bitmapContainer { - if len(rc.iv) == 1 { return newBitmapContainerwithRange(int(rc.iv[0].start), int(rc.iv[0].last())) } @@ -1169,7 +1166,7 @@ func newBitmapContainerFromRun(rc *runContainer16) *bitmapContainer { setBitmapRange(bc.bitmap, int(rc.iv[i].start), int(rc.iv[i].last())+1) bc.cardinality += int(rc.iv[i].last()) + 1 - int(rc.iv[i].start) } - //bc.computeCardinality() + // bc.computeCardinality() return bc } @@ -1234,3 +1231,24 @@ func (bc *bitmapContainer) addOffset(x uint16) (container, container) { return low, high } + +// validate checks that the container size is non-negative +func (bc *bitmapContainer) validate() error { + if bc.cardinality < arrayDefaultMaxSize { + return fmt.Errorf("bitmap container size was less than: %d", arrayDefaultMaxSize) + } + + if maxCapacity < len(bc.bitmap)*64 { + return fmt.Errorf("bitmap slize size %d exceeded max capacity %d", maxCapacity, len(bc.bitmap)*64) + } + + if bc.cardinality > maxCapacity { + return fmt.Errorf("bitmap container size was greater than: %d", maxCapacity) + } + + if bc.cardinality != int(popcntSlice(bc.bitmap)) { + return fmt.Errorf("bitmap container size %d did not match underlying slice length: %d", bc.cardinality, int(popcntSlice(bc.bitmap))) + } + + return nil +} diff --git a/bitmapcontainer_test.go b/bitmapcontainer_test.go index af985d1e..76269097 100644 --- a/bitmapcontainer_test.go +++ b/bitmapcontainer_test.go @@ -328,3 +328,46 @@ func TestBitmapContainerIAndNot(t *testing.T) { require.ElementsMatch(t, []uint16{12279, 12282, 12285}, bc.(*arrayContainer).content) require.Equal(t, 3, bc.getCardinality()) } + +func TestBitMapContainerValidate(t *testing.T) { + bc := newBitmapContainer() + + for i := 0; i < arrayDefaultMaxSize-1; i++ { + bc.iadd(uint16(i * 3)) + } + // bitmap containers should have size arrayDefaultMaxSize or larger + assert.Error(t, bc.validate()) + bc.iadd(math.MaxUint16) + + assert.NoError(t, bc.validate()) + + // Break the max cardinality invariant + bc.cardinality = maxCapacity + 1 + + assert.Error(t, bc.validate()) + + // violate cardinality underlying container size invariant + bc = newBitmapContainer() + for i := 0; i < arrayDefaultMaxSize+1; i++ { + bc.iadd(uint16(i * 3)) + } + assert.NoError(t, bc.validate()) + bc.cardinality += 1 + assert.Error(t, bc.validate()) + + // check that underlying packed slice doesn't exceed maxCapacity + bc = newBitmapContainer() + orginalSize := (1 << 16) / 64 + for i := 0; i < orginalSize; i++ { + bc.cardinality += 1 + bc.bitmap[i] = uint64(1) + } + + appendSize := ((1 << 16) - orginalSize) + 1 + for i := 0; i < appendSize; i++ { + bc.cardinality += 1 + bc.bitmap = append(bc.bitmap, uint64(1)) + } + + assert.Error(t, bc.validate()) +} diff --git a/roaring.go b/roaring.go index a31cdbd9..9508711a 100644 --- a/roaring.go +++ b/roaring.go @@ -26,7 +26,6 @@ func (rb *Bitmap) ToBase64() (string, error) { buf := new(bytes.Buffer) _, err := rb.WriteTo(buf) return base64.StdEncoding.EncodeToString(buf.Bytes()), err - } // FromBase64 deserializes a bitmap from Base64 @@ -54,10 +53,12 @@ func (rb *Bitmap) ToBytes() ([]byte, error) { return rb.highlowcontainer.toBytes() } -const wordSize = uint64(64) -const log2WordSize = uint64(6) -const capacity = ^uint64(0) -const bitmapContainerSize = (1 << 16) / 64 // bitmap size in words +const ( + wordSize = uint64(64) + log2WordSize = uint64(6) + capacity = ^uint64(0) + bitmapContainerSize = (1 << 16) / 64 // bitmap size in words +) // DenseSize returns the size of the bitmap when stored as a dense bitmap. func (rb *Bitmap) DenseSize() uint64 { @@ -313,6 +314,17 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err return } +// MustReadFrom calls ReadFrom internally. +// After deserialization Validate will be called. +// If the Bitmap fails to validate, a panic with the validation error will be thrown +func (rb *Bitmap) MustReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) { + rb.ReadFrom(reader, cookieHeader...) + if err := rb.Validate(); err != nil { + panic(err) + } + return +} + // FromBuffer creates a bitmap from its serialized version stored in buffer // // The format specification is available here: @@ -960,7 +972,6 @@ func (rb *Bitmap) CheckedAdd(x uint32) bool { newac := newArrayContainer() rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newac.iaddReturnMinimized(lowbits(x))) return true - } // AddInt adds the integer x to the bitmap (convenience method: the parameter is casted to uint32 and we call Add) @@ -998,7 +1009,6 @@ func (rb *Bitmap) CheckedRemove(x uint32) bool { return C.getCardinality() < oldcard } return false - } // IsEmpty returns true if the Bitmap is empty (it is faster than doing (GetCardinality() == 0)) @@ -1088,7 +1098,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1187,7 +1197,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1256,7 +1266,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1396,7 +1406,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1584,7 +1594,7 @@ main: } s1 = x1.highlowcontainer.getKeyAtIndex(pos1) s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1632,7 +1642,6 @@ func BitmapOf(dat ...uint32) *Bitmap { // The function uses 64-bit parameters even though a Bitmap stores 32-bit values because it is allowed and meaningful to use [0,uint64(0x100000000)) as a range // while uint64(0x100000000) cannot be represented as a 32-bit value. func (rb *Bitmap) Flip(rangeStart, rangeEnd uint64) { - if rangeEnd > MaxUint32+1 { panic("rangeEnd > MaxUint32+1") } @@ -1916,3 +1925,7 @@ func (rb *Bitmap) Stats() Statistics { } return stats } + +func (rb *Bitmap) Validate() error { + return rb.highlowcontainer.validate() +} diff --git a/roaring_test.go b/roaring_test.go index 080b9f02..7124dfd6 100644 --- a/roaring_test.go +++ b/roaring_test.go @@ -17,7 +17,6 @@ func checkValidity(t *testing.T, rb *Bitmap) { t.Helper() for _, c := range rb.highlowcontainer.containers { - switch c.(type) { case *arrayContainer: if c.getCardinality() > arrayDefaultMaxSize { @@ -74,6 +73,25 @@ func hashTest(t *testing.T, N uint64) { assert.Equal(t, count, len(hashes)) } +func buildRuns(includeBroken bool) *runContainer16 { + rc := &runContainer16{} + if includeBroken { + for i := 0; i < 100; i++ { + start := i * 100 + end := start + 100 + rc.iv = append(rc.iv, newInterval16Range(uint16(start), uint16(end))) + } + } + + for i := 0; i < 100; i++ { + start := i*100 + i*2 + end := start + 100 + rc.iv = append(rc.iv, newInterval16Range(uint16(start), uint16(end))) + } + + return rc +} + func TestReverseIteratorCount(t *testing.T) { array := []int{2, 63, 64, 65, 4095, 4096, 4097, 4159, 4160, 4161, 5000, 20000, 66666} for _, testSize := range array { @@ -477,7 +495,7 @@ func TestRangeRemovalFromContent(t *testing.T) { bm.RemoveRange(0, 30000) c := bm.GetCardinality() - assert.EqualValues(t, 00, c) + assert.EqualValues(t, 0o0, c) } func TestFlipOnEmpty(t *testing.T) { @@ -596,7 +614,7 @@ func TestBitmapExtra(t *testing.T) { clonebs1.InPlaceSymmetricDifference(bs2) assert.True(t, equalsBitSet(clonebs1, Xor(rb1, rb2))) - //testing NOTAND + // testing NOTAND clonebs1 = bs1.Clone() clonebs1.InPlaceDifference(bs2) assert.True(t, equalsBitSet(clonebs1, AndNot(rb1, rb2))) @@ -779,7 +797,7 @@ func TestBitmap(t *testing.T) { t.Run("Test AND 3", func(t *testing.T) { var arrayand [11256]uint32 - //393,216 + // 393,216 pos := 0 rr := NewBitmap() for k := 4000; k < 4256; k++ { @@ -856,7 +874,6 @@ func TestBitmap(t *testing.T) { assert.Equal(t, len(arrayres), len(arrayand)) assert.True(t, ok) - }) t.Run("Test AND 4", func(t *testing.T) { @@ -869,7 +886,7 @@ func TestBitmap(t *testing.T) { for i := 200000; i < 400000; i += 14 { rb2.AddInt(i) } - //TODO: Bitmap.And(bm,bm2) + // TODO: Bitmap.And(bm,bm2) andresult := And(rb, rb2) off := And(rb2, rb) @@ -1247,7 +1264,7 @@ func TestBitmap(t *testing.T) { rb := NewBitmap() rb1 := Flip(rb, 100000, 132000) rb2 := Flip(rb1, 65536, 120000) - //rbcard := rb2.GetCardinality() + // rbcard := rb2.GetCardinality() bs := bitset.New(0) for i := uint(65536); i < 100000; i++ { @@ -1303,7 +1320,7 @@ func TestBitmap(t *testing.T) { numCases := 1000 rb := NewBitmap() bs := bitset.New(0) - //Random r = new Random(3333); + // Random r = new Random(3333); checkTime := 2.0 for i := 0; i < numCases; i++ { @@ -1717,6 +1734,7 @@ func TestBitmap(t *testing.T) { assert.True(t, valide) }) } + func TestXORtest4(t *testing.T) { t.Run("XORtest 4", func(t *testing.T) { rb := NewBitmap() @@ -1764,7 +1782,7 @@ func TestXORtest4(t *testing.T) { rb.Xor(rb2) assert.True(t, xorresult2.Equals(rb)) }) - //need to add the massives + // need to add the massives } func TestNextMany(t *testing.T) { @@ -1874,7 +1892,7 @@ func rTest(t *testing.T, N int) { assert.True(t, equalsBitSet(clonebs1, Xor(rb1, rb2))) - //testing NOTAND + // testing NOTAND clonebs1 = bs1.Clone() clonebs1.InPlaceDifference(bs2) @@ -2618,6 +2636,143 @@ func TestFromBitSet(t *testing.T) { }) } +func TestRoaringArrayValidation(t *testing.T) { + a := newRoaringArray() + + assert.ErrorIs(t, a.validate(), ErrEmptyKeys) + + a.keys = append(a.keys, uint16(3), uint16(1)) + assert.ErrorIs(t, a.validate(), ErrKeySortOrder) + a.clear() + + // build up cardinality coherent arrays + a.keys = append(a.keys, uint16(1), uint16(3), uint16(10)) + assert.ErrorIs(t, a.validate(), ErrCardinalityConstraint) + a.containers = append(a.containers, &runContainer16{}, &runContainer16{}, &runContainer16{}) + assert.ErrorIs(t, a.validate(), ErrCardinalityConstraint) + a.needCopyOnWrite = append(a.needCopyOnWrite, true, false, true) + assert.Errorf(t, a.validate(), "zero intervals") +} + +func TestBitMapValidation(t *testing.T) { + bm := NewBitmap() + bm.AddRange(0, 100) + bm.AddRange(306, 406) + bm.AddRange(102, 202) + bm.AddRange(204, 304) + assert.NoError(t, bm.Validate()) + + randomEntries := make([]uint32, 0, 1000) + for i := 0; i < 1000; i++ { + randomEntries = append(randomEntries, rand.Uint32()) + } + + bm.AddMany(randomEntries) + assert.NoError(t, bm.Validate()) + + randomEntries = make([]uint32, 0, 1000) + for i := 0; i < 1000; i++ { + randomEntries = append(randomEntries, uint32(i)) + } + bm.AddMany(randomEntries) + assert.NoError(t, bm.Validate()) +} + +func TestBitMapValidationFromDeserialization(t *testing.T) { + // To understand what is going on here, read https://github.com/RoaringBitmap/RoaringFormatSpec + // Maintainers: The loader and corruptor are dependent on one another + // The tests expect a certain size, with values at certain location. + // The tests are geared toward single byte corruption. + + // There is no way to test Bitmap container corruption once the bitmap is deserialzied + + deserializationTests := []struct { + name string + loader func(bm *Bitmap) + corruptor func(s []byte) + err error + }{ + { + name: "Corrupts Run Length vs Num Runs", + loader: func(bm *Bitmap) { + bm.AddRange(0, 2) + bm.AddRange(4, 6) + bm.AddRange(8, 100) + }, + corruptor: func(s []byte) { + // 21 is the length of the run of the last run/range + // Shortening causes interval sum to be to short + s[21] = 1 + }, + err: ErrRunIntervalSize, + }, + { + name: "Corrupts Run Length", + loader: func(bm *Bitmap) { + bm.AddRange(100, 110) + }, + corruptor: func(s []byte) { + // 13 is the length of the run + // Setting to zero causes an invalid run + s[13] = 0 + }, + err: ErrRunIntervalLength, + }, + { + name: "Creates Interval Overlap", + loader: func(bm *Bitmap) { + bm.AddRange(100, 110) + bm.AddRange(115, 125) + }, + corruptor: func(s []byte) { + // sets the start of the second run + // Creates overlapping intervals + s[15] = 108 + }, + err: ErrRunIntervalOverlap, + }, + { + name: "Break Array Sort Order", + loader: func(bm *Bitmap) { + arrayEntries := make([]uint32, 0, 10) + for i := 0; i < 10; i++ { + arrayEntries = append(arrayEntries, uint32(i)) + } + bm.AddMany(arrayEntries) + }, + corruptor: func(s []byte) { + // breaks the sort order + s[34] = 0 + }, + err: ErrArrayIncorrectSort, + }, + } + + for _, tt := range deserializationTests { + t.Run(tt.name, func(t *testing.T) { + defer func() { + if err := recover(); err != nil { + } + }() + + bm := NewBitmap() + tt.loader(bm) + assert.NoError(t, bm.Validate()) + serialized, err := bm.ToBytes() + assert.NoError(t, err) + tt.corruptor(serialized) + corruptedDeserializedBitMap := NewBitmap() + corruptedDeserializedBitMap.ReadFrom(bytes.NewReader(serialized)) + assert.ErrorIs(t, corruptedDeserializedBitMap.Validate(), tt.err) + + corruptedDeserializedBitMap = NewBitmap() + corruptedDeserializedBitMap.MustReadFrom(bytes.NewReader(serialized)) + // We will never hit this because of the recover + t.Errorf("did not panic") + }) + } +} + func BenchmarkFromDense(b *testing.B) { testDense(func(name string, rb *Bitmap) { dense := make([]uint64, rb.DenseSize()) @@ -2679,7 +2834,7 @@ func BenchmarkInPlaceArrayUnions(b *testing.B) { for i := 0; i < 100; i++ { bitmap := NewBitmap() for j := 0; j < 100; j++ { - //keep all entries in [0,4096), so they stay arrays. + // keep all entries in [0,4096), so they stay arrays. bitmap.Add(uint32(rand.Intn(arrayDefaultMaxSize))) } componentBitmaps[i] = bitmap diff --git a/roaringarray.go b/roaringarray.go index 079195dd..52ccf483 100644 --- a/roaringarray.go +++ b/roaringarray.go @@ -3,6 +3,7 @@ package roaring import ( "bytes" "encoding/binary" + "errors" "fmt" "io" @@ -30,7 +31,7 @@ type container interface { iadd(x uint16) bool // inplace, returns true if x was new. iaddReturnMinimized(uint16) container // may change return type to minimize storage. - //addRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) + // addRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) iaddRange(start, endx int) container // i stands for inplace, range is [firstOfRange,endx) iremove(x uint16) bool // inplace, returns true if x was present. @@ -61,7 +62,7 @@ type container interface { lazyOR(r container) container lazyIOR(r container) container getSizeInBytes() int - //removeRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) + // removeRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) iremoveRange(start, final int) container // i stands for inplace, range is [firstOfRange,lastOfRange) selectInt(x uint16) int // selectInt returns the xth integer in the container serializedSizeInBytes() int @@ -71,6 +72,8 @@ type container interface { toEfficientContainer() container String() string containerType() contype + + validate() error } type contype uint8 @@ -82,6 +85,12 @@ const ( run32Contype ) +var ( + ErrEmptyKeys = errors.New("keys were empty") + ErrKeySortOrder = errors.New("keys were out of order") + ErrCardinalityConstraint = errors.New("size of arrays was not coherent") +) + // careful: range is [firstOfRange,lastOfRange] func rangeOfOnes(start, last int) container { if start > MaxUint16 { @@ -178,7 +187,6 @@ func (ra *roaringArray) appendCopiesUntil(sa roaringArray, stoppingKey uint16) { } else { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) - } } } @@ -204,7 +212,6 @@ func (ra *roaringArray) appendCopiesAfter(sa roaringArray, beforeStart uint16) { } else { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) - } } } @@ -239,7 +246,6 @@ func (ra *roaringArray) clear() { } func (ra *roaringArray) clone() *roaringArray { - sa := roaringArray{} sa.copyOnWrite = ra.copyOnWrite @@ -325,7 +331,6 @@ func (ra *roaringArray) getUnionedWritableContainer(pos int, other container) co return ra.getContainerAtIndex(pos).or(other) } return ra.getContainerAtIndex(pos).ior(other) - } func (ra *roaringArray) getWritableContainerAtIndex(i int) container { @@ -455,7 +460,6 @@ func (ra *roaringArray) headerSize() uint64 { return 4 + (size+7)/8 + 8*size // - 4 because we pack the size with the cookie } return 4 + 4 + 8*size - } // should be dirt cheap @@ -489,7 +493,7 @@ func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) { binary.LittleEndian.PutUint16(buf[2:], uint16(len(ra.keys)-1)) nw += 2 // compute isRun bitmap without temporary allocation - var runbitmapslice = buf[nw : nw+isRunSizeInBytes] + runbitmapslice := buf[nw : nw+isRunSizeInBytes] for i, c := range ra.containers { switch c.(type) { case *runContainer16: @@ -577,7 +581,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte // create is-run-container bitmap isRunBitmapSize := (int(size) + 7) / 8 isRunBitmap, err = stream.Next(isRunBitmapSize) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("malformed bitmap, failed to read is-run bitmap, got: %s", err) } @@ -596,7 +599,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte // descriptive header buf, err := stream.Next(2 * 2 * int(size)) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read descriptive header: %s", err) } @@ -637,13 +639,11 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte if isRunBitmap != nil && isRunBitmap[i/8]&(1<<(i%8)) != 0 { // run container nr, err := stream.ReadUInt16() - if err != nil { return 0, fmt.Errorf("failed to read runtime container size: %s", err) } buf, err := stream.Next(int(nr) * 4) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read runtime container content: %s", err) } @@ -656,7 +656,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte } else if card > arrayDefaultMaxSize { // bitmap container buf, err := stream.Next(arrayDefaultMaxSize * 2) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read bitmap container: %s", err) } @@ -670,7 +669,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte } else { // array container buf, err := stream.Next(card * 2) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read array container: %s", err) } @@ -759,3 +757,48 @@ func (ra *roaringArray) needsCopyOnWrite(i int) bool { func (ra *roaringArray) setNeedsCopyOnWrite(i int) { ra.needCopyOnWrite[i] = true } + +func (ra *roaringArray) checkKeysSorted() bool { + if len(ra.keys) == 0 || len(ra.keys) == 1 { + return true + } + previous := ra.keys[0] + for nextIdx := 1; nextIdx < len(ra.keys); nextIdx++ { + next := ra.keys[nextIdx] + if previous >= next { + return false + } + previous = next + + } + return true +} + +// validate checks the referential integrity +// ensures len(keys) == len(containers), recurses and checks each container type +func (ra *roaringArray) validate() error { + if len(ra.keys) == 0 { + return ErrEmptyKeys + } + + if !ra.checkKeysSorted() { + return ErrKeySortOrder + } + + if len(ra.keys) != len(ra.containers) { + return ErrCardinalityConstraint + } + + if len(ra.keys) != len(ra.needCopyOnWrite) { + return ErrCardinalityConstraint + } + + for _, container := range ra.containers { + err := container.validate() + if err != nil { + return err + } + } + + return nil +} diff --git a/runcontainer.go b/runcontainer.go index 7098ba28..6d5aef1d 100644 --- a/runcontainer.go +++ b/runcontainer.go @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ import ( + "errors" "fmt" "sort" "unsafe" @@ -59,6 +60,17 @@ type interval16 struct { length uint16 // length minus 1 } +var ( + ErrRunIntervalsEmpty = errors.New("run contained no interval") + ErrRunNonSorted = errors.New("runs were not sorted") + ErrRunIntervalLength = errors.New("interval had zero length") + ErrRunIntervalEqual = errors.New("intervals were equal") + ErrRunIntervalOverlap = errors.New("intervals overlapped or were continguous") + ErrRunIntervalSize = errors.New("too many intervals relative to data") + MaxNumIntervals = 2048 + MaxIntervalsSum = 2048 +) + func newInterval16Range(start, last uint16) interval16 { if last < start { panic(fmt.Sprintf("last (%d) cannot be smaller than start (%d)", last, start)) @@ -201,7 +213,6 @@ func newRunContainer16FromVals(alreadySorted bool, vals ...uint16) *runContainer // somewhat efficiently. For reference, see the Java // https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/RunContainer.java#L145-L192 func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 { - rc := &runContainer16{} nbrRuns := bc.numberOfRuns() if nbrRuns == 0 { @@ -251,7 +262,6 @@ func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 { curWord = curWordWith1s & (curWordWith1s + 1) // We've lathered and rinsed, so repeat... } - } // newRunContainer16FromArray populates a new @@ -293,7 +303,6 @@ func newRunContainer16FromArray(arr *arrayContainer) *runContainer16 { // If you have a small number of additions to an already // big runContainer16, calling Add() may be faster. func (rc *runContainer16) set(alreadySorted bool, vals ...uint16) { - rc2 := newRunContainer16FromVals(alreadySorted, vals...) un := rc.union(rc2) rc.iv = un.iv @@ -374,7 +383,6 @@ func intersectInterval16s(a, b interval16) (res interval16, isEmpty bool) { // union merges two runContainer16s, producing // a new runContainer16 with the union of rc and b. func (rc *runContainer16) union(b *runContainer16) *runContainer16 { - // rc is also known as 'a' here, but golint insisted we // call it rc for consistency with the rest of the methods. @@ -457,7 +465,6 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { break aAdds } } - } if !bDone { @@ -471,7 +478,6 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { break bAdds } } - } m = append(m, merged) @@ -489,7 +495,6 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { // unionCardinality returns the cardinality of the merger of two runContainer16s, the union of rc and b. func (rc *runContainer16) unionCardinality(b *runContainer16) uint { - // rc is also known as 'a' here, but golint insisted we // call it rc for consistency with the rest of the methods. answer := uint(0) @@ -528,7 +533,7 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { } if !mergedUpdated { // we know that merged is disjoint from cura and curb - //m = append(m, merged) + // m = append(m, merged) answer += uint(merged.last()) - uint(merged.start) + 1 mergedUsed = false } @@ -539,11 +544,11 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { if !canMerge16(cura, curb) { if cura.start < curb.start { answer += uint(cura.last()) - uint(cura.start) + 1 - //m = append(m, cura) + // m = append(m, cura) na++ } else { answer += uint(curb.last()) - uint(curb.start) + 1 - //m = append(m, curb) + // m = append(m, curb) nb++ } } else { @@ -574,7 +579,6 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { break aAdds } } - } if !bDone { @@ -588,10 +592,9 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { break bAdds } } - } - //m = append(m, merged) + // m = append(m, merged) answer += uint(merged.last()) - uint(merged.start) + 1 } for _, r := range rc.iv[na:] { @@ -615,7 +618,6 @@ func (rc *runContainer16) indexOfIntervalAtOrAfter(key int, startIndex int) int // intersect returns a new runContainer16 holding the // intersection of rc (also known as 'a') and b. func (rc *runContainer16) intersect(b *runContainer16) *runContainer16 { - a := rc numa := int(len(a.iv)) numb := int(len(b.iv)) @@ -645,8 +647,7 @@ func (rc *runContainer16) intersect(b *runContainer16) *runContainer16 { toploop: for acuri < numa && bcuri < numb { - isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = - intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) + isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) if !isOverlap { switch { @@ -664,7 +665,6 @@ toploop: } bstart = int(b.iv[bcuri].start) } - } else { // isOverlap output = append(output, intersection) @@ -748,8 +748,7 @@ toploop: for acuri < numa && bcuri < numb { pass++ - isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = - intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) + isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) if !isOverlap { switch { @@ -767,7 +766,6 @@ toploop: } bstart = int(b.iv[bcuri].start) } - } else { // isOverlap answer += int(intersection.last()) - int(intersection.start) + 1 @@ -1014,8 +1012,10 @@ func newRunContainer16TakeOwnership(iv []interval16) *runContainer16 { return rc } -const baseRc16Size = int(unsafe.Sizeof(runContainer16{})) -const perIntervalRc16Size = int(unsafe.Sizeof(interval16{})) +const ( + baseRc16Size = int(unsafe.Sizeof(runContainer16{})) + perIntervalRc16Size = int(unsafe.Sizeof(interval16{})) +) const baseDiskRc16Size = int(unsafe.Sizeof(uint16(0))) @@ -1274,7 +1274,7 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int { break } } else { - ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16 + ri.curPosInIndex += uint16(moreVals) // moreVals always fits in uint16 } } @@ -1315,7 +1315,7 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { break } } else { - ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16 + ri.curPosInIndex += uint16(moreVals) // moreVals always fits in uint16 } } @@ -1324,7 +1324,6 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { // remove removes key from the container. func (rc *runContainer16) removeKey(key uint16) (wasPresent bool) { - var index int index, wasPresent, _ = rc.search(int(key)) if !wasPresent { @@ -1361,7 +1360,7 @@ func (rc *runContainer16) deleteAt(curIndex *int, curPosInIndex *uint16) { *curPosInIndex-- // if we leave *curIndex alone, then Next() will work properly even after the delete. default: - //middle + // middle // split into two, adding an interval16 new0 := newInterval16Range(rc.iv[ci].start, rc.iv[ci].start+*curPosInIndex-1) @@ -1376,7 +1375,6 @@ func (rc *runContainer16) deleteAt(curIndex *int, curPosInIndex *uint16) { *curIndex++ *curPosInIndex = 0 } - } func have4Overlap16(astart, alast, bstart, blast int) bool { @@ -1503,6 +1501,26 @@ func (iv interval16) isSuperSetOf(b interval16) bool { return iv.start <= b.start && b.last() <= iv.last() } +func (iv interval16) isNonContiguousDisjoint(b interval16) bool { + // cover the zero start case + if iv.start == b.start { + return false + } + + nonContiguous1 := iv.start == b.last()+1 || iv.last() == b.start+1 + nonContiguous2 := b.start == iv.last()+1 || b.last() == iv.start+1 + if nonContiguous1 || nonContiguous2 { + return false + } + ivl := iv.last() + bl := b.last() + + c1 := iv.start <= b.start && b.start <= ivl + c2 := b.start <= iv.start && iv.start <= bl + + return !c1 && !c2 +} + func (iv interval16) subtractInterval(del interval16) (left []interval16, delcount int) { isect, isEmpty := intersectInterval16s(iv, del) @@ -1678,7 +1696,6 @@ func (rc *runContainer16) isubtract(del interval16) { // port of run_container_andnot from CRoaring... // https://github.com/RoaringBitmap/CRoaring/blob/master/src/containers/run.c#L435-L496 func (rc *runContainer16) AndNotRunContainer16(b *runContainer16) *runContainer16 { - if len(b.iv) == 0 || len(rc.iv) == 0 { return rc } @@ -1949,7 +1966,6 @@ func (rc *runContainer16) getManyIterator() manyIterable { // add the values in the range [firstOfRange, endx). endx // is still abe to express 2^16 because it is an int not an uint16. func (rc *runContainer16) iaddRange(firstOfRange, endx int) container { - if firstOfRange > endx { panic(fmt.Sprintf("invalid %v = endx > firstOfRange", endx)) } @@ -2002,7 +2018,6 @@ func (rc *runContainer16) not(firstOfRange, endx int) container { // makes 2 more passes through the arrays than should be // strictly necessary. Measure both ways though--this may not matter. func (rc *runContainer16) Not(firstOfRange, endx int) *runContainer16 { - if firstOfRange > endx { panic(fmt.Sprintf("invalid %v = endx > firstOfRange == %v", endx, firstOfRange)) } @@ -2066,12 +2081,12 @@ func (rc *runContainer16) equals(o container) bool { rit := rc.getShortIterator() bit := o.getShortIterator() - //k := 0 + // k := 0 for rit.hasNext() { if bit.next() != rit.next() { return false } - //k++ + // k++ } return true } @@ -2132,7 +2147,7 @@ func (rc *runContainer16) andBitmapContainerCardinality(bc *bitmapContainer) int for i := range rc.iv { answer += bc.getCardinalityInRange(uint(rc.iv[i].start), uint(rc.iv[i].last())+1) } - //bc.computeCardinality() + // bc.computeCardinality() return answer } @@ -2150,7 +2165,7 @@ func (rc *runContainer16) orArray(ac *arrayContainer) container { } intervals, cardMinusOne := runArrayUnionToRuns(rc, ac) result := newRunContainer16TakeOwnership(intervals) - if len(intervals) >= 2048 && cardMinusOne >= arrayDefaultMaxSize { + if len(intervals) >= MaxNumIntervals && cardMinusOne >= arrayDefaultMaxSize { return newBitmapContainerFromRun(result) } if len(intervals)*2 > 1+int(cardMinusOne) { @@ -2190,7 +2205,6 @@ func (rc *runContainer16) inplaceUnion(rc2 *runContainer16) container { } func (rc *runContainer16) iorBitmapContainer(bc *bitmapContainer) container { - it := bc.getShortIterator() for it.hasNext() { rc.Add(it.next()) @@ -2206,11 +2220,11 @@ func (rc *runContainer16) iorArray(ac *arrayContainer) container { return rc } var cardMinusOne uint16 - //TODO: perform the union algorithm in-place using rc.iv + // TODO: perform the union algorithm in-place using rc.iv // this can be done with methods like the in-place array container union // but maybe lazily moving the remaining elements back. rc.iv, cardMinusOne = runArrayUnionToRuns(rc, ac) - if len(rc.iv) >= 2048 && cardMinusOne >= arrayDefaultMaxSize { + if len(rc.iv) >= MaxNumIntervals && cardMinusOne >= arrayDefaultMaxSize { return newBitmapContainerFromRun(rc) } if len(rc.iv)*2 > 1+int(cardMinusOne) { @@ -2511,7 +2525,6 @@ func (rc *runContainer16) toArrayContainer() *arrayContainer { } func newRunContainer16FromContainer(c container) *runContainer16 { - switch x := c.(type) { case *runContainer16: return x.Clone() @@ -2622,3 +2635,71 @@ func (rc *runContainer16) addOffset(x uint16) (container, container) { return low, high } + +// isNonContiguousDisjoint returns an error if the intervals overlap e.g have non-empty intersection +func isNonContiguousDisjoint(outer interval16, inner interval16) error { + if !outer.isNonContiguousDisjoint(inner) { + return ErrRunIntervalOverlap + } + + return nil +} + +// validate checks the run container referential integrity +// Ensures runs are not degenerate, non-contiguous and non-overlapping +func (rc *runContainer16) validate() error { + if rc.getCardinality() == 0 { + return ErrRunIntervalsEmpty + } + + intervalsSum := 0 + for outeridx := range rc.iv { + + if rc.iv[outeridx].length == 0 { + return ErrRunIntervalLength + } + + outerInterval := rc.iv[outeridx] + + intervalsSum += outerInterval.runlen() + for inneridx := outeridx + 1; inneridx < len(rc.iv); inneridx++ { + + innerInterval := rc.iv[inneridx] + + if outerInterval.equal(innerInterval) { + return ErrRunIntervalEqual + } + + // only check the start of runs + // if the run length overlap the next check will catch that. + if outerInterval.start >= innerInterval.start { + return ErrRunNonSorted + } + + err := isNonContiguousDisjoint(outerInterval, innerInterval) + if err != nil { + return err + } + } + + } + /* + if number of distinct values in the container >= 2048 then + check that the number of runs is no more than 2047 + (otherwise you could use a bitset container) + else + check that the number of runs < (number of distinct values) / 2 + (otherwise you could use an array container) + */ + if MaxIntervalsSum <= intervalsSum { + if !(len(rc.iv) < MaxNumIntervals) { + return ErrRunIntervalSize + } + } else { + if !(len(rc.iv) < (intervalsSum / 2)) { + return ErrRunIntervalSize + } + } + + return nil +} diff --git a/runcontainer_test.go b/runcontainer_test.go index 86549a2c..3e6ce3ce 100644 --- a/runcontainer_test.go +++ b/runcontainer_test.go @@ -190,7 +190,8 @@ func TestRleRunIterator16(t *testing.T) { rc := newRunContainer16CopyIv([]interval16{ newInterval16Range(4, 7), newInterval16Range(11, 13), - newInterval16Range(18, 21)}) + newInterval16Range(18, 21), + }) assert.EqualValues(t, 11, rc.getCardinality()) @@ -210,7 +211,8 @@ func TestRleRunIterator16(t *testing.T) { rc := newRunContainer16CopyIv([]interval16{ newInterval16Range(4, 7), newInterval16Range(11, 13), - newInterval16Range(18, 21)}) + newInterval16Range(18, 21), + }) expectedCard := 11 expectedVals := []uint32{4, 5, 6, 7, 11, 12, 13, 18, 19, 20, 21} hs := uint32(1 << 16) @@ -307,7 +309,6 @@ func TestRleRunIterator16(t *testing.T) { } func TestRleRunReverseIterator16(t *testing.T) { - t.Run("RunReverseIterator16 unit tests for next, hasNext, and peekNext should pass", func(t *testing.T) { { rc := newRunContainer16() @@ -411,12 +412,14 @@ func TestRleIntersection16(t *testing.T) { newInterval16Range(2, 4), newInterval16Range(8, 9), newInterval16Range(14, 16), - newInterval16Range(20, 22)}, + newInterval16Range(20, 22), + }, ) f := newRunContainer16TakeOwnership( []interval16{ newInterval16Range(3, 18), - newInterval16Range(22, 23)}, + newInterval16Range(22, 23), + }, ) { @@ -453,7 +456,6 @@ func TestRleIntersection16(t *testing.T) { func TestRleRandomIntersection16(t *testing.T) { t.Run("RunContainer.intersect of two RunContainers should return their intersection, and this should hold over randomized container content when compared to intersection done with hash maps", func(t *testing.T) { - seed := int64(42) rand.Seed(seed) @@ -510,7 +512,7 @@ func TestRleRandomIntersection16(t *testing.T) { // RunContainer's Intersect brle := newRunContainer16FromVals(false, b...) - //arle := newRunContainer16FromVals(false, a...) + // arle := newRunContainer16FromVals(false, a...) // instead of the above line, create from array // get better test coverage: arr := newArrayContainerRange(int(first), int(second)) @@ -519,7 +521,7 @@ func TestRleRandomIntersection16(t *testing.T) { isect := arle.intersect(brle) - //showHash("hashi", hashi) + // showHash("hashi", hashi) for k := range hashi { assert.True(t, isect.contains(uint16(k))) @@ -532,14 +534,11 @@ func TestRleRandomIntersection16(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRleRandomUnion16(t *testing.T) { - t.Run("RunContainer.union of two RunContainers should return their union, and this should hold over randomized container content when compared to union done with hash maps", func(t *testing.T) { - seed := int64(42) rand.Seed(seed) @@ -580,7 +579,7 @@ func TestRleRandomUnion16(t *testing.T) { hashu[k] = true } - //showHash("hashu", hashu) + // showHash("hashu", hashu) // RunContainer's Union arle := newRunContainer16() @@ -898,8 +897,8 @@ func TestRle16RandomIntersectAgainstOtherContainers010(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of intersect: hashi := make(map[int]bool) @@ -948,12 +947,10 @@ func TestRle16RandomIntersectAgainstOtherContainers010(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16RandomUnionAgainstOtherContainers011(t *testing.T) { - t.Run("runContainer16 `or` operation against other container types should correctly do the intersection", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -982,8 +979,8 @@ func TestRle16RandomUnionAgainstOtherContainers011(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of union hashi := make(map[int]bool) @@ -1030,12 +1027,10 @@ func TestRle16RandomUnionAgainstOtherContainers011(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16RandomInplaceUnionAgainstOtherContainers012(t *testing.T) { - t.Run("runContainer16 `ior` inplace union operation against other container types should correctly do the intersection", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1064,8 +1059,8 @@ func TestRle16RandomInplaceUnionAgainstOtherContainers012(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of union hashi := make(map[int]bool) @@ -1118,12 +1113,10 @@ func TestRle16RandomInplaceUnionAgainstOtherContainers012(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16RandomInplaceIntersectAgainstOtherContainers014(t *testing.T) { - t.Run("runContainer16 `iand` inplace-and operation against other container types should correctly do the intersection", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1152,8 +1145,8 @@ func TestRle16RandomInplaceIntersectAgainstOtherContainers014(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of intersect: hashi := make(map[int]bool) @@ -1206,12 +1199,10 @@ func TestRle16RandomInplaceIntersectAgainstOtherContainers014(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16RemoveApi015(t *testing.T) { - t.Run("runContainer16 `remove` (a minus b) should work", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1240,8 +1231,8 @@ func TestRle16RemoveApi015(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of remove: hashrm := make(map[int]bool) @@ -1270,7 +1261,6 @@ func TestRle16RemoveApi015(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } @@ -1283,7 +1273,6 @@ func showArray16(a []uint16, name string) { } func TestRle16RandomAndNot016(t *testing.T) { - t.Run("runContainer16 `andNot` operation against other container types should correctly do the and-not operation", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1312,8 +1301,8 @@ func TestRle16RandomAndNot016(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of and-not hashi := make(map[int]bool) @@ -1361,12 +1350,10 @@ func TestRle16RandomAndNot016(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16RandomInplaceAndNot017(t *testing.T) { - t.Run("runContainer16 `iandNot` operation against other container types should correctly do the inplace-and-not operation", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1395,8 +1382,8 @@ func TestRle16RandomInplaceAndNot017(t *testing.T) { mb[r1] = true } - //showArray16(a, "a") - //showArray16(b, "b") + // showArray16(a, "a") + // showArray16(b, "b") // hash version of and-not hashi := make(map[int]bool) @@ -1447,12 +1434,10 @@ func TestRle16RandomInplaceAndNot017(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16InversionOfIntervals018(t *testing.T) { - t.Run("runContainer `invert` operation should do a NOT on the set of intervals, in-place", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1470,7 +1455,7 @@ func TestRle16InversionOfIntervals018(t *testing.T) { a := []uint16{} // hashNotA will be NOT ma - //for i := 0; i < n; i++ { + // for i := 0; i < n; i++ { for i := 0; i < MaxUint16+1; i++ { hashNotA[i] = true } @@ -1504,12 +1489,10 @@ func TestRle16InversionOfIntervals018(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } func TestRle16SubtractionOfIntervals019(t *testing.T) { - t.Run("runContainer `subtract` operation removes an interval in-place", func(t *testing.T) { // basics @@ -1639,7 +1622,6 @@ func TestRle16Rank020(t *testing.T) { } func TestRle16NotAlsoKnownAsFlipRange021(t *testing.T) { - t.Run("runContainer `Not` operation should flip the bits of a range on the new returned container", func(t *testing.T) { seed := int64(42) rand.Seed(seed) @@ -1866,7 +1848,6 @@ func TestRleIntersects023(t *testing.T) { for i := range trials { tester(trials[i]) } - }) } @@ -1950,7 +1931,7 @@ func TestRle16RandomFillLeastSignificant16bits029(t *testing.T) { ma[r0] = true } - //showArray16(a, "a") + // showArray16(a, "a") // RunContainer rc := newRunContainer16FromVals(false, a...) @@ -2013,7 +1994,7 @@ func TestRle16RandomGetShortIterator030(t *testing.T) { ma[r0] = true } - //showArray16(a, "a") + // showArray16(a, "a") // RunContainer rc := newRunContainer16FromVals(false, a...) @@ -2074,7 +2055,7 @@ func TestRle16RandomIaddRangeIremoveRange031(t *testing.T) { ma[r0] = true } - //showArray16(a, "a") + // showArray16(a, "a") // RunContainer rc := newRunContainer16FromVals(false, a...) @@ -2198,14 +2179,14 @@ type twofer struct { func TestAllContainerMethodsAllContainerTypesWithData067(t *testing.T) { t.Run("each of the container methods that takes two containers should handle all 3x3==9 possible ways of being called -- and return results that agree with each other", func(t *testing.T) { - seed := int64(42) rand.Seed(seed) srang := newInterval16Range(MaxUint16-100, MaxUint16) trials := []trial{ {n: 100, percentFill: .7, ntrial: 1, numRandomOpsPass: 100}, - {n: 100, percentFill: .7, ntrial: 1, numRandomOpsPass: 100, srang: &srang}} + {n: 100, percentFill: .7, ntrial: 1, numRandomOpsPass: 100, srang: &srang}, + } tester := func(tr trial) { for j := 0; j < tr.ntrial; j++ { @@ -2333,6 +2314,146 @@ func TestRuntimeIteratorAdvance(t *testing.T) { testContainerIteratorAdvance(t, newRunContainer16()) } +func TestIntervalOverlaps(t *testing.T) { + // contiguous runs + a := newInterval16Range(0, 9) + b := newInterval16Range(10, 20) + + // Ensure the function is reflexive + assert.False(t, a.isNonContiguousDisjoint(a)) + assert.False(t, a.isNonContiguousDisjoint(b)) + // Ensure the function is symmetric + assert.False(t, b.isNonContiguousDisjoint(a)) + assert.Error(t, isNonContiguousDisjoint(a, b)) + + // identical runs + a = newInterval16Range(0, 9) + b = newInterval16Range(0, 9) + + assert.False(t, a.isNonContiguousDisjoint(b)) + assert.False(t, b.isNonContiguousDisjoint(a)) + assert.Error(t, isNonContiguousDisjoint(a, b)) + + // identical start runs + a = newInterval16Range(0, 9) + b = newInterval16Range(0, 20) + + assert.False(t, a.isNonContiguousDisjoint(b)) + assert.False(t, b.isNonContiguousDisjoint(a)) + assert.Error(t, isNonContiguousDisjoint(a, b)) + + // overlapping runs + a = newInterval16Range(0, 12) + b = newInterval16Range(10, 20) + + assert.False(t, a.isNonContiguousDisjoint(b)) + assert.Error(t, isNonContiguousDisjoint(a, b)) + + // subset runs + a = newInterval16Range(0, 12) + b = newInterval16Range(5, 9) + + assert.False(t, a.isNonContiguousDisjoint(b)) + assert.Error(t, isNonContiguousDisjoint(a, b)) + + // degenerate runs + a = newInterval16Range(0, 0) + b = newInterval16Range(5, 5) + + assert.True(t, a.isNonContiguousDisjoint(b)) + assert.NoError(t, isNonContiguousDisjoint(a, b)) + + // disjoint non-contiguous runs + a = newInterval16Range(0, 100) + b = newInterval16Range(1000, 2000) + + assert.True(t, a.isNonContiguousDisjoint(b)) + assert.NoError(t, isNonContiguousDisjoint(a, b)) +} + +func TestIntervalValidationFailing(t *testing.T) { + rc := &runContainer16{} + assert.Error(t, rc.validate()) + + a := newInterval16Range(0, 9) + b := newInterval16Range(0, 9) + rc = &runContainer16{} + rc.iv = append(rc.iv, a, b) + assert.ErrorIs(t, rc.validate(), ErrRunIntervalEqual) + + a = newInterval16Range(0, 9) + b = newInterval16Range(10, 20) + + rc = &runContainer16{} + rc.iv = append(rc.iv, a, b) + assert.ErrorIs(t, rc.validate(), ErrRunIntervalOverlap) + + a = newInterval16Range(0, 12) + b = newInterval16Range(10, 20) + + rc = &runContainer16{} + rc.iv = append(rc.iv, a, b) + assert.Error(t, rc.validate(), ErrRunIntervalOverlap) + + c := newInterval16Range(100, 150) + d := newInterval16Range(1000, 10000) + + rc = &runContainer16{} + rc.iv = append(rc.iv, a, b, c, d) + assert.ErrorIs(t, rc.validate(), ErrRunIntervalOverlap) + + a = newInterval16Range(0, 10) + b = newInterval16Range(100, 200) + + // missort + rc = &runContainer16{} + rc.iv = append(rc.iv, b, a) + assert.ErrorIs(t, rc.validate(), ErrRunNonSorted) + + rc = &runContainer16{} + start := -4 + for i := 0; i < MaxNumIntervals; i++ { + start += 4 + end := start + 2 + a := newInterval16Range(uint16(start), uint16(end)) + rc.iv = append(rc.iv, a) + + } + assert.ErrorIs(t, rc.validate(), ErrRunIntervalSize) + + // too many small runs, use array + rc = &runContainer16{} + start = -3 + for i := 0; i < 10; i++ { + start += 3 + end := start + 1 + a := newInterval16Range(uint16(start), uint16(end)) + rc.iv = append(rc.iv, a) + + } + assert.ErrorIs(t, rc.validate(), ErrRunIntervalSize) +} + +func TestIntervalValidationsPassing(t *testing.T) { + rc := &runContainer16{} + a := newInterval16Range(0, 10) + b := newInterval16Range(100, 200) + rc.iv = append(rc.iv, a, b) + assert.NoError(t, rc.validate()) + + // Large total sum, but enough intervals + rc = &runContainer16{} + a = newInterval16Range(0, uint16(MaxIntervalsSum+1)) + rc.iv = append(rc.iv, a) + assert.NoError(t, rc.validate()) + + rc = &runContainer16{} + a = newInterval16Range(0, uint16(MaxIntervalsSum+1)) + b = newInterval16Range(uint16(MaxIntervalsSum+3), uint16(MaxIntervalsSum*2)) + rc.iv = append(rc.iv, a, b) + assert.NoError(t, rc.validate()) +} + // go test -bench BenchmarkShortIteratorAdvance -run - func BenchmarkShortIteratorAdvanceRuntime(b *testing.B) { benchmarkContainerIteratorAdvance(b, newRunContainer16()) @@ -2346,7 +2467,6 @@ func BenchmarkShortIteratorNextRuntime(b *testing.B) { // generate random contents, then return that same // logical content in three different container types func getRandomSameThreeContainers(tr trial) (*arrayContainer, *runContainer16, *bitmapContainer) { - ma := make(map[int]bool) n := tr.n diff --git a/serialization_frozen_test.go b/serialization_frozen_test.go index 8ed7f3b2..957cc6c9 100644 --- a/serialization_frozen_test.go +++ b/serialization_frozen_test.go @@ -10,6 +10,8 @@ import ( "os" "reflect" "testing" + + "github.com/stretchr/testify/assert" ) func TestFrozenFormat(t *testing.T) { @@ -134,3 +136,88 @@ func TestFrozenFormat(t *testing.T) { }) } } + +func TestBitMapValidationFromFrozen(t *testing.T) { + // To understand what is going on here, read https://github.com/RoaringBitmap/RoaringFormatSpec + // Maintainers: The loader and corruptor are dependent on one another + // The tests expect a certain size, with values at certain location. + // The tests are geared toward single byte corruption. + + // There is no way to test Bitmap container corruption once the bitmap is deserialzied + + deserializationTests := []struct { + name string + loader func(bm *Bitmap) + corruptor func(s []byte) + err error + }{ + { + name: "Corrupts Run Length vs Num Runs", + loader: func(bm *Bitmap) { + bm.AddRange(0, 2) + bm.AddRange(4, 6) + bm.AddRange(8, 100) + }, + corruptor: func(s []byte) { + // 21 is the length of the run of the last run/range + // Shortening causes interval sum to be to short + s[10] = 1 + }, + err: ErrRunIntervalSize, + }, + { + name: "Corrupts Run Length", + loader: func(bm *Bitmap) { + bm.AddRange(100, 110) + }, + corruptor: func(s []byte) { + // 13 is the length of the run + // Setting to zero causes an invalid run + s[2] = 0 + }, + err: ErrRunIntervalLength, + }, + { + name: "Creates Interval Overlap", + loader: func(bm *Bitmap) { + bm.AddRange(100, 110) + bm.AddRange(115, 125) + }, + corruptor: func(s []byte) { + // sets the start of the second run + // Creates overlapping intervals + s[4] = 108 + }, + err: ErrRunIntervalOverlap, + }, + { + name: "Break Array Sort Order", + loader: func(bm *Bitmap) { + arrayEntries := make([]uint32, 0, 10) + for i := 0; i < 10; i++ { + arrayEntries = append(arrayEntries, uint32(i)) + } + bm.AddMany(arrayEntries) + }, + corruptor: func(s []byte) { + // breaks the sort order + s[4] = 0 + }, + err: ErrArrayIncorrectSort, + }, + } + + for _, tt := range deserializationTests { + t.Run(tt.name, func(t *testing.T) { + bm := NewBitmap() + tt.loader(bm) + assert.NoError(t, bm.Validate()) + serialized, err := bm.Freeze() + assert.NoError(t, err) + tt.corruptor(serialized) + corruptedDeserializedBitMap := NewBitmap() + + assert.ErrorIs(t, corruptedDeserializedBitMap.MustFrozenView(serialized), tt.err) + }) + } +} diff --git a/serialization_littleendian.go b/serialization_littleendian.go index 6e3a5d55..16d356ca 100644 --- a/serialization_littleendian.go +++ b/serialization_littleendian.go @@ -299,6 +299,15 @@ func (rb *Bitmap) FrozenView(buf []byte) error { return rb.highlowcontainer.frozenView(buf) } +func (rb *Bitmap) MustFrozenView(buf []byte) error { + if err := rb.FrozenView(buf); err != nil { + return err + } + err := rb.Validate() + + return err +} + /* Verbatim specification from CRoaring. * * FROZEN SERIALIZATION FORMAT DESCRIPTION