From 6c6358e6d4f71f7eb3eff67c2bbcf67ef94621d2 Mon Sep 17 00:00:00 2001 From: Achille Roussel Date: Fri, 30 Aug 2019 10:08:26 -0700 Subject: [PATCH 1/6] optimize implementation --- go.mod | 5 +++ go.sum | 2 + jenks.go | 111 ++++++++++++++++++++++++++++++++------------------ jenks_test.go | 27 +++++++----- 4 files changed, 95 insertions(+), 50 deletions(-) create mode 100644 go.mod create mode 100644 go.sum diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..d5f3c68 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/segmentio/jenks + +go 1.12 + +require github.com/ThinkingLogic/jenks v1.1.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..efee7a2 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/ThinkingLogic/jenks v1.1.1 h1:VN9M1nKx0aCjm1T1bh/rm/xlO4RDmvfUznORhQdBFdk= +github.com/ThinkingLogic/jenks v1.1.1/go.mod h1:kjOGWk0lcN7icl/rkxkIEHnKlZIbbiunqanQ2hxBR+I= diff --git a/jenks.go b/jenks.go index d5f2dd0..3587019 100644 --- a/jenks.go +++ b/jenks.go @@ -1,11 +1,11 @@ package jenks import ( + "fmt" "math" "sort" - "fmt" - "strings" "strconv" + "strings" ) // Jenks natural breaks optimization (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization) @@ -20,16 +20,15 @@ func NaturalBreaks(data []float64, nClasses int) []float64 { data = sortData(data) // sanity check - uniq := deduplicate(data) - if nClasses >= len(uniq) { - return uniq + if nClasses >= countUniqueValues(data) { + return deduplicate(data) } // get our basic matrices (we only need lower class limits here) lowerClassLimits, _ := getMatrices(data, nClasses) // extract nClasses out of the computed matrices - return breaks(data, lowerClassLimits, nClasses) + return breaks(data, lowerClassLimits, nClasses, nClasses) } // AllNaturalBreaks finds all natural breaks in the data, for every set of breaks between 2 breaks and maxClasses. @@ -40,9 +39,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 { data = sortData(data) // sanity check - uniq := deduplicate(data) - if maxClasses > len(uniq) { - maxClasses = len(uniq) + uniq := countUniqueValues(data) + if maxClasses > uniq { + maxClasses = uniq } // get our basic matrices (we only need lower class limits here) @@ -51,9 +50,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 { // extract nClasses out of the computed matrices allBreaks := [][]float64{} for i := 2; i <= maxClasses; i++ { - nClasses := breaks(data, lowerClassLimits, i) - if i == len(uniq) { - nClasses = uniq + nClasses := breaks(data, lowerClassLimits, maxClasses, i) + if i == uniq { + nClasses = deduplicate(data) } allBreaks = append(allBreaks, nClasses) } @@ -71,7 +70,7 @@ func Round(breaks []float64, data []float64) []float64 { dataIdx := sort.SearchFloat64s(data, breaks[breakIdx]) var floor float64 if dataIdx == 0 { // make sure we can't go below breaks[i] - (breaks[i+1]-breaks[i]) - floor = data[0] - (breaks[breakIdx+1]-breaks[breakIdx]) + floor = data[0] - (breaks[breakIdx+1] - breaks[breakIdx]) } else { floor = data[dataIdx-1] } @@ -84,7 +83,7 @@ func Round(breaks []float64, data []float64) []float64 { func roundValue(initialValue float64, floor float64) float64 { b := []byte(strings.Trim(fmt.Sprintf("%f", initialValue), "0")) value := initialValue - for i := len(b)-1; i >= 0; i-- { + for i := len(b) - 1; i >= 0; i-- { if b[i] != '.' { b[i] = '0' round, e := strconv.ParseFloat(string(b), 64) @@ -121,38 +120,52 @@ func deduplicate(data []float64) []float64 { return uniq } +// countUniqueValues returns the number of unique values in the sorted array of +// data points passed as arguments. This function is used as an optimization to +// avoid calling deduplicate for the common case where there are more values +// than classes. +func countUniqueValues(data []float64) int { + n := 0 + x := math.NaN() + for _, v := range data { + if v != x { + x = v + n++ + } + } + return n +} + // getMatrices Computes the matrices required for Jenks breaks. // These matrices can be used for any classing of data with 'classes <= n_classes' -func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) { +func getMatrices(data []float64, nClasses int) ([]int, []float64) { + x := len(data) + 1 + y := nClasses + 1 + n := mat2len(x, y) // in the original implementation, these matrices are referred to // as 'LC' and 'OP' // // * lowerClassLimits (LC): optimal lower class limits // * variance_combinations (OP): optimal variance combinations for all classes - lowerClassLimits := make([][]int, len(data)+1) - varianceCombinations := make([][]float64, len(data)+1) + lowerClassLimits := make([]int, n) + varianceCombinations := make([]float64, n) // the variance, as computed at each step in the calculation variance := 0.0 - // Initialize and fill each matrix with zeroes - for i := 0; i < len(data)+1; i++ { - lowerClassLimits[i] = make([]int, nClasses+1) - varianceCombinations[i] = make([]float64, nClasses+1) - } + for i := 1; i < y; i++ { + index := mat2idx(1, i, y) + lowerClassLimits[index] = 1 + varianceCombinations[index] = 0 - for i := 1; i < nClasses+1; i++ { - lowerClassLimits[1][i] = 1 - varianceCombinations[1][i] = 0 - // in the original implementation, 'Infinity' is used but - // math.MaxFloat64 will do. - for j := 2; j < len(data)+1; j++ { - varianceCombinations[j][i] = math.MaxFloat64 + for j := 2; j < x; j++ { + varianceCombinations[mat2idx(j, i, y)] = math.Inf(+1) } } - for l := 2; l < len(data)+1; l++ { + for l := 2; l < x; l++ { + i1 := mat2idx(l, 0, y) // keep multiplication out of the inner loops // sum was 'SZ' originally. // this is the sum of the values seen thus far when calculating variance. @@ -183,21 +196,31 @@ func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) { // of samples. variance = sumSquares - (sum*sum)/w if currentIndex != 0 { - for j := 2; j < nClasses+1; j++ { + // keep multiplication out of the inner loop + i2 := mat2idx(currentIndex, 0, y) + + for j := 2; j < y; j++ { // if adding this element to an existing class // will increase its variance beyond the limit, break // the class at this point, setting the lower_class_limit // at this point. - if varianceCombinations[l][j] >= (variance + varianceCombinations[currentIndex][j-1]) { - lowerClassLimits[l][j] = lowerClassLimit - varianceCombinations[l][j] = variance + varianceCombinations[currentIndex][j-1] + j1 := i1 + j + j2 := i2 + j - 1 + + v1 := varianceCombinations[j1] + v2 := varianceCombinations[j2] + variance + + if v1 >= v2 { + lowerClassLimits[j1] = lowerClassLimit + varianceCombinations[j1] = v2 } } } } - lowerClassLimits[l][1] = 1; - varianceCombinations[l][1] = variance; + index := mat2idx(l, 1, y) + lowerClassLimits[index] = 1 + varianceCombinations[index] = variance } // return the two matrices. for just providing breaks, only // 'lower_class_limits' is needed, but variances can be useful to @@ -207,8 +230,8 @@ func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) { // breaks is the second part of the jenks recipe: // take the calculated matrices and derive an array of n breaks. -func breaks(data []float64, lowerClassLimits [][]int, nClasses int) []float64 { - +func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 { + y := maxClasses + 1 classBoundaries := make([]float64, nClasses) // the calculation of classes will never include the lower bound, so we need to explicitly set it @@ -218,11 +241,19 @@ func breaks(data []float64, lowerClassLimits [][]int, nClasses int) []float64 { // the lowerClassLimits matrix is used as indexes into itself here: // the next value of `k` is obtained from . k := len(data) - 1 - for i := nClasses; i > 1; i -- { - boundaryIndex := lowerClassLimits[k][i] - 1 + for i := nClasses; i > 1; i-- { + boundaryIndex := lowerClassLimits[mat2idx(k, i, y)] - 1 classBoundaries[i-1] = data[boundaryIndex] k = boundaryIndex } return classBoundaries } + +func mat2len(x, y int) int { + return x * y +} + +func mat2idx(i, j, y int) int { + return (i * y) + j +} diff --git a/jenks_test.go b/jenks_test.go index 0a6c23a..718cd19 100644 --- a/jenks_test.go +++ b/jenks_test.go @@ -1,9 +1,8 @@ -package jenks_test +package jenks import ( "reflect" "testing" - "github.com/ThinkingLogic/jenks" ) func TestNaturalBreaks(t *testing.T) { @@ -17,13 +16,13 @@ func TestNaturalBreaks(t *testing.T) { want []float64 }{ {name: "two breaks", - args: args{nClasses: 2, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, + args: args{nClasses: 2, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, want: []float64{1, 21}}, {name: "three breaks", - args: args{nClasses: 3, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, + args: args{nClasses: 3, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, want: []float64{1, 12, 21}}, {name: "four breaks", - args: args{nClasses: 4, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, + args: args{nClasses: 4, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, want: []float64{1, 12, 21, 27}}, {name: "more breaks than unique values", args: args{nClasses: 4, data: []float64{1.1, 1.1, 1.1, 1.3, 1.3, 1.3, 1.2, 1.2, 1.2}}, @@ -43,7 +42,7 @@ func TestNaturalBreaks(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := jenks.NaturalBreaks(tt.args.data, tt.args.nClasses); !reflect.DeepEqual(got, tt.want) { + if got := NaturalBreaks(tt.args.data, tt.args.nClasses); !reflect.DeepEqual(got, tt.want) { t.Errorf("NaturalBreaks() = %v, want %v", got, tt.want) } }) @@ -87,7 +86,7 @@ func TestAllNaturalBreaks(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := jenks.AllNaturalBreaks(tt.args.data, tt.args.maxClasses); !reflect.DeepEqual(got, tt.want) { + if got := AllNaturalBreaks(tt.args.data, tt.args.maxClasses); !reflect.DeepEqual(got, tt.want) { t.Errorf("AllNaturalBreaks() = %v, want %v", got, tt.want) } }) @@ -96,8 +95,8 @@ func TestAllNaturalBreaks(t *testing.T) { func TestRound(t *testing.T) { type args struct { - data []float64 - breaks []float64 + data []float64 + breaks []float64 } tests := []struct { name string @@ -123,9 +122,17 @@ func TestRound(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := jenks.Round(tt.args.breaks, tt.args.data); !reflect.DeepEqual(got, tt.want) { + if got := Round(tt.args.breaks, tt.args.data); !reflect.DeepEqual(got, tt.want) { t.Errorf("Round() = %v, want %v", got, tt.want) } }) } } + +func BenchmarkNaturalBreaks(b *testing.B) { + data := []float64{28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2} + + for i := 0; i < b.N; i++ { + _ = NaturalBreaks(data, 10) + } +} From 95c23a7918231c43db212c00130dc1b7821db5c5 Mon Sep 17 00:00:00 2001 From: Achille Roussel Date: Fri, 30 Aug 2019 10:10:26 -0700 Subject: [PATCH 2/6] fix go.mod --- go.mod | 2 -- go.sum | 2 -- 2 files changed, 4 deletions(-) delete mode 100644 go.sum diff --git a/go.mod b/go.mod index d5f3c68..4fcdd93 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module github.com/segmentio/jenks go 1.12 - -require github.com/ThinkingLogic/jenks v1.1.1 diff --git a/go.sum b/go.sum deleted file mode 100644 index efee7a2..0000000 --- a/go.sum +++ /dev/null @@ -1,2 +0,0 @@ -github.com/ThinkingLogic/jenks v1.1.1 h1:VN9M1nKx0aCjm1T1bh/rm/xlO4RDmvfUznORhQdBFdk= -github.com/ThinkingLogic/jenks v1.1.1/go.mod h1:kjOGWk0lcN7icl/rkxkIEHnKlZIbbiunqanQ2hxBR+I= From 770fdb8ddea4fc21b8c293b11f93165659c59a13 Mon Sep 17 00:00:00 2001 From: Achille Roussel Date: Fri, 30 Aug 2019 16:06:11 -0700 Subject: [PATCH 3/6] add BestNaturalBreaks --- jenks.go | 110 ++++++++++++++++++++++++++++++++++++++++++-------- jenks_test.go | 58 ++++++++++++++++++++++++-- 2 files changed, 149 insertions(+), 19 deletions(-) diff --git a/jenks.go b/jenks.go index 3587019..be0f101 100644 --- a/jenks.go +++ b/jenks.go @@ -12,6 +12,36 @@ import ( // Based on the javascript implementation: https://gist.github.com/tmcw/4977508 // though that implementation has a bug - it has been fixed here. +func BestNaturalBreaks(data []float64, maxClasses int, minGvf float64) []float64 { + data = sortData(data) + + uniq := countUniqueValues(data) + if maxClasses >= uniq { + if uniq <= 2 { + return deduplicate(data) + } + maxClasses = uniq + } + + lowerClassLimits, _ := getMatrices(data, maxClasses) + var bestGvf float64 + var bestClass = 1 + + for nClasses := 2; nClasses <= maxClasses; nClasses++ { + gvf := goodnessOfVarianceFit(data, lowerClassLimits, maxClasses, nClasses) + + if gvf > bestGvf { + bestGvf, bestClass = gvf, nClasses + } + + if gvf >= minGvf { + break + } + } + + return breaks(data, lowerClassLimits, maxClasses, bestClass) +} + // NaturalBreaks returns the best nClasses natural breaks in the data, // using the Jenks natural breaks classification method (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization). // It tries to maximize the similarity of numbers in groups while maximizing the distance between the groups. @@ -99,10 +129,8 @@ func roundValue(initialValue float64, floor float64) float64 { // sortData checks to see if the data is sorted, returning it unchanged if so. Otherwise, it creates and sorts a copy. func sortData(data []float64) []float64 { if !sort.Float64sAreSorted(data) { - data2 := make([]float64, len(data)) - copy(data2, data) - sort.Float64s(data2) - data = data2 + data = copyFloat64s(data) + sort.Float64s(data) } return data } @@ -228,25 +256,31 @@ func getMatrices(data []float64, nClasses int) ([]int, []float64) { return lowerClassLimits, varianceCombinations } -// breaks is the second part of the jenks recipe: -// take the calculated matrices and derive an array of n breaks. -func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 { +func forEachBreak(data []float64, lowerClassLimits []int, maxClasses int, nClasses int, do func(class, boundary int)) { y := maxClasses + 1 - classBoundaries := make([]float64, nClasses) - - // the calculation of classes will never include the lower bound, so we need to explicitly set it - // the upper bound is not included in the result - but it would be the maximum value in the data - classBoundaries[0] = data[0] - // the lowerClassLimits matrix is used as indexes into itself here: // the next value of `k` is obtained from . k := len(data) - 1 + for i := nClasses; i > 1; i-- { - boundaryIndex := lowerClassLimits[mat2idx(k, i, y)] - 1 - classBoundaries[i-1] = data[boundaryIndex] - k = boundaryIndex + k = lowerClassLimits[mat2idx(k, i, y)] - 1 + do(i, k) } + // the calculation of classes will never include the lower bound, so we need to explicitly set it + // the upper bound is not included in the result - but it would be the maximum value in the data + do(1, 0) +} + +// breaks is the second part of the jenks recipe: +// take the calculated matrices and derive an array of n breaks. +func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 { + classBoundaries := make([]float64, nClasses) + + forEachBreak(data, lowerClassLimits, maxClasses, nClasses, func(class, boundary int) { + classBoundaries[class-1] = data[boundary] + }) + return classBoundaries } @@ -257,3 +291,47 @@ func mat2len(x, y int) int { func mat2idx(i, j, y int) int { return (i * y) + j } + +func copyFloat64s(data []float64) []float64 { + return append(make([]float64, 0, len(data)), data...) +} + +func mean(data []float64) float64 { + if len(data) == 0 { + return 0.0 + } + sum := 0.0 + for _, v := range data { + sum += v + } + return sum / float64(len(data)) +} + +func sumOfSquareDeviations(data []float64) float64 { + mean := mean(data) + sum := 0.0 + for _, v := range data { + diff := v - mean + sum += diff * diff + } + return sum +} + +func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) float64 { + boundaries := make([]int, nClasses) + + forEachBreak(data, lowerClassLimits, maxClasses, nClasses, func(class, boundary int) { + boundaries[class-1] = boundary + }) + + sdam := sumOfSquareDeviations(data) + sdcm := 0.0 + + for i, n := 0, len(boundaries)-1; i < n; i++ { + b1 := boundaries[i] + b2 := boundaries[i+1] + sdcm += sumOfSquareDeviations(data[b1:b2]) + } + + return (sdam - sdcm) / sdam +} diff --git a/jenks_test.go b/jenks_test.go index 718cd19..368bbb5 100644 --- a/jenks_test.go +++ b/jenks_test.go @@ -5,6 +5,50 @@ import ( "testing" ) +func TestBestNaturalBreaks(t *testing.T) { + type args struct { + data []float64 + nClasses int + } + tests := []struct { + name string + args args + want []float64 + }{ + {name: "two breaks", + args: args{nClasses: 2, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, + want: []float64{1, 21}}, + {name: "three breaks", + args: args{nClasses: 3, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, + want: []float64{1, 12, 21}}, + {name: "four breaks optimized to three", + args: args{nClasses: 4, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}}, + want: []float64{1, 12, 21}}, + {name: "more breaks than unique values", + args: args{nClasses: 4, data: []float64{1.1, 1.1, 1.1, 1.3, 1.3, 1.3, 1.2, 1.2, 1.2}}, + want: []float64{1.1, 1.2}}, + {name: "one unique value", + args: args{nClasses: 4, data: []float64{1, 1, 1, 1}}, + want: []float64{1}}, + {name: "two values, two breaks", + args: args{nClasses: 4, data: []float64{1, 2}}, + want: []float64{1, 2}}, + {name: "http://www.real-statistics.com/multivariate-statistics/cluster-analysis/jenks-natural-breaks#example1", + args: args{nClasses: 4, data: []float64{28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2}}, + want: []float64{28.9, 55.8, 69.4, 80.7}}, + {name: "http://www.real-statistics.com/multivariate-statistics/cluster-analysis/jenks-natural-breaks#example2", + args: args{nClasses: 4, data: []float64{5, 8, 9, 12, 15}}, + want: []float64{5, 8, 12}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := BestNaturalBreaks(tt.args.data, tt.args.nClasses, 0.9); !reflect.DeepEqual(got, tt.want) { + t.Errorf("NaturalBreaks() = %v, want %v", got, tt.want) + } + }) + } +} + func TestNaturalBreaks(t *testing.T) { type args struct { data []float64 @@ -129,10 +173,18 @@ func TestRound(t *testing.T) { } } -func BenchmarkNaturalBreaks(b *testing.B) { - data := []float64{28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2} +var benchmarkData = []float64{ + 28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2, +} +func BenchmarkBestNaturalBreaks(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = BestNaturalBreaks(benchmarkData, 10, 0.9) + } +} + +func BenchmarkNaturalBreaks(b *testing.B) { for i := 0; i < b.N; i++ { - _ = NaturalBreaks(data, 10) + _ = NaturalBreaks(benchmarkData, 10) } } From f36467d66b6cc10166a80dacff399b311f659f0c Mon Sep 17 00:00:00 2001 From: Achille Roussel Date: Fri, 30 Aug 2019 18:01:24 -0700 Subject: [PATCH 4/6] add sanity checks --- jenks.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jenks.go b/jenks.go index be0f101..972dff8 100644 --- a/jenks.go +++ b/jenks.go @@ -330,6 +330,12 @@ func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses in for i, n := 0, len(boundaries)-1; i < n; i++ { b1 := boundaries[i] b2 := boundaries[i+1] + if b1 < 0 { + panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d", b1, len(data), nClasses, maxClasses)) + } + if b2 > len(data) { + panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b2, len(data), len(data), nClasses, maxClasses)) + } sdcm += sumOfSquareDeviations(data[b1:b2]) } From 19a75b0ce4662c79b50efae316b7ac5a723e1e78 Mon Sep 17 00:00:00 2001 From: Achille Roussel Date: Fri, 30 Aug 2019 18:20:38 -0700 Subject: [PATCH 5/6] add more sanity checks --- jenks.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jenks.go b/jenks.go index 972dff8..273390e 100644 --- a/jenks.go +++ b/jenks.go @@ -333,6 +333,9 @@ func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses in if b1 < 0 { panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d", b1, len(data), nClasses, maxClasses)) } + if b1 > len(data) { + panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b1, len(data), len(data), nClasses, maxClasses)) + } if b2 > len(data) { panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b2, len(data), len(data), nClasses, maxClasses)) } From db8c8c42b57c2540c82eacc41a8e886b81342000 Mon Sep 17 00:00:00 2001 From: Achille Roussel Date: Fri, 30 Aug 2019 18:28:34 -0700 Subject: [PATCH 6/6] more sanity checks --- jenks.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/jenks.go b/jenks.go index 273390e..93905ec 100644 --- a/jenks.go +++ b/jenks.go @@ -331,13 +331,16 @@ func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses in b1 := boundaries[i] b2 := boundaries[i+1] if b1 < 0 { - panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d", b1, len(data), nClasses, maxClasses)) + panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), nClasses, maxClasses, i, boundaries)) } if b1 > len(data) { - panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b1, len(data), len(data), nClasses, maxClasses)) + panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), len(data), nClasses, maxClasses, i, boundaries)) } if b2 > len(data) { - panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b2, len(data), len(data), nClasses, maxClasses)) + panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b2, len(data), len(data), nClasses, maxClasses, i, boundaries)) + } + if b1 > b2 { + panic(fmt.Errorf("lower bound greater than upper bound: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, b2, len(data), nClasses, maxClasses, i, boundaries)) } sdcm += sumOfSquareDeviations(data[b1:b2]) }