From 6c6358e6d4f71f7eb3eff67c2bbcf67ef94621d2 Mon Sep 17 00:00:00 2001
From: Achille Roussel <achille@segment.com>
Date: Fri, 30 Aug 2019 10:08:26 -0700
Subject: [PATCH 1/6] optimize implementation

---
 go.mod        |   5 +++
 go.sum        |   2 +
 jenks.go      | 111 ++++++++++++++++++++++++++++++++------------------
 jenks_test.go |  27 +++++++-----
 4 files changed, 95 insertions(+), 50 deletions(-)
 create mode 100644 go.mod
 create mode 100644 go.sum

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..d5f3c68
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module github.com/segmentio/jenks
+
+go 1.12
+
+require github.com/ThinkingLogic/jenks v1.1.1
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..efee7a2
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+github.com/ThinkingLogic/jenks v1.1.1 h1:VN9M1nKx0aCjm1T1bh/rm/xlO4RDmvfUznORhQdBFdk=
+github.com/ThinkingLogic/jenks v1.1.1/go.mod h1:kjOGWk0lcN7icl/rkxkIEHnKlZIbbiunqanQ2hxBR+I=
diff --git a/jenks.go b/jenks.go
index d5f2dd0..3587019 100644
--- a/jenks.go
+++ b/jenks.go
@@ -1,11 +1,11 @@
 package jenks
 
 import (
+	"fmt"
 	"math"
 	"sort"
-	"fmt"
-	"strings"
 	"strconv"
+	"strings"
 )
 
 // Jenks natural breaks optimization (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
@@ -20,16 +20,15 @@ func NaturalBreaks(data []float64, nClasses int) []float64 {
 	data = sortData(data)
 
 	// sanity check
-	uniq := deduplicate(data)
-	if nClasses >= len(uniq) {
-		return uniq
+	if nClasses >= countUniqueValues(data) {
+		return deduplicate(data)
 	}
 
 	// get our basic matrices (we only need lower class limits here)
 	lowerClassLimits, _ := getMatrices(data, nClasses)
 
 	// extract nClasses out of the computed matrices
-	return breaks(data, lowerClassLimits, nClasses)
+	return breaks(data, lowerClassLimits, nClasses, nClasses)
 }
 
 // AllNaturalBreaks finds all natural breaks in the data, for every set of breaks between 2 breaks and maxClasses.
@@ -40,9 +39,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 {
 	data = sortData(data)
 
 	// sanity check
-	uniq := deduplicate(data)
-	if maxClasses > len(uniq) {
-		maxClasses = len(uniq)
+	uniq := countUniqueValues(data)
+	if maxClasses > uniq {
+		maxClasses = uniq
 	}
 
 	// get our basic matrices (we only need lower class limits here)
@@ -51,9 +50,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 {
 	// extract nClasses out of the computed matrices
 	allBreaks := [][]float64{}
 	for i := 2; i <= maxClasses; i++ {
-		nClasses := breaks(data, lowerClassLimits, i)
-		if i == len(uniq) {
-			nClasses = uniq
+		nClasses := breaks(data, lowerClassLimits, maxClasses, i)
+		if i == uniq {
+			nClasses = deduplicate(data)
 		}
 		allBreaks = append(allBreaks, nClasses)
 	}
@@ -71,7 +70,7 @@ func Round(breaks []float64, data []float64) []float64 {
 		dataIdx := sort.SearchFloat64s(data, breaks[breakIdx])
 		var floor float64
 		if dataIdx == 0 { // make sure we can't go below breaks[i] - (breaks[i+1]-breaks[i])
-			floor = data[0] - (breaks[breakIdx+1]-breaks[breakIdx])
+			floor = data[0] - (breaks[breakIdx+1] - breaks[breakIdx])
 		} else {
 			floor = data[dataIdx-1]
 		}
@@ -84,7 +83,7 @@ func Round(breaks []float64, data []float64) []float64 {
 func roundValue(initialValue float64, floor float64) float64 {
 	b := []byte(strings.Trim(fmt.Sprintf("%f", initialValue), "0"))
 	value := initialValue
-	for i := len(b)-1; i >= 0; i-- {
+	for i := len(b) - 1; i >= 0; i-- {
 		if b[i] != '.' {
 			b[i] = '0'
 			round, e := strconv.ParseFloat(string(b), 64)
@@ -121,38 +120,52 @@ func deduplicate(data []float64) []float64 {
 	return uniq
 }
 
+// countUniqueValues returns the number of distinct values in data, which must
+// already be sorted so that equal values are adjacent (every NaN is counted
+// separately). This function is used as an optimization to avoid calling
+// deduplicate for the common case where there are more values than classes.
+func countUniqueValues(data []float64) int {
+	n := 0
+	x := math.NaN()
+	for _, v := range data {
+		if v != x {
+			x = v
+			n++
+		}
+	}
+	return n
+}
+
 // getMatrices Computes the matrices required for Jenks breaks.
 // These matrices can be used for any classing of data with 'classes <= n_classes'
-func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
+func getMatrices(data []float64, nClasses int) ([]int, []float64) {
+	x := len(data) + 1
+	y := nClasses + 1
+	n := mat2len(x, y)
 
 	// in the original implementation, these matrices are referred to
 	// as 'LC' and 'OP'
 	//
 	// * lowerClassLimits (LC): optimal lower class limits
 	// * variance_combinations (OP): optimal variance combinations for all classes
-	lowerClassLimits := make([][]int, len(data)+1)
-	varianceCombinations := make([][]float64, len(data)+1)
+	lowerClassLimits := make([]int, n)
+	varianceCombinations := make([]float64, n)
 
 	// the variance, as computed at each step in the calculation
 	variance := 0.0
 
-	// Initialize and fill each matrix with zeroes
-	for i := 0; i < len(data)+1; i++ {
-		lowerClassLimits[i] = make([]int, nClasses+1)
-		varianceCombinations[i] = make([]float64, nClasses+1)
-	}
+	for i := 1; i < y; i++ {
+		index := mat2idx(1, i, y)
+		lowerClassLimits[index] = 1
+		varianceCombinations[index] = 0
 
-	for i := 1; i < nClasses+1; i++ {
-		lowerClassLimits[1][i] = 1
-		varianceCombinations[1][i] = 0
-		// in the original implementation, 'Infinity' is used but
-		// math.MaxFloat64 will do.
-		for j := 2; j < len(data)+1; j++ {
-			varianceCombinations[j][i] = math.MaxFloat64
+		for j := 2; j < x; j++ {
+			varianceCombinations[mat2idx(j, i, y)] = math.Inf(+1)
 		}
 	}
 
-	for l := 2; l < len(data)+1; l++ {
+	for l := 2; l < x; l++ {
+		i1 := mat2idx(l, 0, y) // keep multiplication out of the inner loops
 
 		// sum was 'SZ' originally.
 		// this is the sum of the values seen thus far when calculating variance.
@@ -183,21 +196,31 @@ func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
 			// of samples.
 			variance = sumSquares - (sum*sum)/w
 			if currentIndex != 0 {
-				for j := 2; j < nClasses+1; j++ {
+				// keep multiplication out of the inner loop
+				i2 := mat2idx(currentIndex, 0, y)
+
+				for j := 2; j < y; j++ {
 					// if adding this element to an existing class
 					// will increase its variance beyond the limit, break
 					// the class at this point, setting the lower_class_limit
 					// at this point.
-					if varianceCombinations[l][j] >= (variance + varianceCombinations[currentIndex][j-1]) {
-						lowerClassLimits[l][j] = lowerClassLimit
-						varianceCombinations[l][j] = variance + varianceCombinations[currentIndex][j-1]
+					j1 := i1 + j
+					j2 := i2 + j - 1
+
+					v1 := varianceCombinations[j1]
+					v2 := varianceCombinations[j2] + variance
+
+					if v1 >= v2 {
+						lowerClassLimits[j1] = lowerClassLimit
+						varianceCombinations[j1] = v2
 					}
 				}
 			}
 		}
 
-		lowerClassLimits[l][1] = 1;
-		varianceCombinations[l][1] = variance;
+		index := mat2idx(l, 1, y)
+		lowerClassLimits[index] = 1
+		varianceCombinations[index] = variance
 	}
 	// return the two matrices. for just providing breaks, only
 	// 'lower_class_limits' is needed, but variances can be useful to
@@ -207,8 +230,8 @@ func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
 
 // breaks is the second part of the jenks recipe:
 // take the calculated matrices and derive an array of n breaks.
-func breaks(data []float64, lowerClassLimits [][]int, nClasses int) []float64 {
-
+func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 {
+	y := maxClasses + 1
 	classBoundaries := make([]float64, nClasses)
 
 	// the calculation of classes will never include the lower bound, so we need to explicitly set it
@@ -218,11 +241,19 @@ func breaks(data []float64, lowerClassLimits [][]int, nClasses int) []float64 {
 	// the lowerClassLimits matrix is used as indexes into itself here:
 	// the next value of `k` is obtained from .
 	k := len(data) - 1
-	for i := nClasses; i > 1; i -- {
-		boundaryIndex := lowerClassLimits[k][i] - 1
+	for i := nClasses; i > 1; i-- {
+		boundaryIndex := lowerClassLimits[mat2idx(k, i, y)] - 1
 		classBoundaries[i-1] = data[boundaryIndex]
 		k = boundaryIndex
 	}
 
 	return classBoundaries
 }
+
+func mat2len(x, y int) int {
+	return x * y
+}
+
+func mat2idx(i, j, y int) int {
+	return (i * y) + j
+}
diff --git a/jenks_test.go b/jenks_test.go
index 0a6c23a..718cd19 100644
--- a/jenks_test.go
+++ b/jenks_test.go
@@ -1,9 +1,8 @@
-package jenks_test
+package jenks
 
 import (
 	"reflect"
 	"testing"
-	"github.com/ThinkingLogic/jenks"
 )
 
 func TestNaturalBreaks(t *testing.T) {
@@ -17,13 +16,13 @@ func TestNaturalBreaks(t *testing.T) {
 		want []float64
 	}{
 		{name: "two breaks",
-			args: args{nClasses: 2, data: []float64{1, 2, 3,   12, 13, 14,    21, 22, 23,    27, 28, 29}},
+			args: args{nClasses: 2, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}},
 			want: []float64{1, 21}},
 		{name: "three breaks",
-			args: args{nClasses: 3, data: []float64{1, 2, 3,   12, 13, 14,    21, 22, 23,    27, 28, 29}},
+			args: args{nClasses: 3, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}},
 			want: []float64{1, 12, 21}},
 		{name: "four breaks",
-			args: args{nClasses: 4, data: []float64{1, 2, 3,   12, 13, 14,    21, 22, 23,    27, 28, 29}},
+			args: args{nClasses: 4, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}},
 			want: []float64{1, 12, 21, 27}},
 		{name: "more breaks than unique values",
 			args: args{nClasses: 4, data: []float64{1.1, 1.1, 1.1, 1.3, 1.3, 1.3, 1.2, 1.2, 1.2}},
@@ -43,7 +42,7 @@ func TestNaturalBreaks(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			if got := jenks.NaturalBreaks(tt.args.data, tt.args.nClasses); !reflect.DeepEqual(got, tt.want) {
+			if got := NaturalBreaks(tt.args.data, tt.args.nClasses); !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("NaturalBreaks() = %v, want %v", got, tt.want)
 			}
 		})
@@ -87,7 +86,7 @@ func TestAllNaturalBreaks(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			if got := jenks.AllNaturalBreaks(tt.args.data, tt.args.maxClasses); !reflect.DeepEqual(got, tt.want) {
+			if got := AllNaturalBreaks(tt.args.data, tt.args.maxClasses); !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("AllNaturalBreaks() = %v, want %v", got, tt.want)
 			}
 		})
@@ -96,8 +95,8 @@ func TestAllNaturalBreaks(t *testing.T) {
 
 func TestRound(t *testing.T) {
 	type args struct {
-		data       []float64
-		breaks     []float64
+		data   []float64
+		breaks []float64
 	}
 	tests := []struct {
 		name string
@@ -123,9 +122,17 @@ func TestRound(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			if got := jenks.Round(tt.args.breaks, tt.args.data); !reflect.DeepEqual(got, tt.want) {
+			if got := Round(tt.args.breaks, tt.args.data); !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("Round() = %v, want %v", got, tt.want)
 			}
 		})
 	}
 }
+
+func BenchmarkNaturalBreaks(b *testing.B) {
+	data := []float64{28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2}
+
+	for i := 0; i < b.N; i++ {
+		_ = NaturalBreaks(data, 10)
+	}
+}

From 95c23a7918231c43db212c00130dc1b7821db5c5 Mon Sep 17 00:00:00 2001
From: Achille Roussel <achille@segment.com>
Date: Fri, 30 Aug 2019 10:10:26 -0700
Subject: [PATCH 2/6] fix go.mod

---
 go.mod | 2 --
 go.sum | 2 --
 2 files changed, 4 deletions(-)
 delete mode 100644 go.sum

diff --git a/go.mod b/go.mod
index d5f3c68..4fcdd93 100644
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,3 @@
 module github.com/segmentio/jenks
 
 go 1.12
-
-require github.com/ThinkingLogic/jenks v1.1.1
diff --git a/go.sum b/go.sum
deleted file mode 100644
index efee7a2..0000000
--- a/go.sum
+++ /dev/null
@@ -1,2 +0,0 @@
-github.com/ThinkingLogic/jenks v1.1.1 h1:VN9M1nKx0aCjm1T1bh/rm/xlO4RDmvfUznORhQdBFdk=
-github.com/ThinkingLogic/jenks v1.1.1/go.mod h1:kjOGWk0lcN7icl/rkxkIEHnKlZIbbiunqanQ2hxBR+I=

From 770fdb8ddea4fc21b8c293b11f93165659c59a13 Mon Sep 17 00:00:00 2001
From: Achille Roussel <achille@segment.com>
Date: Fri, 30 Aug 2019 16:06:11 -0700
Subject: [PATCH 3/6] add BestNaturalBreaks

---
 jenks.go      | 110 ++++++++++++++++++++++++++++++++++++++++++--------
 jenks_test.go |  58 ++++++++++++++++++++++++--
 2 files changed, 149 insertions(+), 19 deletions(-)

diff --git a/jenks.go b/jenks.go
index 3587019..be0f101 100644
--- a/jenks.go
+++ b/jenks.go
@@ -12,6 +12,36 @@ import (
 // Based on the javascript implementation: https://gist.github.com/tmcw/4977508
 // though that implementation has a bug - it has been fixed here.
 
+func BestNaturalBreaks(data []float64, maxClasses int, minGvf float64) []float64 {
+	data = sortData(data)
+
+	uniq := countUniqueValues(data)
+	if maxClasses >= uniq {
+		if uniq <= 2 {
+			return deduplicate(data)
+		}
+		maxClasses = uniq
+	}
+
+	lowerClassLimits, _ := getMatrices(data, maxClasses)
+	var bestGvf float64
+	var bestClass = 1
+
+	for nClasses := 2; nClasses <= maxClasses; nClasses++ {
+		gvf := goodnessOfVarianceFit(data, lowerClassLimits, maxClasses, nClasses)
+
+		if gvf > bestGvf {
+			bestGvf, bestClass = gvf, nClasses
+		}
+
+		if gvf >= minGvf {
+			break
+		}
+	}
+
+	return breaks(data, lowerClassLimits, maxClasses, bestClass)
+}
+
 // NaturalBreaks returns the best nClasses natural breaks in the data,
 // using the Jenks natural breaks classification method (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization).
 // It tries to maximize the similarity of numbers in groups while maximizing the distance between the groups.
@@ -99,10 +129,8 @@ func roundValue(initialValue float64, floor float64) float64 {
 // sortData checks to see if the data is sorted, returning it unchanged if so. Otherwise, it creates and sorts a copy.
 func sortData(data []float64) []float64 {
 	if !sort.Float64sAreSorted(data) {
-		data2 := make([]float64, len(data))
-		copy(data2, data)
-		sort.Float64s(data2)
-		data = data2
+		data = copyFloat64s(data)
+		sort.Float64s(data)
 	}
 	return data
 }
@@ -228,25 +256,31 @@ func getMatrices(data []float64, nClasses int) ([]int, []float64) {
 	return lowerClassLimits, varianceCombinations
 }
 
-// breaks is the second part of the jenks recipe:
-// take the calculated matrices and derive an array of n breaks.
-func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 {
+func forEachBreak(data []float64, lowerClassLimits []int, maxClasses int, nClasses int, do func(class, boundary int)) {
 	y := maxClasses + 1
-	classBoundaries := make([]float64, nClasses)
-
-	// the calculation of classes will never include the lower bound, so we need to explicitly set it
-	// the upper bound is not included in the result - but it would be the maximum value in the data
-	classBoundaries[0] = data[0]
-
 	// the lowerClassLimits matrix is used as indexes into itself here:
 	// the next value of `k` is obtained from .
 	k := len(data) - 1
+
 	for i := nClasses; i > 1; i-- {
-		boundaryIndex := lowerClassLimits[mat2idx(k, i, y)] - 1
-		classBoundaries[i-1] = data[boundaryIndex]
-		k = boundaryIndex
+		k = lowerClassLimits[mat2idx(k, i, y)] - 1
+		do(i, k)
 	}
 
+	// the calculation of classes will never include the lower bound, so we need to explicitly set it
+	// the upper bound is not included in the result - but it would be the maximum value in the data
+	do(1, 0)
+}
+
+// breaks is the second part of the jenks recipe:
+// take the calculated matrices and derive an array of n breaks.
+func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 {
+	classBoundaries := make([]float64, nClasses)
+
+	forEachBreak(data, lowerClassLimits, maxClasses, nClasses, func(class, boundary int) {
+		classBoundaries[class-1] = data[boundary]
+	})
+
 	return classBoundaries
 }
 
@@ -257,3 +291,47 @@ func mat2len(x, y int) int {
 func mat2idx(i, j, y int) int {
 	return (i * y) + j
 }
+
+func copyFloat64s(data []float64) []float64 {
+	return append(make([]float64, 0, len(data)), data...)
+}
+
+func mean(data []float64) float64 {
+	if len(data) == 0 {
+		return 0.0
+	}
+	sum := 0.0
+	for _, v := range data {
+		sum += v
+	}
+	return sum / float64(len(data))
+}
+
+func sumOfSquareDeviations(data []float64) float64 {
+	mean := mean(data)
+	sum := 0.0
+	for _, v := range data {
+		diff := v - mean
+		sum += diff * diff
+	}
+	return sum
+}
+
+func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) float64 {
+	boundaries := make([]int, nClasses)
+
+	forEachBreak(data, lowerClassLimits, maxClasses, nClasses, func(class, boundary int) {
+		boundaries[class-1] = boundary
+	})
+
+	sdam := sumOfSquareDeviations(data)
+	sdcm := 0.0
+
+	for i, n := 0, len(boundaries)-1; i < n; i++ {
+		b1 := boundaries[i]
+		b2 := boundaries[i+1]
+		sdcm += sumOfSquareDeviations(data[b1:b2])
+	}
+
+	return (sdam - sdcm) / sdam
+}
diff --git a/jenks_test.go b/jenks_test.go
index 718cd19..368bbb5 100644
--- a/jenks_test.go
+++ b/jenks_test.go
@@ -5,6 +5,50 @@ import (
 	"testing"
 )
 
+func TestBestNaturalBreaks(t *testing.T) {
+	type args struct {
+		data     []float64
+		nClasses int
+	}
+	tests := []struct {
+		name string
+		args args
+		want []float64
+	}{
+		{name: "two breaks",
+			args: args{nClasses: 2, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}},
+			want: []float64{1, 21}},
+		{name: "three breaks",
+			args: args{nClasses: 3, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}},
+			want: []float64{1, 12, 21}},
+		{name: "four breaks optimized to three",
+			args: args{nClasses: 4, data: []float64{1, 2, 3, 12, 13, 14, 21, 22, 23, 27, 28, 29}},
+			want: []float64{1, 12, 21}},
+		{name: "more breaks than unique values",
+			args: args{nClasses: 4, data: []float64{1.1, 1.1, 1.1, 1.3, 1.3, 1.3, 1.2, 1.2, 1.2}},
+			want: []float64{1.1, 1.2}},
+		{name: "one unique value",
+			args: args{nClasses: 4, data: []float64{1, 1, 1, 1}},
+			want: []float64{1}},
+		{name: "two values, two breaks",
+			args: args{nClasses: 4, data: []float64{1, 2}},
+			want: []float64{1, 2}},
+		{name: "http://www.real-statistics.com/multivariate-statistics/cluster-analysis/jenks-natural-breaks#example1",
+			args: args{nClasses: 4, data: []float64{28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2}},
+			want: []float64{28.9, 55.8, 69.4, 80.7}},
+		{name: "http://www.real-statistics.com/multivariate-statistics/cluster-analysis/jenks-natural-breaks#example2",
+			args: args{nClasses: 4, data: []float64{5, 8, 9, 12, 15}},
+			want: []float64{5, 8, 12}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := BestNaturalBreaks(tt.args.data, tt.args.nClasses, 0.9); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("BestNaturalBreaks() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
 func TestNaturalBreaks(t *testing.T) {
 	type args struct {
 		data     []float64
@@ -129,10 +173,18 @@ func TestRound(t *testing.T) {
 	}
 }
 
-func BenchmarkNaturalBreaks(b *testing.B) {
-	data := []float64{28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2}
+var benchmarkData = []float64{
+	28.9, 33.5, 36.1, 38.6, 40.7, 42.7, 43.6, 45.8, 48.2, 48.6, 49.0, 51.0, 52.1, 52.2, 52.2, 52.4, 53.6, 54.2, 55.8, 55.8, 56.4, 56.8, 56.8, 57.7, 57.9, 58.2, 58.3, 58.4, 60.1, 60.1, 60.2, 61.1, 61.4, 61.9, 62.1, 62.5, 62.7, 63.1, 63.6, 64.2, 64.3, 64.4, 64.6, 64.7, 64.7, 64.8, 65.4, 65.8, 65.9, 66.2, 66.4, 66.6, 66.8, 67.0, 67.0, 67.1, 67.2, 67.2, 67.4, 68.2, 68.2, 68.3, 69.4, 69.5, 69.8, 70.2, 70.3, 70.5, 70.6, 71.2, 71.2, 71.2, 71.2, 71.8, 71.9, 72.0, 72.0, 72.0, 72.3, 72.5, 72.6, 73.0, 73.0, 73.0, 73.0, 73.2, 73.4, 73.4, 73.4, 74.0, 74.2, 74.4, 74.4, 74.9, 74.9, 75.4, 75.6, 76.0, 76.3, 76.3, 76.3, 76.4, 76.7, 77.2, 77.3, 77.6, 77.7, 78.3, 78.5, 78.5, 78.6, 78.7, 78.9, 79.2, 79.2, 79.2, 79.8, 79.8, 79.9, 80.7, 80.7, 81.2, 81.4, 81.5, 81.8, 82.0, 82.1, 82.2, 82.3, 82.4, 82.8, 83.0, 83.1, 83.3, 83.4, 83.6, 83.8, 83.8, 84.0, 84.2, 85.2, 85.4, 85.8, 86.1, 86.3, 87.1, 87.5, 87.7, 87.7, 87.8, 88.3, 88.9, 89.3, 90.3, 93.1, 94.2, 94.7, 95.7, 97.8, 99.2,
+}
 
+func BenchmarkBestNaturalBreaks(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		_ = BestNaturalBreaks(benchmarkData, 10, 0.9)
+	}
+}
+
+func BenchmarkNaturalBreaks(b *testing.B) {
 	for i := 0; i < b.N; i++ {
-		_ = NaturalBreaks(data, 10)
+		_ = NaturalBreaks(benchmarkData, 10)
 	}
 }

From f36467d66b6cc10166a80dacff399b311f659f0c Mon Sep 17 00:00:00 2001
From: Achille Roussel <achille@segment.com>
Date: Fri, 30 Aug 2019 18:01:24 -0700
Subject: [PATCH 4/6] add sanity checks

---
 jenks.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/jenks.go b/jenks.go
index be0f101..972dff8 100644
--- a/jenks.go
+++ b/jenks.go
@@ -330,6 +330,12 @@ func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses in
 	for i, n := 0, len(boundaries)-1; i < n; i++ {
 		b1 := boundaries[i]
 		b2 := boundaries[i+1]
+		if b1 < 0 {
+			panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d", b1, len(data), nClasses, maxClasses))
+		}
+		if b2 > len(data) {
+			panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b2, len(data), len(data), nClasses, maxClasses))
+		}
 		sdcm += sumOfSquareDeviations(data[b1:b2])
 	}
 

From 19a75b0ce4662c79b50efae316b7ac5a723e1e78 Mon Sep 17 00:00:00 2001
From: Achille Roussel <achille@segment.com>
Date: Fri, 30 Aug 2019 18:20:38 -0700
Subject: [PATCH 5/6] add more sanity checks

---
 jenks.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/jenks.go b/jenks.go
index 972dff8..273390e 100644
--- a/jenks.go
+++ b/jenks.go
@@ -333,6 +333,9 @@ func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses in
 		if b1 < 0 {
 			panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d", b1, len(data), nClasses, maxClasses))
 		}
+		if b1 > len(data) {
+			panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b1, len(data), len(data), nClasses, maxClasses))
+		}
 		if b2 > len(data) {
 			panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b2, len(data), len(data), nClasses, maxClasses))
 		}

From db8c8c42b57c2540c82eacc41a8e886b81342000 Mon Sep 17 00:00:00 2001
From: Achille Roussel <achille@segment.com>
Date: Fri, 30 Aug 2019 18:28:34 -0700
Subject: [PATCH 6/6] more sanity checks

---
 jenks.go | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/jenks.go b/jenks.go
index 273390e..93905ec 100644
--- a/jenks.go
+++ b/jenks.go
@@ -331,13 +331,16 @@ func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses in
 		b1 := boundaries[i]
 		b2 := boundaries[i+1]
 		if b1 < 0 {
-			panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d", b1, len(data), nClasses, maxClasses))
+			panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), nClasses, maxClasses, i, boundaries))
 		}
 		if b1 > len(data) {
-			panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b1, len(data), len(data), nClasses, maxClasses))
+			panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), len(data), nClasses, maxClasses, i, boundaries))
 		}
 		if b2 > len(data) {
-			panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d", b2, len(data), len(data), nClasses, maxClasses))
+			panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b2, len(data), len(data), nClasses, maxClasses, i, boundaries))
+		}
+		if b1 > b2 {
+			panic(fmt.Errorf("lower bound greater than upper bound: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, b2, len(data), nClasses, maxClasses, i, boundaries))
 		}
 		sdcm += sumOfSquareDeviations(data[b1:b2])
 	}