Skip to content

Commit

Permalink
Merge db8c8c4 into 6a89f80
Browse files Browse the repository at this point in the history
  • Loading branch information
achille-roussel committed Aug 31, 2019
2 parents 6a89f80 + db8c8c4 commit c26f879
Show file tree
Hide file tree
Showing 3 changed files with 246 additions and 63 deletions.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/segmentio/jenks

go 1.12
227 changes: 174 additions & 53 deletions jenks.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,47 @@
package jenks

import (
"fmt"
"math"
"sort"
"fmt"
"strings"
"strconv"
"strings"
)

// Jenks natural breaks optimization (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
// Based on the javascript implementation: https://gist.github.com/tmcw/4977508
// though that implementation has a bug - it has been fixed here.

// BestNaturalBreaks searches for the smallest number of classes (up to
// maxClasses) whose goodness of variance fit reaches minGvf, and returns the
// natural breaks computed for that class count. If no class count reaches
// minGvf, the breaks for the best fit encountered are returned.
func BestNaturalBreaks(data []float64, maxClasses int, minGvf float64) []float64 {
	data = sortData(data)

	// With fewer unique values than classes the classification is trivial.
	if uniq := countUniqueValues(data); maxClasses >= uniq {
		if uniq <= 2 {
			return deduplicate(data)
		}
		maxClasses = uniq
	}

	lowerClassLimits, _ := getMatrices(data, maxClasses)
	bestClass, bestGvf := 1, 0.0

	for candidate := 2; candidate <= maxClasses; candidate++ {
		gvf := goodnessOfVarianceFit(data, lowerClassLimits, maxClasses, candidate)

		if gvf > bestGvf {
			bestGvf, bestClass = gvf, candidate
		}

		// Stop as soon as the fit is good enough; fewer classes is better.
		if gvf >= minGvf {
			break
		}
	}

	return breaks(data, lowerClassLimits, maxClasses, bestClass)
}

// NaturalBreaks returns the best nClasses natural breaks in the data,
// using the Jenks natural breaks classification method (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization).
// It tries to maximize the similarity of numbers in groups while maximizing the distance between the groups.
Expand All @@ -20,16 +50,15 @@ func NaturalBreaks(data []float64, nClasses int) []float64 {
data = sortData(data)

// sanity check
uniq := deduplicate(data)
if nClasses >= len(uniq) {
return uniq
if nClasses >= countUniqueValues(data) {
return deduplicate(data)
}

// get our basic matrices (we only need lower class limits here)
lowerClassLimits, _ := getMatrices(data, nClasses)

// extract nClasses out of the computed matrices
return breaks(data, lowerClassLimits, nClasses)
return breaks(data, lowerClassLimits, nClasses, nClasses)
}

// AllNaturalBreaks finds all natural breaks in the data, for every set of breaks between 2 breaks and maxClasses.
Expand All @@ -40,9 +69,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 {
data = sortData(data)

// sanity check
uniq := deduplicate(data)
if maxClasses > len(uniq) {
maxClasses = len(uniq)
uniq := countUniqueValues(data)
if maxClasses > uniq {
maxClasses = uniq
}

// get our basic matrices (we only need lower class limits here)
Expand All @@ -51,9 +80,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 {
// extract nClasses out of the computed matrices
allBreaks := [][]float64{}
for i := 2; i <= maxClasses; i++ {
nClasses := breaks(data, lowerClassLimits, i)
if i == len(uniq) {
nClasses = uniq
nClasses := breaks(data, lowerClassLimits, maxClasses, i)
if i == uniq {
nClasses = deduplicate(data)
}
allBreaks = append(allBreaks, nClasses)
}
Expand All @@ -71,7 +100,7 @@ func Round(breaks []float64, data []float64) []float64 {
dataIdx := sort.SearchFloat64s(data, breaks[breakIdx])
var floor float64
if dataIdx == 0 { // make sure we can't go below breaks[i] - (breaks[i+1]-breaks[i])
floor = data[0] - (breaks[breakIdx+1]-breaks[breakIdx])
floor = data[0] - (breaks[breakIdx+1] - breaks[breakIdx])
} else {
floor = data[dataIdx-1]
}
Expand All @@ -84,7 +113,7 @@ func Round(breaks []float64, data []float64) []float64 {
func roundValue(initialValue float64, floor float64) float64 {
b := []byte(strings.Trim(fmt.Sprintf("%f", initialValue), "0"))
value := initialValue
for i := len(b)-1; i >= 0; i-- {
for i := len(b) - 1; i >= 0; i-- {
if b[i] != '.' {
b[i] = '0'
round, e := strconv.ParseFloat(string(b), 64)
Expand All @@ -100,10 +129,8 @@ func roundValue(initialValue float64, floor float64) float64 {
// sortData checks to see if the data is sorted, returning it unchanged if so. Otherwise, it creates and sorts a copy.
func sortData(data []float64) []float64 {
if !sort.Float64sAreSorted(data) {
data2 := make([]float64, len(data))
copy(data2, data)
sort.Float64s(data2)
data = data2
data = copyFloat64s(data)
sort.Float64s(data)
}
return data
}
Expand All @@ -121,38 +148,52 @@ func deduplicate(data []float64) []float64 {
return uniq
}

// countUniqueValues reports how many distinct values appear in the sorted
// slice of data points. It exists as an optimization so callers can avoid the
// allocation performed by deduplicate when only the count is needed.
func countUniqueValues(data []float64) int {
	count := 0
	// NaN compares unequal to every value, including itself, so the first
	// element is always counted.
	prev := math.NaN()
	for _, value := range data {
		if value != prev {
			prev = value
			count++
		}
	}
	return count
}

// getMatrices Computes the matrices required for Jenks breaks.
// These matrices can be used for any classing of data with 'classes <= n_classes'
func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
func getMatrices(data []float64, nClasses int) ([]int, []float64) {
x := len(data) + 1
y := nClasses + 1
n := mat2len(x, y)

// in the original implementation, these matrices are referred to
// as 'LC' and 'OP'
//
// * lowerClassLimits (LC): optimal lower class limits
// * variance_combinations (OP): optimal variance combinations for all classes
lowerClassLimits := make([][]int, len(data)+1)
varianceCombinations := make([][]float64, len(data)+1)
lowerClassLimits := make([]int, n)
varianceCombinations := make([]float64, n)

// the variance, as computed at each step in the calculation
variance := 0.0

// Initialize and fill each matrix with zeroes
for i := 0; i < len(data)+1; i++ {
lowerClassLimits[i] = make([]int, nClasses+1)
varianceCombinations[i] = make([]float64, nClasses+1)
}
for i := 1; i < y; i++ {
index := mat2idx(1, i, y)
lowerClassLimits[index] = 1
varianceCombinations[index] = 0

for i := 1; i < nClasses+1; i++ {
lowerClassLimits[1][i] = 1
varianceCombinations[1][i] = 0
// in the original implementation, 'Infinity' is used but
// math.MaxFloat64 will do.
for j := 2; j < len(data)+1; j++ {
varianceCombinations[j][i] = math.MaxFloat64
for j := 2; j < x; j++ {
varianceCombinations[mat2idx(j, i, y)] = math.Inf(+1)
}
}

for l := 2; l < len(data)+1; l++ {
for l := 2; l < x; l++ {
i1 := mat2idx(l, 0, y) // keep multiplication out of the inner loops

// sum was 'SZ' originally.
// this is the sum of the values seen thus far when calculating variance.
Expand Down Expand Up @@ -183,46 +224,126 @@ func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
// of samples.
variance = sumSquares - (sum*sum)/w
if currentIndex != 0 {
for j := 2; j < nClasses+1; j++ {
// keep multiplication out of the inner loop
i2 := mat2idx(currentIndex, 0, y)

for j := 2; j < y; j++ {
// if adding this element to an existing class
// will increase its variance beyond the limit, break
// the class at this point, setting the lower_class_limit
// at this point.
if varianceCombinations[l][j] >= (variance + varianceCombinations[currentIndex][j-1]) {
lowerClassLimits[l][j] = lowerClassLimit
varianceCombinations[l][j] = variance + varianceCombinations[currentIndex][j-1]
j1 := i1 + j
j2 := i2 + j - 1

v1 := varianceCombinations[j1]
v2 := varianceCombinations[j2] + variance

if v1 >= v2 {
lowerClassLimits[j1] = lowerClassLimit
varianceCombinations[j1] = v2
}
}
}
}

lowerClassLimits[l][1] = 1;
varianceCombinations[l][1] = variance;
index := mat2idx(l, 1, y)
lowerClassLimits[index] = 1
varianceCombinations[index] = variance
}
// return the two matrices. for just providing breaks, only
// 'lower_class_limits' is needed, but variances can be useful to
// evaluate goodness of fit.
return lowerClassLimits, varianceCombinations
}

// forEachBreak walks the class boundaries encoded in the flattened
// lowerClassLimits matrix, invoking do(class, boundary) once per class, from
// the highest class down to class 1 (whose lower bound is always index 0).
func forEachBreak(data []float64, lowerClassLimits []int, maxClasses int, nClasses int, do func(class, boundary int)) {
	stride := maxClasses + 1

	// The matrix is used as indexes into itself: the boundary found for class
	// i tells us which row to read to locate the boundary of class i-1.
	idx := len(data) - 1
	for class := nClasses; class > 1; class-- {
		idx = lowerClassLimits[idx*stride+class] - 1
		do(class, idx)
	}

	// The walk above never reaches the lower bound of the first class, which
	// is always the start of the data.
	do(1, 0)
}

// the lowerClassLimits matrix is used as indexes into itself here:
// the next value of `k` is obtained from .
k := len(data) - 1
for i := nClasses; i > 1; i -- {
boundaryIndex := lowerClassLimits[k][i] - 1
classBoundaries[i-1] = data[boundaryIndex]
k = boundaryIndex
}
// breaks is the second part of the jenks recipe: it extracts nClasses class
// boundaries from the computed lowerClassLimits matrix, returning the lowest
// data value belonging to each class.
func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 {
	bounds := make([]float64, nClasses)

	collect := func(class, boundary int) {
		bounds[class-1] = data[boundary]
	}
	forEachBreak(data, lowerClassLimits, maxClasses, nClasses, collect)

	return bounds
}

// mat2len returns the number of elements required to store an x-by-y matrix
// in a flat slice.
func mat2len(x, y int) int {
	return y * x
}

// mat2idx converts matrix coordinates (row i, column j) into an offset in a
// flat slice whose rows are y elements wide.
func mat2idx(i, j, y int) int {
	return j + i*y
}

// copyFloat64s returns a newly allocated copy of data that shares no backing
// storage with the original.
func copyFloat64s(data []float64) []float64 {
	dup := make([]float64, len(data))
	copy(dup, data)
	return dup
}

// mean returns the arithmetic mean of data, or 0 for an empty slice.
func mean(data []float64) float64 {
	n := len(data)
	if n == 0 {
		return 0.0
	}
	total := 0.0
	for i := range data {
		total += data[i]
	}
	return total / float64(n)
}

// sumOfSquareDeviations returns the total squared distance of every point in
// data from the arithmetic mean of data.
func sumOfSquareDeviations(data []float64) float64 {
	m := mean(data)
	total := 0.0
	for i := range data {
		d := data[i] - m
		total += d * d
	}
	return total
}

// goodnessOfVarianceFit computes the GVF statistic for classifying data into
// nClasses classes: (SDAM - SDCM) / SDAM, where SDAM is the sum of squared
// deviations from the overall mean and SDCM is the sum of squared deviations
// from the class means. The result approaches 1 as the classification
// improves.
func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) float64 {
	// boundaries[i] is the index of the first element of class i+1.
	boundaries := make([]int, nClasses)

	forEachBreak(data, lowerClassLimits, maxClasses, nClasses, func(class, boundary int) {
		boundaries[class-1] = boundary
	})

	sdam := sumOfSquareDeviations(data)
	if sdam == 0 {
		// All values are identical; any classification is a perfect fit.
		// Guards the division below against producing NaN.
		return 1
	}
	sdcm := 0.0

	// Sum the within-class squared deviations over every class, including the
	// last one, whose upper bound is the end of the data rather than another
	// boundary. (Previously the final class was skipped, inflating the GVF.)
	for i, n := 0, len(boundaries); i < n; i++ {
		b1 := boundaries[i]
		b2 := len(data)
		if i+1 < n {
			b2 = boundaries[i+1]
		}
		if b1 < 0 {
			panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), nClasses, maxClasses, i, boundaries))
		}
		if b1 > len(data) {
			panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), len(data), nClasses, maxClasses, i, boundaries))
		}
		if b2 > len(data) {
			panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b2, len(data), len(data), nClasses, maxClasses, i, boundaries))
		}
		if b1 > b2 {
			panic(fmt.Errorf("lower bound greater than upper bound: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, b2, len(data), nClasses, maxClasses, i, boundaries))
		}
		sdcm += sumOfSquareDeviations(data[b1:b2])
	}

	return (sdam - sdcm) / sdam
}
Loading

0 comments on commit c26f879

Please sign in to comment.