Skip to content

Commit

Permalink
Merge db8c8c4 into 6a89f80
Browse files Browse the repository at this point in the history
  • Loading branch information
achille-roussel committed Aug 31, 2019
2 parents 6a89f80 + db8c8c4 commit c26f879
Show file tree
Hide file tree
Showing 3 changed files with 246 additions and 63 deletions.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/segmentio/jenks

go 1.12
227 changes: 174 additions & 53 deletions jenks.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,47 @@
package jenks

import (
"fmt"
"math"
"sort"
"fmt"
"strings"
"strconv"
"strings"
)

// Jenks natural breaks optimization (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
// Based on the javascript implementation: https://gist.github.com/tmcw/4977508
// though that implementation has a bug - it has been fixed here.

// BestNaturalBreaks searches for the smallest number of classes (up to
// maxClasses) whose goodness of variance fit reaches minGvf, and returns the
// natural breaks computed for that class count. If no class count reaches
// minGvf, the breaks for the best fit encountered are returned.
func BestNaturalBreaks(data []float64, maxClasses int, minGvf float64) []float64 {
	data = sortData(data)

	// With fewer unique values than classes the classification is trivial.
	if uniq := countUniqueValues(data); maxClasses >= uniq {
		if uniq <= 2 {
			return deduplicate(data)
		}
		maxClasses = uniq
	}

	lowerClassLimits, _ := getMatrices(data, maxClasses)
	bestClass, bestGvf := 1, 0.0

	for candidate := 2; candidate <= maxClasses; candidate++ {
		gvf := goodnessOfVarianceFit(data, lowerClassLimits, maxClasses, candidate)

		if gvf > bestGvf {
			bestGvf, bestClass = gvf, candidate
		}

		// Stop as soon as the fit is good enough; fewer classes is better.
		if gvf >= minGvf {
			break
		}
	}

	return breaks(data, lowerClassLimits, maxClasses, bestClass)
}

// NaturalBreaks returns the best nClasses natural breaks in the data,
// using the Jenks natural breaks classification method (http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization).
// It tries to maximize the similarity of numbers in groups while maximizing the distance between the groups.
Expand All @@ -20,16 +50,15 @@ func NaturalBreaks(data []float64, nClasses int) []float64 {
data = sortData(data)

// sanity check
uniq := deduplicate(data)
if nClasses >= len(uniq) {
return uniq
if nClasses >= countUniqueValues(data) {
return deduplicate(data)
}

// get our basic matrices (we only need lower class limits here)
lowerClassLimits, _ := getMatrices(data, nClasses)

// extract nClasses out of the computed matrices
return breaks(data, lowerClassLimits, nClasses)
return breaks(data, lowerClassLimits, nClasses, nClasses)
}

// AllNaturalBreaks finds all natural breaks in the data, for every set of breaks between 2 breaks and maxClasses.
Expand All @@ -40,9 +69,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 {
data = sortData(data)

// sanity check
uniq := deduplicate(data)
if maxClasses > len(uniq) {
maxClasses = len(uniq)
uniq := countUniqueValues(data)
if maxClasses > uniq {
maxClasses = uniq
}

// get our basic matrices (we only need lower class limits here)
Expand All @@ -51,9 +80,9 @@ func AllNaturalBreaks(data []float64, maxClasses int) [][]float64 {
// extract nClasses out of the computed matrices
allBreaks := [][]float64{}
for i := 2; i <= maxClasses; i++ {
nClasses := breaks(data, lowerClassLimits, i)
if i == len(uniq) {
nClasses = uniq
nClasses := breaks(data, lowerClassLimits, maxClasses, i)
if i == uniq {
nClasses = deduplicate(data)
}
allBreaks = append(allBreaks, nClasses)
}
Expand All @@ -71,7 +100,7 @@ func Round(breaks []float64, data []float64) []float64 {
dataIdx := sort.SearchFloat64s(data, breaks[breakIdx])
var floor float64
if dataIdx == 0 { // make sure we can't go below breaks[i] - (breaks[i+1]-breaks[i])
floor = data[0] - (breaks[breakIdx+1]-breaks[breakIdx])
floor = data[0] - (breaks[breakIdx+1] - breaks[breakIdx])
} else {
floor = data[dataIdx-1]
}
Expand All @@ -84,7 +113,7 @@ func Round(breaks []float64, data []float64) []float64 {
func roundValue(initialValue float64, floor float64) float64 {
b := []byte(strings.Trim(fmt.Sprintf("%f", initialValue), "0"))
value := initialValue
for i := len(b)-1; i >= 0; i-- {
for i := len(b) - 1; i >= 0; i-- {
if b[i] != '.' {
b[i] = '0'
round, e := strconv.ParseFloat(string(b), 64)
Expand All @@ -100,10 +129,8 @@ func roundValue(initialValue float64, floor float64) float64 {
// sortData checks to see if the data is sorted, returning it unchanged if so. Otherwise, it creates and sorts a copy.
func sortData(data []float64) []float64 {
if !sort.Float64sAreSorted(data) {
data2 := make([]float64, len(data))
copy(data2, data)
sort.Float64s(data2)
data = data2
data = copyFloat64s(data)
sort.Float64s(data)
}
return data
}
Expand All @@ -121,38 +148,52 @@ func deduplicate(data []float64) []float64 {
return uniq
}

// countUniqueValues reports how many distinct values appear in the sorted
// slice of data points. It exists as an optimization so callers can avoid the
// allocation performed by deduplicate when only the count is needed.
func countUniqueValues(data []float64) int {
	count := 0
	// NaN compares unequal to every value, including itself, so the first
	// element is always counted.
	prev := math.NaN()
	for _, value := range data {
		if value != prev {
			prev = value
			count++
		}
	}
	return count
}

// getMatrices Computes the matrices required for Jenks breaks.
// These matrices can be used for any classing of data with 'classes <= n_classes'
func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
func getMatrices(data []float64, nClasses int) ([]int, []float64) {
x := len(data) + 1
y := nClasses + 1
n := mat2len(x, y)

// in the original implementation, these matrices are referred to
// as 'LC' and 'OP'
//
// * lowerClassLimits (LC): optimal lower class limits
// * variance_combinations (OP): optimal variance combinations for all classes
lowerClassLimits := make([][]int, len(data)+1)
varianceCombinations := make([][]float64, len(data)+1)
lowerClassLimits := make([]int, n)
varianceCombinations := make([]float64, n)

// the variance, as computed at each step in the calculation
variance := 0.0

// Initialize and fill each matrix with zeroes
for i := 0; i < len(data)+1; i++ {
lowerClassLimits[i] = make([]int, nClasses+1)
varianceCombinations[i] = make([]float64, nClasses+1)
}
for i := 1; i < y; i++ {
index := mat2idx(1, i, y)
lowerClassLimits[index] = 1
varianceCombinations[index] = 0

for i := 1; i < nClasses+1; i++ {
lowerClassLimits[1][i] = 1
varianceCombinations[1][i] = 0
// in the original implementation, 'Infinity' is used but
// math.MaxFloat64 will do.
for j := 2; j < len(data)+1; j++ {
varianceCombinations[j][i] = math.MaxFloat64
for j := 2; j < x; j++ {
varianceCombinations[mat2idx(j, i, y)] = math.Inf(+1)
}
}

for l := 2; l < len(data)+1; l++ {
for l := 2; l < x; l++ {
i1 := mat2idx(l, 0, y) // keep multiplication out of the inner loops

// sum was 'SZ' originally.
// this is the sum of the values seen thus far when calculating variance.
Expand Down Expand Up @@ -183,46 +224,126 @@ func getMatrices(data []float64, nClasses int) ([][]int, [][]float64) {
// of samples.
variance = sumSquares - (sum*sum)/w
if currentIndex != 0 {
for j := 2; j < nClasses+1; j++ {
// keep multiplication out of the inner loop
i2 := mat2idx(currentIndex, 0, y)

for j := 2; j < y; j++ {
// if adding this element to an existing class
// will increase its variance beyond the limit, break
// the class at this point, setting the lower_class_limit
// at this point.
if varianceCombinations[l][j] >= (variance + varianceCombinations[currentIndex][j-1]) {
lowerClassLimits[l][j] = lowerClassLimit
varianceCombinations[l][j] = variance + varianceCombinations[currentIndex][j-1]
j1 := i1 + j
j2 := i2 + j - 1

v1 := varianceCombinations[j1]
v2 := varianceCombinations[j2] + variance

if v1 >= v2 {
lowerClassLimits[j1] = lowerClassLimit
varianceCombinations[j1] = v2
}
}
}
}

lowerClassLimits[l][1] = 1;
varianceCombinations[l][1] = variance;
index := mat2idx(l, 1, y)
lowerClassLimits[index] = 1
varianceCombinations[index] = variance
}
// return the two matrices. for just providing breaks, only
// 'lower_class_limits' is needed, but variances can be useful to
// evaluate goodness of fit.
return lowerClassLimits, varianceCombinations
}

// forEachBreak walks the class boundaries encoded in the flattened
// lowerClassLimits matrix, invoking do(class, boundary) once per class, from
// the highest class down to class 1 (whose lower bound is always index 0).
func forEachBreak(data []float64, lowerClassLimits []int, maxClasses int, nClasses int, do func(class, boundary int)) {
	stride := maxClasses + 1

	// The matrix is used as indexes into itself: the boundary found for class
	// i tells us which row to read to locate the boundary of class i-1.
	idx := len(data) - 1
	for class := nClasses; class > 1; class-- {
		idx = lowerClassLimits[idx*stride+class] - 1
		do(class, idx)
	}

	// The walk above never reaches the lower bound of the first class, which
	// is always the start of the data.
	do(1, 0)
}

// the lowerClassLimits matrix is used as indexes into itself here:
// the next value of `k` is obtained from .
k := len(data) - 1
for i := nClasses; i > 1; i -- {
boundaryIndex := lowerClassLimits[k][i] - 1
classBoundaries[i-1] = data[boundaryIndex]
k = boundaryIndex
}
// breaks is the second part of the jenks recipe: it extracts nClasses class
// boundaries from the computed lowerClassLimits matrix, returning the lowest
// data value belonging to each class.
func breaks(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) []float64 {
	bounds := make([]float64, nClasses)

	collect := func(class, boundary int) {
		bounds[class-1] = data[boundary]
	}
	forEachBreak(data, lowerClassLimits, maxClasses, nClasses, collect)

	return bounds
}

// mat2len returns the number of elements required to store an x-by-y matrix
// in a flat slice.
func mat2len(x, y int) int {
	return y * x
}

// mat2idx converts matrix coordinates (row i, column j) into an offset in a
// flat slice whose rows are y elements wide.
func mat2idx(i, j, y int) int {
	return j + i*y
}

// copyFloat64s returns a newly allocated copy of data that shares no backing
// storage with the original.
func copyFloat64s(data []float64) []float64 {
	dup := make([]float64, len(data))
	copy(dup, data)
	return dup
}

// mean returns the arithmetic mean of data, or 0 for an empty slice.
func mean(data []float64) float64 {
	n := len(data)
	if n == 0 {
		return 0.0
	}
	total := 0.0
	for i := range data {
		total += data[i]
	}
	return total / float64(n)
}

// sumOfSquareDeviations returns the total squared distance of every point in
// data from the arithmetic mean of data.
func sumOfSquareDeviations(data []float64) float64 {
	m := mean(data)
	total := 0.0
	for i := range data {
		d := data[i] - m
		total += d * d
	}
	return total
}

// goodnessOfVarianceFit computes the GVF statistic for classifying data into
// nClasses classes: (SDAM - SDCM) / SDAM, where SDAM is the sum of squared
// deviations from the overall mean and SDCM is the sum of squared deviations
// from the class means. The result approaches 1 as the classification
// improves.
func goodnessOfVarianceFit(data []float64, lowerClassLimits []int, maxClasses int, nClasses int) float64 {
	// boundaries[i] is the index of the first element of class i+1.
	boundaries := make([]int, nClasses)

	forEachBreak(data, lowerClassLimits, maxClasses, nClasses, func(class, boundary int) {
		boundaries[class-1] = boundary
	})

	sdam := sumOfSquareDeviations(data)
	if sdam == 0 {
		// All values are identical; any classification is a perfect fit.
		// Guards the division below against producing NaN.
		return 1
	}
	sdcm := 0.0

	// Sum the within-class squared deviations over every class, including the
	// last one, whose upper bound is the end of the data rather than another
	// boundary. (Previously the final class was skipped, inflating the GVF.)
	for i, n := 0, len(boundaries); i < n; i++ {
		b1 := boundaries[i]
		b2 := len(data)
		if i+1 < n {
			b2 = boundaries[i+1]
		}
		if b1 < 0 {
			panic(fmt.Errorf("lower bound out of bounds: %d < 0; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), nClasses, maxClasses, i, boundaries))
		}
		if b1 > len(data) {
			panic(fmt.Errorf("lower bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, len(data), len(data), nClasses, maxClasses, i, boundaries))
		}
		if b2 > len(data) {
			panic(fmt.Errorf("upper bound out of bounds: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b2, len(data), len(data), nClasses, maxClasses, i, boundaries))
		}
		if b1 > b2 {
			panic(fmt.Errorf("lower bound greater than upper bound: %d > %d; len(data)=%d; class=%d/%d; index=%d; boundaries=%v", b1, b2, len(data), nClasses, maxClasses, i, boundaries))
		}
		sdcm += sumOfSquareDeviations(data[b1:b2])
	}

	return (sdam - sdcm) / sdam
}
Loading

0 comments on commit c26f879

Please sign in to comment.