diff --git a/pom.xml b/pom.xml
index 510cb51c..be2aabfb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
baseCode
baseCode
baseCode
- 1.0.32
+ 1.1
2003
@@ -49,7 +49,7 @@
org.apache.poi
poi
- 5.0.0
+ 5.1.0
colt
diff --git a/src/ubic/basecode/dataStructure/Link.java b/src/ubic/basecode/dataStructure/Link.java
index d8fc0786..750f3d6b 100644
--- a/src/ubic/basecode/dataStructure/Link.java
+++ b/src/ubic/basecode/dataStructure/Link.java
@@ -19,6 +19,7 @@
package ubic.basecode.dataStructure;
import java.text.NumberFormat;
+import java.util.Locale;
/**
* Implements comparable, which sorts by the 'x' coordinate and then secondarily by the 'y' coordinate. (This behavior
@@ -96,7 +97,7 @@ public int hashCode() {
*/
@Override
public String toString() {
- return super.toString() + "\t" + NumberFormat.getInstance().format( this.weight );
+ return super.toString() + "\t" + NumberFormat.getInstance( Locale.ENGLISH ).format( this.weight );
}
}
\ No newline at end of file
diff --git a/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java b/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java
index c47fd333..8b035d70 100644
--- a/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java
+++ b/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java
@@ -18,10 +18,7 @@
*/
package ubic.basecode.dataStructure.matrix;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
+import java.util.*;
import cern.colt.list.DoubleArrayList;
import cern.colt.matrix.DoubleMatrix1D;
@@ -188,7 +185,7 @@ public final String toString() {
if ( Double.isNaN( value ) ) {
buf.append( "\t" );
} else {
- buf.append( "\t" + String.format( "%.4g", value ) );
+ buf.append( "\t" + String.format( Locale.ENGLISH, "%.4g", value ) );
}
}
buf.append( "\n" );
diff --git a/src/ubic/basecode/io/reader/DoubleMatrixReader.java b/src/ubic/basecode/io/reader/DoubleMatrixReader.java
index a1ed488d..bfac5037 100644
--- a/src/ubic/basecode/io/reader/DoubleMatrixReader.java
+++ b/src/ubic/basecode/io/reader/DoubleMatrixReader.java
@@ -26,11 +26,7 @@
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.ParseException;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Vector;
+import java.util.*;
import org.apache.commons.lang3.StringUtils;
@@ -47,7 +43,7 @@
*/
public class DoubleMatrixReader extends AbstractMatrixReader, Double> {
- private static NumberFormat nf = NumberFormat.getInstance();
+ private static NumberFormat nf = NumberFormat.getInstance( Locale.ENGLISH );
static {
if ( nf instanceof DecimalFormat ) {
// ( ( DecimalFormat ) nf ).setDecimalSeparatorAlwaysShown( true );
diff --git a/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java b/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java
index 6339a908..303a9653 100644
--- a/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java
+++ b/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java
@@ -22,17 +22,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Scanner;
-import java.util.Set;
-import java.util.StringTokenizer;
-import java.util.Vector;
+import java.util.*;
import ubic.basecode.dataStructure.matrix.DoubleMatrix;
import ubic.basecode.dataStructure.matrix.SparseDoubleMatrix;
@@ -179,7 +169,7 @@ public DoubleMatrix readJW( InputStream stream ) throws IOExcept
BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
- Scanner ff = new Scanner( dis );
+ Scanner ff = new Scanner( dis ).useLocale( Locale.ENGLISH );
int index = 0;
int amount = 0;
diff --git a/src/ubic/basecode/math/Stats.java b/src/ubic/basecode/math/Stats.java
index ad4af80a..3eb44ef9 100644
--- a/src/ubic/basecode/math/Stats.java
+++ b/src/ubic/basecode/math/Stats.java
@@ -1,8 +1,8 @@
/*
* The baseCode project
- *
- * Copyright (c) 2006 University of British Columbia
- *
+ *
+ * Copyright (c) 2006-2021 University of British Columbia
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
@@ -23,16 +23,17 @@
import cern.colt.list.DoubleArrayList;
import cern.jet.stat.Descriptive;
+import org.apache.commons.math3.util.DoubleArray;
/**
* Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
* be found elsewhere.
- *
+ *
+ * @author Paul Pavlidis
* @see cern.jet.math
- *
+ *
* @see cern.jet.stat
- *
- * @author Paul Pavlidis
+ *
*/
public class Stats {
@@ -40,32 +41,32 @@ public class Stats {
/**
* Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
* representing the distribution in question.
- *
+ *
* @param x The input of counts (i.e. a histogram).
* @return DoubleArrayList the CDF.
*/
- public static DoubleArrayList cdf( DoubleArrayList x ) {
- return cumulateRight( normalize( x ) );
+ public static DoubleArrayList cdf(DoubleArrayList x) {
+ return cumulateRight(normalize(x));
}
/**
* Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
* concern is the left tail.
- *
+ *
* @param x DoubleArrayList
* @return cern.colt.list.DoubleArrayList
*/
- public static DoubleArrayList cumulate( DoubleArrayList x ) {
- if ( x.size() == 0 ) {
- return new DoubleArrayList( 0 );
+ public static DoubleArrayList cumulate(DoubleArrayList x) {
+ if (x.size() == 0) {
+ return new DoubleArrayList(0);
}
DoubleArrayList r = new DoubleArrayList();
double sum = 0.0;
- for ( int i = 0; i < x.size(); i++ ) {
- sum += x.get( i );
- r.add( sum );
+ for (int i = 0; i < x.size(); i++) {
+ sum += x.get(i);
+ r.add(sum);
}
return r;
}
@@ -74,21 +75,21 @@ public static DoubleArrayList cumulate( DoubleArrayList x ) {
* Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
* upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
* small numbers that could be lost to rounding.
- *
+ *
* @param x the array of data to be cumulated.
* @return cern.colt.list.DoubleArrayList
*/
- public static DoubleArrayList cumulateRight( DoubleArrayList x ) {
- if ( x.size() == 0 ) {
- return new DoubleArrayList( 0 );
+ public static DoubleArrayList cumulateRight(DoubleArrayList x) {
+ if (x.size() == 0) {
+ return new DoubleArrayList(0);
}
- DoubleArrayList r = new DoubleArrayList( new double[x.size()] );
+ DoubleArrayList r = new DoubleArrayList(new double[x.size()]);
double sum = 0.0;
- for ( int i = x.size() - 1; i >= 0; i-- ) {
- sum += x.get( i );
- r.set( i, sum );
+ for (int i = x.size() - 1; i >= 0; i--) {
+ sum += x.get(i);
+ r.set(i, sum);
}
return r;
}
@@ -97,33 +98,33 @@ public static DoubleArrayList cumulateRight( DoubleArrayList x ) {
* Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
* returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
* absolute value of the mean; that is, negative values are treated as magnitudes.
- *
+ *
* @param data DoubleArrayList
* @return the cv
* @todo offer a regularized version of this function.
*/
- public static double cv( DoubleArrayList data ) {
- double mean = DescriptiveWithMissing.mean( data );
+ public static double cv(DoubleArrayList data) {
+ double mean = DescriptiveWithMissing.mean(data);
- double sampleVariance = DescriptiveWithMissing.sampleVariance( data, mean );
+ double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean);
- if ( sampleVariance == 0.0 ) return 0.0;
+ if (sampleVariance == 0.0) return 0.0;
- if ( mean == 0.0 ) {
+ if (mean == 0.0) {
return 0.0;
}
- return Math.sqrt( sampleVariance ) / Math.abs( mean );
+ return Math.sqrt(sampleVariance) / Math.abs(mean);
}
/**
* Test whether a value is a valid fractional or probability value.
- *
+ *
* @param value
* @return true if the value is in the interval 0 to 1.
*/
- public static boolean isValidFraction( double value ) {
- if ( value > 1.0 || value < 0.0 ) {
+ public static boolean isValidFraction(double value) {
+ if (value > 1.0 || value < 0.0) {
return false;
}
return true;
@@ -132,14 +133,14 @@ public static boolean isValidFraction( double value ) {
/**
* calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
* must be a value from 0 to 100.
- *
- * @see DescriptiveWithMissing#meanAboveQuantile
- * @param index the rank of the value we wish to average above.
- * @param array Array for which we want to get the quantile.
+ *
+ * @param index the rank of the value we wish to average above.
+ * @param array Array for which we want to get the quantile.
* @param effectiveSize The size of the array, not including NaNs.
* @return double
+ * @see DescriptiveWithMissing#meanAboveQuantile
*/
- public static double meanAboveQuantile( int index, double[] array, int effectiveSize ) {
+ public static double meanAboveQuantile(int index, double[] array, int effectiveSize) {
double[] temp = new double[effectiveSize];
double median;
@@ -147,10 +148,10 @@ public static double meanAboveQuantile( int index, double[] array, int effective
int k = 0;
temp = array;
- median = quantile( index, array, effectiveSize );
+ median = quantile(index, array, effectiveSize);
- for ( int i = 0; i < effectiveSize; i++ ) {
- if ( temp[i] > median ) {
+ for (int i = 0; i < effectiveSize; i++) {
+ if (temp[i] > median) {
returnvalue += temp[i];
k++;
}
@@ -160,77 +161,128 @@ public static double meanAboveQuantile( int index, double[] array, int effective
/**
* Adjust the elements of an array so they total to 1.0.
- *
+ *
* @param x Input array.
* @return Normalized array.
*/
- public static DoubleArrayList normalize( DoubleArrayList x ) {
- return normalize( x, Descriptive.sum( x ) );
+ public static DoubleArrayList normalize(DoubleArrayList x) {
+ return normalize(x, Descriptive.sum(x));
}
/**
* Divide the elements of an array by a given factor.
- *
- * @param x Input array.
+ *
+ * @param x Input array.
* @param normfactor double
* @return Normalized array.
*/
- public static DoubleArrayList normalize( DoubleArrayList x, double normfactor ) {
- if ( x.size() == 0 ) {
- return new DoubleArrayList( 0 );
+ public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) {
+ if (x.size() == 0) {
+ return new DoubleArrayList(0);
}
DoubleArrayList r = new DoubleArrayList();
- for ( int i = 0; i < x.size(); i++ ) {
- r.add( x.get( i ) / normfactor );
+ for (int i = 0; i < x.size(); i++) {
+ r.add(x.get(i) / normfactor);
}
return r;
}
/**
- * @param array
+ * @param array input data
* @param tolerance a small constant
* @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
- * value.
+ * value.
*/
- public static Integer numberofDistinctValues( DoubleArrayList array, double tolerance ) {
+ public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) {
Set distinct = new HashSet<>();
int r = 1;
- if ( tolerance > 0.0 ) {
- r = ( int ) Math.ceil( 1.0 / tolerance );
+ if (tolerance > 0.0) {
+ r = (int) Math.ceil(1.0 / tolerance);
}
- for ( int i = 0; i < array.size(); i++ ) {
- double v = array.get( i );
- if ( tolerance > 0 ) {
+ for (int i = 0; i < array.size(); i++) {
+ double v = array.get(i);
+ if (tolerance > 0) {
// this might not be foolproof
- distinct.add( ( double ) Math.round( v * r ) / r );
+ distinct.add((double) Math.round(v * r) / r);
} else {
- distinct.add( v );
+ distinct.add(v);
}
}
- return Math.max( 0, distinct.size() );
+ return Math.max(0, distinct.size());
}
+
/**
- * Given a double array, calculate the quantile requested. Note that no interpolation is done.
- *
- * @see DescriptiveWithMissing#quantile
- * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
- * we should enter 100.
- * @param values double[] - array of data we want quantile of
+     * @param array     input data
+     * @param tolerance a small constant
+     * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely */
+    public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
+        // Counts distinct non-NaN values; with a positive tolerance, values are first rounded to multiples of 1/scale.
+        Set<Double> distinct = new HashSet<>();
+        boolean useTolerance = tolerance > 0.0;
+        // scale stays at 1 and is not used when no positive tolerance is given
+        int scale = useTolerance ? (int) Math.ceil(1.0 / tolerance) : 1;
+        for (int i = 0; i < array.size(); i++) {
+            double v = array.get(i);
+            if (Double.isNaN(v)) {
+                continue; // missing values never count as a distinct value
+            }
+            if (useTolerance) {
+                // this might not be foolproof
+                distinct.add((double) Math.round(v * scale) / scale);
+            } else {
+                // adding 0.0 folds -0.0 into 0.0; boxed Doubles would otherwise treat the two zeros as different
+                distinct.add(v + 0.0);
+            }
+        }
+        // a set size is never negative, so no clamping is needed
+        return distinct.size();
+    }
+
+    /**
+     * Report what share of the non-missing entries are distinct from one another. NaN entries are left out of both the
+     * distinct count and the total; when every entry is NaN (or the input is empty) the result is 0.0.
+     * @param array     input data
+     * @param tolerance a small constant to define the difference that is "distinct"
+     * @return number of distinct non-NaN values divided by the number of non-NaN values, or 0.0 if there are none
+     */
+    public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
+        final int observed = numNonMissing(array);
+        // guard the division: with no observed values the fraction is defined as 0.0
+        return observed == 0 ? 0.0 : (double) numberofDistinctValuesNonNA(array, tolerance) / observed;
+    }
+
+    private static Integer numNonMissing(DoubleArrayList array) {
+        // Tally the entries that are not NaN; the visiting order does not affect the count.
+        int present = 0;
+        for (int idx = array.size() - 1; idx >= 0; idx--) {
+            if (!Double.isNaN(array.get(idx))) present++;
+        }
+        return present;
+    }
+
+
+ /**
+ * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
+ *
+ * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
+ * we should enter 100.
+ * @param values double[] - array of data we want quantile of
* @param effectiveSize int the effective size of the array
* @return double the value at the requested quantile
+ * @see DescriptiveWithMissing#quantile
*/
- public static double quantile( int index, double[] values, int effectiveSize ) {
+ public static double quantile(int index, double[] values, int effectiveSize) {
double pivot = -1.0;
- if ( index == 0 ) {
+ if (index == 0) {
double ans = values[0];
- for ( int i = 1; i < effectiveSize; i++ ) {
- if ( ans > values[i] ) {
+ for (int i = 1; i < effectiveSize; i++) {
+ if (ans > values[i]) {
ans = values[i];
}
}
@@ -239,7 +291,7 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
double[] temp = new double[effectiveSize];
- for ( int i = 0; i < effectiveSize; i++ ) {
+ for (int i = 0; i < effectiveSize; i++) {
temp[i] = values[i];
}
@@ -249,19 +301,19 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
double[] bigger = new double[effectiveSize];
int itrSm = 0;
int itrBg = 0;
- for ( int i = 1; i < effectiveSize; i++ ) {
- if ( temp[i] <= pivot ) {
+ for (int i = 1; i < effectiveSize; i++) {
+ if (temp[i] <= pivot) {
smaller[itrSm] = temp[i];
itrSm++;
- } else if ( temp[i] > pivot ) {
+ } else if (temp[i] > pivot) {
bigger[itrBg] = temp[i];
itrBg++;
}
}
- if ( itrSm > index ) { // quantile must be in the 'smaller' array
- return quantile( index, smaller, itrSm );
- } else if ( itrSm < index ) { // quantile is in the 'bigger' array
- return quantile( index - itrSm - 1, bigger, itrBg );
+ if (itrSm > index) { // quantile must be in the 'smaller' array
+ return quantile(index, smaller, itrSm);
+ } else if (itrSm < index) { // quantile is in the 'bigger' array
+ return quantile(index - itrSm - 1, bigger, itrBg);
} else {
return pivot;
}
@@ -269,13 +321,13 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
}
/**
- * Compute the range of an array.
- *
+ * Compute the range of an array. Missing values are ignored.
+ *
* @param data DoubleArrayList
* @return double
*/
- public static double range( DoubleArrayList data ) {
- return DescriptiveWithMissing.max( data ) - DescriptiveWithMissing.min( data );
+ public static double range(DoubleArrayList data) {
+ return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data);
}
private Stats() { /* block instantiation */
diff --git a/test/ubic/basecode/math/TestStats.java b/test/ubic/basecode/math/TestStats.java
index 6633571c..d8cf786c 100644
--- a/test/ubic/basecode/math/TestStats.java
+++ b/test/ubic/basecode/math/TestStats.java
@@ -148,6 +148,24 @@ public final void testNumberOfDistinctValues() {
assertEquals( 6, actualReturn );
}
+
+    @Test
+    public final void testNumberOfDistinctValuesNonNA() {
+        // the trailing NaN must not be counted, and the repeated 1.0 collapses to one value
+        DoubleArrayList withMissing = new DoubleArrayList( new double[] { 1.0, 1.0, 3.0, 4.0, 5.0, 6.0, Double.NaN } );
+        assertEquals( 5, ( int ) Stats.numberofDistinctValuesNonNA( withMissing, 0.01 ) );
+        // fixture declared elsewhere in this test class
+        assertEquals( 5, ( int ) Stats.numberofDistinctValuesNonNA( data1Nomissing, 0.01 ) );
+        // 4.0 and 4.00001 merge at tolerance 1e-4 ...
+        DoubleArrayList nearTies = new DoubleArrayList( new double[] { 1.0, 1.0, 3.0, 4.0, 4.00001, 5.0, 6.0 } );
+        assertEquals( 5, ( int ) Stats.numberofDistinctValuesNonNA( nearTies, 0.0001 ) );
+        // ... but stay separate at tolerance 1e-5, and the embedded NaN is still skipped
+        DoubleArrayList nearTiesWithMissing =
+                new DoubleArrayList( new double[] { 1.0, 1.0, 3.0, 4.0, 4.00001, 5.0, Double.NaN, 6.0 } );
+        assertEquals( 6, ( int ) Stats.numberofDistinctValuesNonNA( nearTiesWithMissing, 0.00001 ) );
+    }
+
+
@Test
public final void testQuantile() {
double expectedReturn = 0.595221355;