diff --git a/pom.xml b/pom.xml index 510cb51c..be2aabfb 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ baseCode baseCode baseCode - 1.0.32 + 1.1 2003 @@ -49,7 +49,7 @@ org.apache.poi poi - 5.0.0 + 5.1.0 colt diff --git a/src/ubic/basecode/dataStructure/Link.java b/src/ubic/basecode/dataStructure/Link.java index d8fc0786..750f3d6b 100644 --- a/src/ubic/basecode/dataStructure/Link.java +++ b/src/ubic/basecode/dataStructure/Link.java @@ -19,6 +19,7 @@ package ubic.basecode.dataStructure; import java.text.NumberFormat; +import java.util.Locale; /** * Implements comparable, which sorts by the 'x' coordinate and then secondarily by the 'y' coordinate. (This behavior @@ -96,7 +97,7 @@ public int hashCode() { */ @Override public String toString() { - return super.toString() + "\t" + NumberFormat.getInstance().format( this.weight ); + return super.toString() + "\t" + NumberFormat.getInstance( Locale.ENGLISH ).format( this.weight ); } } \ No newline at end of file diff --git a/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java b/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java index c47fd333..8b035d70 100644 --- a/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java +++ b/src/ubic/basecode/dataStructure/matrix/DoubleMatrix.java @@ -18,10 +18,7 @@ */ package ubic.basecode.dataStructure.matrix; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; +import java.util.*; import cern.colt.list.DoubleArrayList; import cern.colt.matrix.DoubleMatrix1D; @@ -188,7 +185,7 @@ public final String toString() { if ( Double.isNaN( value ) ) { buf.append( "\t" ); } else { - buf.append( "\t" + String.format( "%.4g", value ) ); + buf.append( "\t" + String.format( Locale.ENGLISH, "%.4g", value ) ); } } buf.append( "\n" ); diff --git a/src/ubic/basecode/io/reader/DoubleMatrixReader.java b/src/ubic/basecode/io/reader/DoubleMatrixReader.java index a1ed488d..bfac5037 100644 --- a/src/ubic/basecode/io/reader/DoubleMatrixReader.java +++ b/src/ubic/basecode/io/reader/DoubleMatrixReader.java @@ -26,11 +26,7 @@ import java.text.DecimalFormat; import java.text.NumberFormat; import java.text.ParseException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Vector; +import java.util.*; import org.apache.commons.lang3.StringUtils; @@ -47,7 +43,7 @@ */ public class DoubleMatrixReader extends AbstractMatrixReader, Double> { - private static NumberFormat nf = NumberFormat.getInstance(); + private static NumberFormat nf = NumberFormat.getInstance( Locale.ENGLISH ); static { if ( nf instanceof DecimalFormat ) { // ( ( DecimalFormat ) nf ).setDecimalSeparatorAlwaysShown( true ); diff --git a/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java b/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java index 6339a908..303a9653 100644 --- a/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java +++ b/src/ubic/basecode/io/reader/SparseDoubleMatrixReader.java @@ -22,17 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Scanner; -import java.util.Set; -import java.util.StringTokenizer; -import java.util.Vector; +import java.util.*; import ubic.basecode.dataStructure.matrix.DoubleMatrix; import ubic.basecode.dataStructure.matrix.SparseDoubleMatrix; @@ -179,7 +169,7 @@ public DoubleMatrix readJW( InputStream stream ) throws IOExcept BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) ); - Scanner ff = new Scanner( dis ); + Scanner ff = new Scanner( dis ).useLocale( Locale.ENGLISH ); int index = 0; int amount = 0; diff --git a/src/ubic/basecode/math/Stats.java b/src/ubic/basecode/math/Stats.java index ad4af80a..3eb44ef9 100644 --- a/src/ubic/basecode/math/Stats.java +++ b/src/ubic/basecode/math/Stats.java @@ -1,8 +1,8 @@ /* * The baseCode project - * - * Copyright (c) 2006 University of British Columbia - * + * + * Copyright (c) 2006-2021 University of British Columbia + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -23,16 +23,17 @@ import cern.colt.list.DoubleArrayList; import cern.jet.stat.Descriptive; +import org.apache.commons.math3.util.DoubleArray; /** * Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can * be found elsewhere. - * + * + * @author Paul Pavlidis * @see cern.jet.math - * + * * @see cern.jet.stat - * - * @author Paul Pavlidis + * */ public class Stats { @@ -40,32 +41,32 @@ public class Stats { /** * Convert an array into a cumulative density function (CDF). This assumes that the input contains counts * representing the distribution in question. - * + * * @param x The input of counts (i.e. a histogram). * @return DoubleArrayList the CDF. */ - public static DoubleArrayList cdf( DoubleArrayList x ) { - return cumulateRight( normalize( x ) ); + public static DoubleArrayList cdf(DoubleArrayList x) { + return cumulateRight(normalize(x)); } /** * Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the * concern is the left tail. - * + * * @param x DoubleArrayList * @return cern.colt.list.DoubleArrayList */ - public static DoubleArrayList cumulate( DoubleArrayList x ) { - if ( x.size() == 0 ) { - return new DoubleArrayList( 0 ); + public static DoubleArrayList cumulate(DoubleArrayList x) { + if (x.size() == 0) { + return new DoubleArrayList(0); } DoubleArrayList r = new DoubleArrayList(); double sum = 0.0; - for ( int i = 0; i < x.size(); i++ ) { - sum += x.get( i ); - r.add( sum ); + for (int i = 0; i < x.size(); i++) { + sum += x.get(i); + r.add(sum); } return r; } @@ -74,21 +75,21 @@ public static DoubleArrayList cumulate( DoubleArrayList x ) { * Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating * upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very * small numbers that could be lost to rounding. - * + * * @param x the array of data to be cumulated. * @return cern.colt.list.DoubleArrayList */ - public static DoubleArrayList cumulateRight( DoubleArrayList x ) { - if ( x.size() == 0 ) { - return new DoubleArrayList( 0 ); + public static DoubleArrayList cumulateRight(DoubleArrayList x) { + if (x.size() == 0) { + return new DoubleArrayList(0); } - DoubleArrayList r = new DoubleArrayList( new double[x.size()] ); + DoubleArrayList r = new DoubleArrayList(new double[x.size()]); double sum = 0.0; - for ( int i = x.size() - 1; i >= 0; i-- ) { - sum += x.get( i ); - r.set( i, sum ); + for (int i = x.size() - 1; i >= 0; i--) { + sum += x.get(i); + r.set(i, sum); } return r; } @@ -97,33 +98,33 @@ public static DoubleArrayList cumulateRight( DoubleArrayList x ) { * Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this * returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the * absolute value of the mean; that is, negative values are treated as magnitudes. - * + * * @param data DoubleArrayList * @return the cv * @todo offer a regularized version of this function. */ - public static double cv( DoubleArrayList data ) { - double mean = DescriptiveWithMissing.mean( data ); + public static double cv(DoubleArrayList data) { + double mean = DescriptiveWithMissing.mean(data); - double sampleVariance = DescriptiveWithMissing.sampleVariance( data, mean ); + double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean); - if ( sampleVariance == 0.0 ) return 0.0; + if (sampleVariance == 0.0) return 0.0; - if ( mean == 0.0 ) { + if (mean == 0.0) { return 0.0; } - return Math.sqrt( sampleVariance ) / Math.abs( mean ); + return Math.sqrt(sampleVariance) / Math.abs(mean); } /** * Test whether a value is a valid fractional or probability value. - * + * * @param value * @return true if the value is in the interval 0 to 1. */ - public static boolean isValidFraction( double value ) { - if ( value > 1.0 || value < 0.0 ) { + public static boolean isValidFraction(double value) { + if (value > 1.0 || value < 0.0) { return false; } return true; @@ -132,14 +133,14 @@ public static boolean isValidFraction( double value ) { /** * calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile * must be a value from 0 to 100. - * - * @see DescriptiveWithMissing#meanAboveQuantile - * @param index the rank of the value we wish to average above. - * @param array Array for which we want to get the quantile. + * + * @param index the rank of the value we wish to average above. + * @param array Array for which we want to get the quantile. * @param effectiveSize The size of the array, not including NaNs. * @return double + * @see DescriptiveWithMissing#meanAboveQuantile */ - public static double meanAboveQuantile( int index, double[] array, int effectiveSize ) { + public static double meanAboveQuantile(int index, double[] array, int effectiveSize) { double[] temp = new double[effectiveSize]; double median; @@ -147,10 +148,10 @@ public static double meanAboveQuantile( int index, double[] array, int effective int k = 0; temp = array; - median = quantile( index, array, effectiveSize ); + median = quantile(index, array, effectiveSize); - for ( int i = 0; i < effectiveSize; i++ ) { - if ( temp[i] > median ) { + for (int i = 0; i < effectiveSize; i++) { + if (temp[i] > median) { returnvalue += temp[i]; k++; } @@ -160,77 +161,128 @@ public static double meanAboveQuantile( int index, double[] array, int effective /** * Adjust the elements of an array so they total to 1.0. - * + * * @param x Input array. * @return Normalized array. */ - public static DoubleArrayList normalize( DoubleArrayList x ) { - return normalize( x, Descriptive.sum( x ) ); + public static DoubleArrayList normalize(DoubleArrayList x) { + return normalize(x, Descriptive.sum(x)); } /** * Divide the elements of an array by a given factor. - * - * @param x Input array. + * + * @param x Input array. * @param normfactor double * @return Normalized array. */ - public static DoubleArrayList normalize( DoubleArrayList x, double normfactor ) { - if ( x.size() == 0 ) { - return new DoubleArrayList( 0 ); + public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) { + if (x.size() == 0) { + return new DoubleArrayList(0); } DoubleArrayList r = new DoubleArrayList(); - for ( int i = 0; i < x.size(); i++ ) { - r.add( x.get( i ) / normfactor ); + for (int i = 0; i < x.size(); i++) { + r.add(x.get(i) / normfactor); } return r; } /** - * @param array + * @param array input data * @param tolerance a small constant * @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct - * value. + * value. */ - public static Integer numberofDistinctValues( DoubleArrayList array, double tolerance ) { + public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) { Set distinct = new HashSet<>(); int r = 1; - if ( tolerance > 0.0 ) { - r = ( int ) Math.ceil( 1.0 / tolerance ); + if (tolerance > 0.0) { + r = (int) Math.ceil(1.0 / tolerance); } - for ( int i = 0; i < array.size(); i++ ) { - double v = array.get( i ); - if ( tolerance > 0 ) { + for (int i = 0; i < array.size(); i++) { + double v = array.get(i); + if (tolerance > 0) { // this might not be foolproof - distinct.add( ( double ) Math.round( v * r ) / r ); + distinct.add((double) Math.round(v * r) / r); } else { - distinct.add( v ); + distinct.add(v); } } - return Math.max( 0, distinct.size() ); + return Math.max(0, distinct.size()); } + /** - * Given a double array, calculate the quantile requested. Note that no interpolation is done. - * - * @see DescriptiveWithMissing#quantile - * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median, - * we should enter 100. - * @param values double[] - array of data we want quantile of + * @param tolerance a small constant + * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely + */ + public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) { + + Set distinct = new HashSet<>(); + int r = 1; + if (tolerance > 0.0) { + r = (int) Math.ceil(1.0 / tolerance); + } + for (int i = 0; i < array.size(); i++) { + double v = array.get(i); + if (Double.isNaN(v)) { + continue; + } + if (tolerance > 0) { + // this might not be foolproof + distinct.add((double) Math.round(v * r) / r); + } else { + distinct.add(v); + } + } + return Math.max(0, distinct.size()); + + } + + /** + * Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned. + * + * @param array input data + * @param tolerance a small constant to define the difference that is "distinct" + * @return + */ + public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) { + double numNonNA = (double) numNonMissing(array); + if (numNonNA == 0) return 0.0; + return (double) numberofDistinctValuesNonNA(array, tolerance) / numNonNA; + } + + private static Integer numNonMissing(DoubleArrayList array) { + int nm = 0; + for (int i = 0; i < array.size(); i++) { + if (Double.isNaN(array.get(i))) continue; + nm++; + } + return nm; + } + + + /** + * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored. + * + * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median, + * we should enter 100. + * @param values double[] - array of data we want quantile of * @param effectiveSize int the effective size of the array * @return double the value at the requested quantile + * @see DescriptiveWithMissing#quantile */ - public static double quantile( int index, double[] values, int effectiveSize ) { + public static double quantile(int index, double[] values, int effectiveSize) { double pivot = -1.0; - if ( index == 0 ) { + if (index == 0) { double ans = values[0]; - for ( int i = 1; i < effectiveSize; i++ ) { - if ( ans > values[i] ) { + for (int i = 1; i < effectiveSize; i++) { + if (ans > values[i]) { ans = values[i]; } } @@ -239,7 +291,7 @@ public static double quantile( int index, double[] values, int effectiveSize ) { double[] temp = new double[effectiveSize]; - for ( int i = 0; i < effectiveSize; i++ ) { + for (int i = 0; i < effectiveSize; i++) { temp[i] = values[i]; } @@ -249,19 +301,19 @@ public static double quantile( int index, double[] values, int effectiveSize ) { double[] bigger = new double[effectiveSize]; int itrSm = 0; int itrBg = 0; - for ( int i = 1; i < effectiveSize; i++ ) { - if ( temp[i] <= pivot ) { + for (int i = 1; i < effectiveSize; i++) { + if (temp[i] <= pivot) { smaller[itrSm] = temp[i]; itrSm++; - } else if ( temp[i] > pivot ) { + } else if (temp[i] > pivot) { bigger[itrBg] = temp[i]; itrBg++; } } - if ( itrSm > index ) { // quantile must be in the 'smaller' array - return quantile( index, smaller, itrSm ); - } else if ( itrSm < index ) { // quantile is in the 'bigger' array - return quantile( index - itrSm - 1, bigger, itrBg ); + if (itrSm > index) { // quantile must be in the 'smaller' array + return quantile(index, smaller, itrSm); + } else if (itrSm < index) { // quantile is in the 'bigger' array + return quantile(index - itrSm - 1, bigger, itrBg); } else { return pivot; } @@ -269,13 +321,13 @@ public static double quantile( int index, double[] values, int effectiveSize ) { } /** - * Compute the range of an array. - * + * Compute the range of an array. Missing values are ignored. + * * @param data DoubleArrayList * @return double */ - public static double range( DoubleArrayList data ) { - return DescriptiveWithMissing.max( data ) - DescriptiveWithMissing.min( data ); + public static double range(DoubleArrayList data) { + return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data); } private Stats() { /* block instantiation */ diff --git a/test/ubic/basecode/math/TestStats.java b/test/ubic/basecode/math/TestStats.java index 6633571c..d8cf786c 100644 --- a/test/ubic/basecode/math/TestStats.java +++ b/test/ubic/basecode/math/TestStats.java @@ -148,6 +148,24 @@ public final void testNumberOfDistinctValues() { assertEquals( 6, actualReturn ); } + + @Test + public final void testNumberOfDistinctValuesNonNA() { + int actualReturn = Stats.numberofDistinctValuesNonNA( new DoubleArrayList( new double[] { 1.0, 1.0, 3.0, 4.0, 5.0, + 6.0, Double.NaN } ), 0.01 ); + assertEquals( 5, actualReturn ); + + actualReturn = Stats.numberofDistinctValuesNonNA( data1Nomissing, 0.01 ); + assertEquals( 5, actualReturn ); + actualReturn = Stats.numberofDistinctValuesNonNA( new DoubleArrayList( new double[] { 1.0, 1.0, 3.0, 4.0, 4.00001, + 5.0, 6.0 } ), 0.0001 ); + assertEquals( 5, actualReturn ); + actualReturn = Stats.numberofDistinctValuesNonNA( new DoubleArrayList( new double[] { 1.0, 1.0, 3.0, 4.0, 4.00001, + 5.0, Double.NaN, 6.0 } ), 0.00001 ); + assertEquals( 6, actualReturn ); + } + + @Test public final void testQuantile() { double expectedReturn = 0.595221355;