BUG: stats: Merge pull request scipy#87 (handle array_like in kruskal…

…, improve code in kruskal and f_oneway)
WarrenWeckesser · Oct 7, 2011 · ea3d301 · ea3d301
2 parents 0496569 + 9807ac1
commit ea3d301
Show file tree

Hide file tree

Showing 2 changed files with 92 additions and 32 deletions.
diff --git a/scipy/stats/stats.py b/scipy/stats/stats.py
@@ -2152,26 +2152,25 @@ def f_oneway(*args):
     .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
 
     """
-    na = len(args)            # ANOVA on 'na' groups, each in it's own array
-    tmp = map(np.array,args)
+    args = map(np.asarray, args) # convert to an numpy array
+    na = len(args)              # ANOVA on 'na' groups, each in it's own array
     alldata = np.concatenate(args)
     bign = len(alldata)
-    sstot = ss(alldata)-(square_of_sums(alldata)/float(bign))
+    sstot = ss(alldata) - (square_of_sums(alldata) / float(bign))
     ssbn = 0
     for a in args:
-        ssbn = ssbn + square_of_sums(array(a))/float(len(a))
-    ssbn = ssbn - (square_of_sums(alldata)/float(bign))
-    sswn = sstot-ssbn
-    dfbn = na-1
+        ssbn += square_of_sums(a) / float(len(a))
+    ssbn -= (square_of_sums(alldata) / float(bign))
+    sswn = sstot - ssbn
+    dfbn = na - 1
     dfwn = bign - na
-    msb = ssbn/float(dfbn)
-    msw = sswn/float(dfwn)
-    f = msb/msw
-    prob = fprob(dfbn,dfwn,f)
+    msb = ssbn / float(dfbn)
+    msw = sswn / float(dfwn)
+    f = msb / msw
+    prob = fprob(dfbn, dfwn, f)
     return f, prob
 
 
-
 def pearsonr(x, y):
     """Calculates a Pearson correlation coefficient and the p-value for testing
     non-correlation.
@@ -3548,30 +3547,31 @@ def kruskal(*args):
     .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
 
     """
-    if len(args) < 2:
+    args = map(np.asarray, args) # convert to a numpy array
+    na = len(args)               # Kruskal-Wallis on 'na' groups, each in it's own array
+    if na < 2:
         raise ValueError("Need at least two groups in stats.kruskal()")
-    n = map(len,args)
-    all = []
-    for i in range(len(args)):
-        all.extend(args[i].tolist())
-    ranked = list(rankdata(all))
-    T = tiecorrect(ranked)
-    args = list(args)
-    for i in range(len(args)):
-        args[i] = ranked[0:n[i]]
-        del ranked[0:n[i]]
-    rsums = []
-    for i in range(len(args)):
-        rsums.append(np.sum(args[i],axis=0)**2)
-        rsums[i] = rsums[i] / float(n[i])
-    ssbn = np.sum(rsums,axis=0)
-    totaln = np.sum(n,axis=0)
-    h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
-    df = len(args) - 1
+    n = np.asarray(map(len, args))
+
+    alldata = np.concatenate(args)
+
+    ranked = rankdata(alldata)  # Rank the data
+    T = tiecorrect(ranked)      # Correct for ties
     if T == 0:
         raise ValueError('All numbers are identical in kruskal')
+
+    # Compute sum^2/n for each group and sum
+    j = np.insert(np.cumsum(n), 0, 0)
+    ssbn = 0
+    for i in range(na):
+        ssbn += square_of_sums(ranked[j[i]:j[i+1]]) / float(n[i]) 
+
+    totaln = np.sum(n)
+    h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
+    df = na - 1
     h = h / float(T)
-    return h, chisqprob(h,df)
+    return h, chisqprob(h, df)
+
 
 
 def friedmanchisquare(*args):

diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
@@ -1873,5 +1873,65 @@ def test_basic(self):
         # result in F being exactly 2.0.
         assert_equal(F, 2.0)
 
+
+class TestKruskal(TestCase):
+
+    def test_simple(self):
+        """A really simple case for stats.kruskal"""
+        x = [1]
+        y = [2]
+        h, p = stats.kruskal(x, y)
+        assert_equal(h, 1.0)
+        assert_approx_equal(p, stats.chisqprob(h, 1))
+        h, p = stats.kruskal(np.array(x), np.array(y))
+        assert_equal(h, 1.0)
+        assert_approx_equal(p, stats.chisqprob(h, 1))
+
+    def test_basic(self):
+        """A basic test, with no ties."""
+        x = [1, 3, 5, 7, 9]
+        y = [2, 4, 6, 8, 10]
+        h, p = stats.kruskal(x, y)
+        assert_approx_equal(h, 3./11, significant=10)
+        assert_approx_equal(p, stats.chisqprob(3./11, 1))
+        h, p = stats.kruskal(np.array(x), np.array(y))
+        assert_approx_equal(h, 3./11, significant=10)
+        assert_approx_equal(p, stats.chisqprob(3./11, 1))
+
+    def test_simple_tie(self):
+        """A simple case with a tie."""
+        x = [1]
+        y = [1, 2]
+        h_uncorr = 1.5**2 + 2*2.25**2 - 12
+        corr = 0.75
+        expected = h_uncorr / corr   # 0.5
+        h, p = stats.kruskal(x, y)
+        # Since the expression is simple and the exact answer is 0.5, it
+        # should be safe to use assert_equal().
+        assert_equal(h, expected)
+
+    def test_another_tie(self):
+        """Another test of stats.kruskal with a tie."""
+        x = [1, 1, 1, 2]
+        y = [2, 2, 2, 2]
+        h_uncorr = (12. / 8. / 9.) * 4 * (3**2 + 6**2) - 3 * 9
+        corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8)
+        expected = h_uncorr / corr
+        h, p = stats.kruskal(x, y)
+        assert_approx_equal(h, expected)
+
+    def test_three_groups(self):
+        """A test of stats.kruskal with three groups, with ties."""
+        x = [1, 1, 1]
+        y = [2, 2, 2]
+        z = [2, 2]
+        h_uncorr = (12. / 8. / 9.) * (3*2**2 + 3*6**2 + 2*6**2) - 3 * 9  # 5.0
+        corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8)
+        expected = h_uncorr / corr  # 7.0
+        h, p = stats.kruskal(x, y, z)
+        assert_approx_equal(h, expected)
+        assert_approx_equal(p, stats.chisqprob(h, 2))
+
+
 if __name__ == "__main__":
     run_module_suite()