Skip to content

Commit

Permalink
BUG: stats: Merge pull request scipy#87 (handle array_like in kruskal…
Browse files Browse the repository at this point in the history
…, improve code in kruskal and f_oneway)
  • Loading branch information
WarrenWeckesser committed Oct 7, 2011
2 parents 0496569 + 9807ac1 commit ea3d301
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 32 deletions.
64 changes: 32 additions & 32 deletions scipy/stats/stats.py
Expand Up @@ -2152,26 +2152,25 @@ def f_oneway(*args):
.. [2] Heiman, G.W. Research Methods in Statistics. 2002.
"""
na = len(args) # ANOVA on 'na' groups, each in it's own array
tmp = map(np.array,args)
args = map(np.asarray, args) # convert to an numpy array
na = len(args) # ANOVA on 'na' groups, each in it's own array
alldata = np.concatenate(args)
bign = len(alldata)
sstot = ss(alldata)-(square_of_sums(alldata)/float(bign))
sstot = ss(alldata) - (square_of_sums(alldata) / float(bign))
ssbn = 0
for a in args:
ssbn = ssbn + square_of_sums(array(a))/float(len(a))
ssbn = ssbn - (square_of_sums(alldata)/float(bign))
sswn = sstot-ssbn
dfbn = na-1
ssbn += square_of_sums(a) / float(len(a))
ssbn -= (square_of_sums(alldata) / float(bign))
sswn = sstot - ssbn
dfbn = na - 1
dfwn = bign - na
msb = ssbn/float(dfbn)
msw = sswn/float(dfwn)
f = msb/msw
prob = fprob(dfbn,dfwn,f)
msb = ssbn / float(dfbn)
msw = sswn / float(dfwn)
f = msb / msw
prob = fprob(dfbn, dfwn, f)
return f, prob



def pearsonr(x, y):
"""Calculates a Pearson correlation coefficient and the p-value for testing
non-correlation.
Expand Down Expand Up @@ -3548,30 +3547,31 @@ def kruskal(*args):
.. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
"""
if len(args) < 2:
args = map(np.asarray, args) # convert to a numpy array
na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array
if na < 2:
raise ValueError("Need at least two groups in stats.kruskal()")
n = map(len,args)
all = []
for i in range(len(args)):
all.extend(args[i].tolist())
ranked = list(rankdata(all))
T = tiecorrect(ranked)
args = list(args)
for i in range(len(args)):
args[i] = ranked[0:n[i]]
del ranked[0:n[i]]
rsums = []
for i in range(len(args)):
rsums.append(np.sum(args[i],axis=0)**2)
rsums[i] = rsums[i] / float(n[i])
ssbn = np.sum(rsums,axis=0)
totaln = np.sum(n,axis=0)
h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
df = len(args) - 1
n = np.asarray(map(len, args))

alldata = np.concatenate(args)

ranked = rankdata(alldata) # Rank the data
T = tiecorrect(ranked) # Correct for ties
if T == 0:
raise ValueError('All numbers are identical in kruskal')

# Compute sum^2/n for each group and sum
j = np.insert(np.cumsum(n), 0, 0)
ssbn = 0
for i in range(na):
ssbn += square_of_sums(ranked[j[i]:j[i+1]]) / float(n[i])

totaln = np.sum(n)
h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
df = na - 1
h = h / float(T)
return h, chisqprob(h,df)
return h, chisqprob(h, df)



def friedmanchisquare(*args):
Expand Down
60 changes: 60 additions & 0 deletions scipy/stats/tests/test_stats.py
Expand Up @@ -1873,5 +1873,65 @@ def test_basic(self):
# result in F being exactly 2.0.
assert_equal(F, 2.0)


class TestKruskal(TestCase):

def test_simple(self):
"""A really simple case for stats.kruskal"""
x = [1]
y = [2]
h, p = stats.kruskal(x, y)
assert_equal(h, 1.0)
assert_approx_equal(p, stats.chisqprob(h, 1))
h, p = stats.kruskal(np.array(x), np.array(y))
assert_equal(h, 1.0)
assert_approx_equal(p, stats.chisqprob(h, 1))

def test_basic(self):
"""A basic test, with no ties."""
x = [1, 3, 5, 7, 9]
y = [2, 4, 6, 8, 10]
h, p = stats.kruskal(x, y)
assert_approx_equal(h, 3./11, significant=10)
assert_approx_equal(p, stats.chisqprob(3./11, 1))
h, p = stats.kruskal(np.array(x), np.array(y))
assert_approx_equal(h, 3./11, significant=10)
assert_approx_equal(p, stats.chisqprob(3./11, 1))

def test_simple_tie(self):
"""A simple case with a tie."""
x = [1]
y = [1, 2]
h_uncorr = 1.5**2 + 2*2.25**2 - 12
corr = 0.75
expected = h_uncorr / corr # 0.5
h, p = stats.kruskal(x, y)
# Since the expression is simple and the exact answer is 0.5, it
# should be safe to use assert_equal().
assert_equal(h, expected)

def test_another_tie(self):
"""Another test of stats.kruskal with a tie."""
x = [1, 1, 1, 2]
y = [2, 2, 2, 2]
h_uncorr = (12. / 8. / 9.) * 4 * (3**2 + 6**2) - 3 * 9
corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8)
expected = h_uncorr / corr
h, p = stats.kruskal(x, y)
assert_approx_equal(h, expected)

def test_three_groups(self):
"""A test of stats.kruskal with three groups, with ties."""
x = [1, 1, 1]
y = [2, 2, 2]
z = [2, 2]
h_uncorr = (12. / 8. / 9.) * (3*2**2 + 3*6**2 + 2*6**2) - 3 * 9 # 5.0
corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8)
expected = h_uncorr / corr # 7.0
h, p = stats.kruskal(x, y, z)
assert_approx_equal(h, expected)
assert_approx_equal(p, stats.chisqprob(h, 2))


if __name__ == "__main__":
run_module_suite()

0 comments on commit ea3d301

Please sign in to comment.