##### mwu4Testing.ipynb

January 4, 2023

Development and Testing of function to perform
Mann-Whitney U non-parametric (rank-based) test (aka Wilcoxon for 2 independent groups)


In [1]:
import pandas as pd
import numpy as np

#import scipy.stats

####  Test Data and Sample grouping files

In [2]:
# Read in data matrix and sample group info

dataf = pd.read_table('mwu4TestingData.txt', na_values=' ').set_index('UID')
print(dataf)

grpdf = pd.read_table('mwu4TestingGrps.txt', na_values=' ').set_index('UID')
print(grpdf)
# grpdf has cols to test several things
#  Note column w/ NaN has float values


        S1  S2  S3  S4    S5    S6    S7  S8   S9  S10   S11  S12  S13  S14  \
UID                                                                           
ev01   0.0   1   2   3   4.0   5.0   6.0   7  8.0  9.0  10.0   11   12   13   
ev02   0.0   0   2   2   4.0   4.0   6.0   7  8.0  9.0  10.0   11   12   13   
ev03   0.0   1   2   3   4.0   5.0   6.0   7  NaN  NaN   NaN   11   12   13   
ev04  17.0  16  15  14  13.0  12.0  11.0  10  9.0  8.0   7.0    6    5    4   
ev05   NaN  16  15  14  13.0  12.0  11.0  10  9.0  8.0   7.0    6    5    4   
ev06   1.0   3   2   4   5.0   6.0   3.0   1  4.0  2.0   6.0    5   12   13   
ev07  17.0   0   1   2   NaN   NaN   NaN   6  7.0  8.0   9.0   10   11   12   
ev08   0.0   1   3   2   4.0   5.0   7.0   6  8.0  9.0   9.0    6    5   12   

      S15   S16   S17   S18  
UID                          
ev01   14  15.0  16.0  17.0  
ev02   14  15.0  16.0  17.0  
ev03   14  15.0  16.0  17.0  
ev04    3   2.0   1.0   0.0  
ev05    3   2.0   1.0   NaN  

### Define function to perform some data checks before mwu computation

In [30]:
def mwuChecks( dataf, grpdf, *args, grpvar='grp',min_group_size=6 ): 
#
#   This function performs some checks before calling mwuCompute().    
#
#   dataf is a pandas data frame with samples as columns and events (features) as rows.  
#   Event names must be the row names (index), not in a column.
#   Missing values (np.nan='NaN') are allowed
#
#   grpdf is a pandas data frame with a group membership column ('grp' is the default ).
#   Sample names must be the row names (index), not in a column.
#   Aside from ordering, must be 1-1 match between dataf.columns and grpdf.index
#
#   min_group_size is minimum group size for kw test to be computed.
#   min_group_size = 6 is based on statistical rule-of-thumb for Mann-Whitney 2-group test.
#
#  Returns integer to indicate course of action:
#     0 = Problem, do not run mwuCompute
#     1 = Run mwuCompute


    import pandas as pd
    import numpy as np

    
#    print(grpvar)
#    print(type(grpvar))
    
    # --- Check for 1-1 match between sample names provided in grpdf and the data matrix

    if set(dataf.columns)!= set(grpdf.index) :
        print('Group information is inconsistent with names in data matrix')
        return( 0 )
  
        
    grpCounts = grpdf[grpvar].value_counts().to_frame()  
# returns a pandas df of counts, indexed by grp, decreasing by count
    grpCounts.columns=['grpRawN']
    nGrps = grpCounts.shape[0]
    minGrpN = min( grpCounts['grpRawN'] )
    # print('nGrps, minGrpN',nGrps,minGrpN)
    
    
    # -- Handle groups <> 2 --

    if nGrps < 2 :
        print('Number of sample groups is < 2; Mann-Whitney test not conducted')
        return( 0 )
        
    if nGrps > 2 :
        print('Number of sample groups is > 2; Mann-Whitney test not conducted')
        return( 2 )
    
    # -- Don't proceed if already know a group size is < minimum --   

    if minGrpN < min_group_size:
        print('Mann-Whitney test not conducted: A group has fewer samples than minimum group size: ',minGrpN,' < ',min_group_size)
        return( 0 )
    
    return( 1 )

####  Establish the mwuChecks function catches errors as intended

In [35]:

#  Everything OK
test_OK = mwuChecks( dataf, grpdf, grpvar='grp12',min_group_size=6 )
print(test_OK)
# Should return test_OK = 1 

#  Grps file index not match sample names
grpdf_sn = grpdf.copy()
grpdf_sn.at['S1','samp'] = 'S100'
grpdf_sn.rename(index={'S1':'S100'},inplace=True)
#display(grpdf_sn)
test_sn = mwuChecks( dataf, grpdf_sn, grpvar='grp12',min_group_size=6 )
print(test_sn)
# Should return test_sn = 0 & inconsistent names message

# Test 1 group situation
grpdf['grp'] = 1
#display(grpdf)
test_1g = mwuChecks( dataf, grpdf, grpvar='grp',min_group_size=6 )
print(test_1g)
# Should return test_1g = 0 & groups < 2 message

# Test > 2 group situation
grpdf['grp'] = range(0,18)
#display(grpdf)
test_g = mwuChecks( dataf, grpdf, grpvar='grp',min_group_size=6 )
print(test_g)
# Should return test_1g = 0 & groups > 2 message

#  A group N is < min_group_size
test_gs = mwuChecks( dataf, grpdf, grpvar='grp12',min_group_size=10 )
print(test_gs)
# Should returntest_gs == 0  & fewer samples than min message

1
Group information is inconsistent with names in data matrix
0
Number of sample groups is < 2; Mann-Whitney test not conducted
0
Number of sample groups is > 2; Mann-Whitney test not conducted
2
Mann-Whitney test not conducted: A group has fewer samples than minimum group size:  9  <  10
0


In [36]:
# Check group formation works as intened with several grouping configurations

groups = dataf.T.groupby(grpdf['grp12'])
print(groups.groups)

groups = dataf.T.groupby(grpdf['grp12NaN'])
print(groups.groups)
#

groups = dataf.T.groupby(grpdf['grpAB'])
print(groups.groups)

{1: ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17'], 2: ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}
{1.0: ['S1', 'S3', 'S5', 'S9', 'S11', 'S13', 'S15', 'S17'], 2.0: ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}
{'A': ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18'], 'B': ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17']}


### Define function to compute the mwu test and collect summary stats (Ns, medians)

In [108]:
def mwuCompute(dataf, grpdf, *args, grpvar='grp' ,min_group_size=6 ): 
    #
    #   This function performs the Mann-Whitney NP (rank) two independent groups test, aka Wilcoxon 2 grp indep test.    
    #
    #   dataf is a pandas data frame with samples as columns and events (features) as rows.  
    #   Event names must be the row names (index), not in a column.
    #   Missing values (np.nan='NaN') are allowed
    #
    #   grpdf is a pandas data frame with a group membership column.
    #   Sample names must be the row names (index), not in a column.
    #   Aside from ordering, must be 1-1 match between dataf.columns and grpdf.index
    #
    #   min_group_size is minimum group size for kw test to be computed.
    #   6 is based on statistical rule-of-thumb for Mann-Whitney 2-group test.
    #
    #   Returns pandas dataframe with columns containing group Ns (xNaN) & medians, value of mwu test statistic, and p-value.
    #   Sign (direction) is determined by comparing test statistic value to the null mean (not comparing group medians)
    #   Returned df has row for each event, even if test was not computed (in this case mwu values will be NaN)
    
    import pandas as pd
    import numpy as np

    import scipy.stats
    
    
    groups = dataf.T.groupby(grpdf[grpvar])
#    print(groups.groups)

    grpCounts = grpdf[grpvar].value_counts().to_frame()
    display(grpCounts)
    
    grpLevels = grpCounts.index
    print(grpLevels)
    print(type(grpLevels))
    grpLevels = grpLevels.tolist()
    print(grpLevels)
    print(type(grpLevels))
    SgrpLevels = sorted(grpLevels)
    print("SgrpLevels",SgrpLevels)
    print(type(SgrpLevels))
    
    g1Level = SgrpLevels[0]
    g2Level = SgrpLevels[1]
    print('g1Level',g1Level)
    print('g2Level',g2Level)
#    ints = all([isinstance(item, int) for item in grpLevels])  
#    floats = all([isinstance(item, float) for item in grpLevels])
#    strings = all([isinstance(item, str) for item in grpLevels])
    
#    grpLevels = [s.strip("'") for s in grpLevels]
#    print(grpLevels)
#    grpLevels = grpLevels.sort()
#    print(grpLevels)

    g1 = groups.get_group(g1Level)
    g2 = groups.get_group(g2Level)
    print('g1 dat',g1)
    print('g2 dat',g2)
#   count() method counts number of non-missing entries
#    eventsToRun = ((g1.count() >= min_group_size) & (g2.count() >= min_group_size)).values
#    print(eventsToRun)
#    print(sum(eventsToRun))
#    print(g1.iloc[:,eventsToRun])
#    print(g2.iloc[:,eventsToRun])

#   Set up df for results
#    result = pd.DataFrame(
#    {
#        "n1": g1.count(),
#        "n2": g2.count(),
#        "median1": g1.median(),
#        "median2": g2.median()
#    },
#    index=dataf.index)

#   Handle situation of no events meeting min_group_size criterion

#    if sum(eventsToRun) == 0:
#        print('Mann-Whitney test not conducted: No events meet minimum group size criterion.')
#        return( result)

#   Compute the mwu test statistic & p-value for eventsToRun
    
#    mwu = scipy.stats.mannwhitneyu(
#    g1.iloc[:,eventsToRun].values, 
#    g2.iloc[:,eventsToRun].values,
#    alternative="two-sided",
#    method="asymptotic",
#    use_continuity=True,
#    nan_policy='omit'
#    )  
#    print(mwu)

#   Populate stat & p-value columns    
#    result.loc[eventsToRun, "MWUstat"] = mwu.statistic
#    result.loc[eventsToRun, "MWUpval"] = mwu.pvalue
#    display(result)

#   Determine sign (direction) of test result based on expected mean of test stat under null
#    nullMean = (result['n1'] * result['n2']) / 2.0 + 0.5
#    result["MWUsign"] = np.select(
#    [
#        (result["MWUstat"] < nullMean),
#        (result["MWUstat"] == nullMean),
#        (result["MWUstat"] > nullMean),
#    ],
#    [-1, 0, 1],
#    default=np.nan )
#    display(result)

    return( 'X' )

    
    

In [None]:
def mwuCompute(dataf, grpdf, *args, grpvar='grp' ,min_group_size=6 ): 
    #
    #   This function performs the Mann-Whitney NP (rank) two independent groups test, aka Wilcoxon 2 grp indep test.    
    #
    #   dataf is a pandas data frame with samples as columns and events (features) as rows.  
    #   Event names must be the row names (index), not in a column.
    #   Missing values (np.nan='NaN') are allowed
    #
    #   grpdf is a pandas data frame with a group membership column.
    #   Sample names must be the row names (index), not in a column.
    #   Aside from ordering, must be 1-1 match between dataf.columns and grpdf.index
    #
    #   min_group_size is minimum group size for kw test to be computed.
    #   6 is based on statistical rule-of-thumb for Mann-Whitney 2-group test.
    #
    #   Returns pandas dataframe with columns containing group Ns (xNaN) & medians, value of mwu test statistic, and p-value.
    #   Sign (direction) is determined by comparing test statistic value to the null mean (not comparing group medians)
    #   Returned df has row for each event, even if test was not computed (in this case mwu values will be NaN)
    
    import pandas as pd
    import numpy as np

    import scipy.stats
    
    # Set up little df with grp info, for group Ns and medians work
    
    grpCounts = grpdf['grp'].value_counts().to_frame()  # returns a pandas df of counts, indexed by grp, decreasing by count
    grpCounts.columns=['grpRawN']
    grpCounts['grpID']=grpCounts.index
    grpCounts = grpCounts.sort_index()
    # print(grpCounts) 
    nGrps = grpCounts.shape[0]

    # Compute group N and median for each event, in blocks by group, right-side joining as go. 
    #. Accumulate in resdf
    gindex = 0
    for gID in grpCounts.index:
        gindex = gindex + 1
        gSamps = grpdf.loc[grpdf['grp'] == gID, 'samp'].tolist()
    #    print(gSamps)
        gdf=dataf[gSamps]
    #    display(gdf)
    #
        meds = np.nanmedian(gdf,axis=1)
    #    print(meds)

        okVals = np.sum(~np.isnan(gdf), axis=1)
    #    print(okVals)

        tempdf=pd.DataFrame(zip(okVals,meds),index=okVals.index)
        tempdf.columns=[('N_' + str(gID)),('Median_' + str(gID))]
        if gindex == 1:
            resdf = tempdf.copy()
        else:
            resdf = resdf.set_index(resdf.index).join(tempdf)
 #       display(resdf)
# display(resdf) 
#    return( resdf )

#    Add columns for the kw statistic & p-value 
    
    resdf['mwuStat'], resdf['mwuPval'] = [ np.nan, np.nan ]
    
    display()
    
    

### Test the mwuCompute function

In [109]:
mwuRun = mwuCompute(dataf, grpdf, grpvar='grp12NaN' ,min_group_size=8 )
print(mwuRun)
# Should run and produce 
#      n1  n2  median1  median2  MWUstat   MWUpval  MWUsign
#UID                                                       
#ev01   8   9      9.0      9.0     33.0  0.809894     -1.0
#ev02   8   9      9.0      9.0     34.5  0.923201     -1.0
#ev03   6   8      8.0      9.0      NaN       NaN      NaN
#ev04   8   9      8.0      8.0     39.0  0.809894      1.0
#ev05   7   8      7.0      9.0      NaN       NaN      NaN
#ev06   8   9      5.5      5.0     36.5  1.000000      0.0
#ev07   6   7     10.0      8.0      NaN       NaN      NaN
#ev08   8   9      4.5      6.0     33.0  0.809320     -1.0

mwuRun = mwuCompute(dataf, grpdf, grpvar='grpAB' ,min_group_size=8 )
print(mwuRun)


#  Too few events
mwuRun = mwuCompute(dataf, grpdf, grpvar='grp12NaN' ,min_group_size=11 )
print(mwuRun)
# Should run, output error message, and return group Ns and medians

Unnamed: 0,grp12NaN
2.0,9
1.0,8


Float64Index([2.0, 1.0], dtype='float64')
<class 'pandas.core.indexes.numeric.Float64Index'>
[2.0, 1.0]
<class 'list'>
SgrpLevels [1.0, 2.0]
<class 'list'>
g1Level 1.0
g2Level 2.0
g1 dat UID  ev01  ev02  ev03  ev04  ev05  ev06  ev07  ev08
S1    0.0   0.0   0.0  17.0   NaN   1.0  17.0   0.0
S3    2.0   2.0   2.0  15.0  15.0   2.0   1.0   3.0
S5    4.0   4.0   4.0  13.0  13.0   5.0   NaN   4.0
S9    8.0   8.0   NaN   9.0   9.0   4.0   7.0   8.0
S11  10.0  10.0   NaN   7.0   7.0   6.0   9.0   9.0
S13  12.0  12.0  12.0   5.0   5.0  12.0  11.0   5.0
S15  14.0  14.0  14.0   3.0   3.0  14.0  13.0  13.0
S17  16.0  16.0  16.0   1.0   1.0  16.0   NaN   3.0
g2 dat UID  ev01  ev02  ev03  ev04  ev05  ev06  ev07  ev08
S2    1.0   0.0   1.0  16.0  16.0   3.0   0.0   1.0
S4    3.0   2.0   3.0  14.0  14.0   4.0   2.0   2.0
S6    5.0   4.0   5.0  12.0  12.0   6.0   NaN   5.0
S8    7.0   7.0   7.0  10.0  10.0   1.0   6.0   6.0
S10   9.0   9.0   NaN   8.0   8.0   2.0   8.0   9.0
S12  11.0  11.0  11.0   6.

Unnamed: 0,grpAB
B,9
A,9


Index(['B', 'A'], dtype='object')
<class 'pandas.core.indexes.base.Index'>
['B', 'A']
<class 'list'>
SgrpLevels ['A', 'B']
<class 'list'>
g1Level A
g2Level B
g1 dat UID  ev01  ev02  ev03  ev04  ev05  ev06  ev07  ev08
S2    1.0   0.0   1.0  16.0  16.0   3.0   0.0   1.0
S4    3.0   2.0   3.0  14.0  14.0   4.0   2.0   2.0
S6    5.0   4.0   5.0  12.0  12.0   6.0   NaN   5.0
S8    7.0   7.0   7.0  10.0  10.0   1.0   6.0   6.0
S10   9.0   9.0   NaN   8.0   8.0   2.0   8.0   9.0
S12  11.0  11.0  11.0   6.0   6.0   5.0  10.0   6.0
S14  13.0  13.0  13.0   4.0   4.0  13.0  12.0  12.0
S16  15.0  15.0  15.0   2.0   2.0  15.0   NaN  14.0
S18  17.0  17.0  17.0   0.0   NaN  17.0  15.0   2.0
g2 dat UID  ev01  ev02  ev03  ev04  ev05  ev06  ev07  ev08
S1    0.0   0.0   0.0  17.0   NaN   1.0  17.0   0.0
S3    2.0   2.0   2.0  15.0  15.0   2.0   1.0   3.0
S5    4.0   4.0   4.0  13.0  13.0   5.0   NaN   4.0
S7    6.0   6.0   6.0  11.0  11.0   3.0   NaN   7.0
S9    8.0   8.0   NaN   9.0   9.0   4.0   7.0   

Unnamed: 0,grp12NaN
2.0,9
1.0,8


Float64Index([2.0, 1.0], dtype='float64')
<class 'pandas.core.indexes.numeric.Float64Index'>
[2.0, 1.0]
<class 'list'>
SgrpLevels [1.0, 2.0]
<class 'list'>
g1Level 1.0
g2Level 2.0
g1 dat UID  ev01  ev02  ev03  ev04  ev05  ev06  ev07  ev08
S1    0.0   0.0   0.0  17.0   NaN   1.0  17.0   0.0
S3    2.0   2.0   2.0  15.0  15.0   2.0   1.0   3.0
S5    4.0   4.0   4.0  13.0  13.0   5.0   NaN   4.0
S9    8.0   8.0   NaN   9.0   9.0   4.0   7.0   8.0
S11  10.0  10.0   NaN   7.0   7.0   6.0   9.0   9.0
S13  12.0  12.0  12.0   5.0   5.0  12.0  11.0   5.0
S15  14.0  14.0  14.0   3.0   3.0  14.0  13.0  13.0
S17  16.0  16.0  16.0   1.0   1.0  16.0   NaN   3.0
g2 dat UID  ev01  ev02  ev03  ev04  ev05  ev06  ev07  ev08
S2    1.0   0.0   1.0  16.0  16.0   3.0   0.0   1.0
S4    3.0   2.0   3.0  14.0  14.0   4.0   2.0   2.0
S6    5.0   4.0   5.0  12.0  12.0   6.0   NaN   5.0
S8    7.0   7.0   7.0  10.0  10.0   1.0   6.0   6.0
S10   9.0   9.0   NaN   8.0   8.0   2.0   8.0   9.0
S12  11.0  11.0  11.0   6.