##### mwu5Testing.ipynb

#### February 2, 2023

#### Development and Testing of function to perform
#### Mann-Whitney U non-parametric (rank-based) test (aka Wilcoxon for 2 independent groups)

#### Added group means to output df, verified works with grp values w/ > 1 characters


In [13]:
import pandas as pd
import numpy as np

#import scipy.stats

####  Test Data and Sample grouping files

In [14]:
# Read in data matrix and sample group info

dataf = pd.read_table('mwu5TestingData.txt', na_values=' ').set_index('UID')
print(dataf)

grpdf = pd.read_table('mwu5TestingGrps.txt', na_values=' ').set_index('UID')
print(grpdf)
# grpdf has cols to test several things
#  Note column w/ NaN has float values


        S1  S2  S3  S4    S5    S6    S7  S8   S9  S10   S11  S12  S13  S14  \
UID                                                                           
ev01   0.0   1   2   3   4.0   5.0   6.0   7  8.0  9.0  10.0   11   12   13   
ev02   0.0   0   2   2   4.0   4.0   6.0   7  8.0  9.0  10.0   11   12   13   
ev03   0.0   1   2   3   4.0   5.0   6.0   7  NaN  NaN   NaN   11   12   13   
ev04  17.0  16  15  14  13.0  12.0  11.0  10  9.0  8.0   7.0    6    5    4   
ev05   NaN  16  15  14  13.0  12.0  11.0  10  9.0  8.0   7.0    6    5    4   
ev06   1.0   3   2   4   5.0   6.0   3.0   1  4.0  2.0   6.0    5   12   13   
ev07  17.0   0   1   2   NaN   NaN   NaN   6  7.0  8.0   9.0   10   11   12   
ev08   0.0   1   3   2   4.0   5.0   7.0   6  8.0  9.0   9.0    6    5   12   

      S15   S16   S17   S18  
UID                          
ev01   14  15.0  16.0  17.0  
ev02   14  15.0  16.0  17.0  
ev03   14  15.0  16.0  17.0  
ev04    3   2.0   1.0   0.0  
ev05    3   2.0   1.0   NaN  

### Define function to perform some data checks before mwu computation

In [7]:
# Do not run, For Reference.  Import function below.

def mwuChecks( dataf, grpdf, *args, grpvar='grp',min_group_size=6 ): 
#
#   This function performs some checks before calling mwuCompute().    
#
#   dataf is a pandas data frame with samples as columns and events (features) as rows.  
#   Event names must be the row names (index), not in a column.
#   Missing values (np.nan='NaN') are allowed
#
#   grpdf is a pandas data frame with a group membership column ('grp' is the default ).
#   Sample names must be the row names (index), not in a column.
#   Aside from ordering, must be 1-1 match between dataf.columns and grpdf.index
#
#   min_group_size is minimum group size for kw test to be computed.
#   min_group_size = 6 is based on statistical rule-of-thumb for Mann-Whitney 2-group test.
#
#  Returns integer to indicate course of action:
#     0 = Problem, do not run mwuCompute
#     1 = Run mwuCompute


    import pandas as pd
    import numpy as np

    
#    print(grpvar)
#    print(type(grpvar))
    
    # --- Check for 1-1 match between sample names provided in grpdf and the data matrix

    if set(dataf.columns)!= set(grpdf.index) :
        print('Group information is inconsistent with names in data matrix')
        return( 0 )
  
        
    grpCounts = grpdf[grpvar].value_counts().to_frame()  
# returns a pandas df of counts, indexed by grp, decreasing by count
    grpCounts.columns=['grpRawN']
    nGrps = grpCounts.shape[0]
    minGrpN = min( grpCounts['grpRawN'] )
    # print('nGrps, minGrpN',nGrps,minGrpN)
    
    
    # -- Handle groups <> 2 --

    if nGrps < 2 :
        print('Number of sample groups is < 2; Mann-Whitney test not conducted')
        return( 0 )
        
    if nGrps > 2 :
        print('Number of sample groups is > 2; Mann-Whitney test not conducted')
        return( 2 )
    
    # -- Don't proceed if already know a group size is < minimum --   

    if minGrpN < min_group_size:
        print('Mann-Whitney test not conducted: A group has fewer samples than minimum group size: ',minGrpN,' < ',min_group_size)
        return( 0 )
    
    return( 1 )

In [15]:


from mwu5 import mwuChecks



###  Establish the mwuChecks function catches errors as intended

In [17]:

#  Everything OK
test_OK = mwuChecks( dataf, grpdf, grpvar='grp12',min_group_size=6 )
print(test_OK)
# Should return test_OK = 1 

#  Everything OK
test_OK = mwuChecks( dataf, grpdf, grpvar='cellType',min_group_size=6 )
print(test_OK)
# Should return test_OK = 1

#  Grps file index not match sample names
grpdf_sn = grpdf.copy()
grpdf_sn.at['S1','samp'] = 'S100'
grpdf_sn.rename(index={'S1':'S100'},inplace=True)
#display(grpdf_sn)
test_sn = mwuChecks( dataf, grpdf_sn, grpvar='grp12',min_group_size=6 )
print(test_sn)
# Should return test_sn = 0 & inconsistent names message

# Test 1 group situation
grpdf['grp'] = 1
#display(grpdf)
test_1g = mwuChecks( dataf, grpdf, grpvar='grp',min_group_size=6 )
print(test_1g)
# Should return test_1g = 0 & groups < 2 message

# Test > 2 group situation
grpdf['grp'] = range(0,18)
#display(grpdf)
test_g = mwuChecks( dataf, grpdf, grpvar='grp',min_group_size=6 )
print(test_g)
# Should return test_1g = 0 & groups > 2 message

#  A group N is < min_group_size
test_gs = mwuChecks( dataf, grpdf, grpvar='grp12',min_group_size=10 )
print(test_gs)
# Should returntest_gs == 0  & fewer samples than min message

1
1
Group information is inconsistent with names in data matrix
0
Number of sample groups is < 2; Mann-Whitney test not conducted
0
Number of sample groups is > 2; Mann-Whitney test not conducted
2
Mann-Whitney test not conducted: A group has fewer samples than minimum group size:  9  <  10
0


In [16]:
# Check group formation works as intened with several grouping configurations

groups = dataf.T.groupby(grpdf['grp12'])
print(groups.groups)

groups = dataf.T.groupby(grpdf['grp12NaN'])
print(groups.groups)
#

groups = dataf.T.groupby(grpdf['grpAB'])
print(groups.groups)

groups = dataf.T.groupby(grpdf['cellType'])
print(groups.groups)

{1: ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17'], 2: ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}
{1.0: ['S1', 'S3', 'S5', 'S9', 'S11', 'S13', 'S15', 'S17'], 2.0: ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}
{'A': ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18'], 'B': ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17']}
{'B': ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17'], 'CD8': ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}


### Define function to compute the mwu test and collect summary stats (Ns, medians)

In [11]:
# Do not run, For Reference.  Import function below.

def mwuCompute(dataf, grpdf, *args, grpvar='grp' ,min_group_size=6 ): 
    #
    #   This function performs the Mann-Whitney NP (rank) two independent groups test, aka Wilcoxon 2 grp indep test.    
    #
    #   dataf is a pandas data frame with samples as columns and events (features) as rows.  
    #   Event names must be the row names (index), not in a column.
    #   Missing values (np.nan='NaN') are allowed
    #
    #   grpdf is a pandas data frame with a group membership column.
    #   Sample names must be the row names (index), not in a column.
    #   Aside from ordering, must be 1-1 match between dataf.columns and grpdf.index
    #
    #   min_group_size is minimum group size for mwu test to be computed.
    #   6 is based on statistical rule-of-thumb for Mann-Whitney 2-group test.
    #
    #   Direction of comparison is by sorted grp levels, e.g. A vs. B, 1 vs 2.
    #   Sign of result is determined by comparing test statistic value to the null mean (not comparing group medians or means)
    #
    #   Returns pandas dataframe with columns containing group Ns (xNaN) & medians, value of mwu test statistic, and p-value.
    #   Returned df has row for each event, even if test was not computed (in this case mwu values will be NaN)
    
    import pandas as pd
    import numpy as np

    import scipy.stats
    
    
    groups = dataf.T.groupby(grpdf[grpvar])
#    print(groups.groups)

    grpCounts = grpdf[grpvar].value_counts().to_frame()
#    display(grpCounts)
    
    grpLevels = grpCounts.index
    grpLevels = grpLevels.tolist()
#    print(grpLevels)
    SgrpLevels = sorted(grpLevels)
#    print("SgrpLevels",SgrpLevels)

    
    g1Level = SgrpLevels[0]
    g2Level = SgrpLevels[1]
#    print('g1Level',g1Level)
#    print('g2Level',g2Level)

    g1 = groups.get_group(g1Level)
    g2 = groups.get_group(g2Level)
#    print('g1 dat',g1)
#    print('g2 dat',g2)
#   count() method counts number of non-missing entries
    eventsToRun = ((g1.count() >= min_group_size) & (g2.count() >= min_group_size)).values
#    print('eventsToRun, sum',eventsToRun,sum(eventsToRun))
#    print(g1.iloc[:,eventsToRun])
#    print(g2.iloc[:,eventsToRun])

#   Set up df for results
# tempdf.columns=[('N_' + str(gID)),('Median_' + str(gID))]

    result = pd.DataFrame(
    {
        ('N_' + str(g1Level)): g1.count(),
        ('N_' + str(g2Level)): g2.count(),
        ('Median_' + str(g1Level)): g1.median(),
        ('Median_' + str(g2Level)): g2.median(),
        ('Mean_' + str(g1Level)): g1.mean(),
        ('Mean_' + str(g2Level)): g2.mean(),
        'mwuStat': np.nan,
        'mwuPval': np.nan,
        'mwuSign': np.nan
    },
        index=dataf.index )
    
#    display(result)

#   Handle situation of no events meeting min_group_size criterion

    if sum(eventsToRun) == 0:
        print('Mann-Whitney test not conducted: No events meet minimum group size criterion of ',min_group_size,'.')
        return( result)

#   Compute the mwu test statistic & p-value for eventsToRun
    
    mwu = scipy.stats.mannwhitneyu(
    g1.iloc[:,eventsToRun].values, 
    g2.iloc[:,eventsToRun].values,
    alternative="two-sided",
    method="asymptotic",
    use_continuity=True,
    nan_policy='omit'
    )  
#    print(mwu)

#   Populate stat & p-value columns    
    result.loc[eventsToRun, "mwuStat"] = mwu.statistic
    result.loc[eventsToRun, "mwuPval"] = mwu.pvalue
#    display(result)

#   Determine sign (direction) of test result based on expected mean of test stat under null
    nullMean = (result[('N_' + str(g1Level))] * result[('N_' + str(g2Level))]) / 2.0 + 0.5
#    print(nullMean)
    result["mwuSign"] = np.select(
    [
        (result["mwuStat"] < nullMean),
        (result["mwuStat"] == nullMean),
        (result["mwuStat"] > nullMean),
    ],
    [-1, 0, 1],
    default=np.nan )

#    display(result)

    return( result)

    
    

In [18]:


from mwu5 import mwuCompute



### Test the mwuCompute function

In [19]:
# NaN value in grouping variable  (makes values float)
mwuRun = mwuCompute(dataf, grpdf, grpvar='grp12NaN' ,min_group_size=8 )
print(mwuRun)
# Should run and produce 
#      n1  n2  median1  median2  MWUstat   MWUpval  MWUsign
#UID                                                       
#ev01   8   9      9.0      9.0     33.0  0.809894     -1.0
#ev02   8   9      9.0      9.0     34.5  0.923201     -1.0
#ev03   6   8      8.0      9.0      NaN       NaN      NaN
#ev04   8   9      8.0      8.0     39.0  0.809894      1.0
#ev05   7   8      7.0      9.0      NaN       NaN      NaN
#ev06   8   9      5.5      5.0     36.5  1.000000      0.0
#ev07   6   7     10.0      8.0      NaN       NaN      NaN
#ev08   8   9      4.5      6.0     33.0  0.809320     -1.0

# Grouping variable has integer values
mwuRun = mwuCompute(dataf, grpdf, grpvar='grp12' ,min_group_size=8 )
print(mwuRun)
# Should run and produce
#      N_1  N_2  Median_1  Median_2  mwuStat   mwuPval  MWUsign
#UID                                                           
#ev01    9    9       8.0       9.0     36.0  0.723932     -1.0
#ev02    9    9       8.0       9.0     37.5  0.825016     -1.0
#ev03    7    8       6.0       9.0      NaN       NaN      NaN
#ev04    9    9       9.0       8.0     45.0  0.723932      1.0
#ev05    8    8       8.0       9.0     28.0  0.713191     -1.0
#ev06    9    9       5.0       5.0     39.0  0.929418     -1.0
#ev07    6    7      10.0       8.0      NaN       NaN      NaN
#ev08    9    9       5.0       6.0     39.0  0.929455     -1.0

#  Grouping variable levels are character (strings)
mwuRun = mwuCompute(dataf, grpdf, grpvar='grpAB' ,min_group_size=8 )
print(mwuRun)
# Should run and produce
#      N_A  N_B  Median_A  Median_B  mwuStat   mwuPval  MWUsign
#UID                                                           
#ev01    9    9       9.0       8.0     45.0  0.723932      1.0
#ev02    9    9       9.0       8.0     43.5  0.825016      1.0
#ev03    8    7       9.0       6.0      NaN       NaN      NaN
#ev04    9    9       8.0       9.0     36.0  0.723932     -1.0
#ev05    8    8       9.0       8.0     36.0  0.713191      1.0
#ev06    9    9       5.0       5.0     42.0  0.929418      1.0
#ev07    7    6       8.0      10.0      NaN       NaN      NaN
#ev08    9    9       6.0       5.0     42.0  0.929455      1.0


#  More complex group names, reversal of A & B above
mwuRun = mwuCompute(dataf, grpdf, grpvar='cellType' ,min_group_size=8 )
print(mwuRun)
# Should run and produce
#      N_A  N_B  Median_A  Median_B  mwuStat   mwuPval  MWUsign
#UID                                                           
#ev01    9    9       9.0       8.0     45.0  0.723932      1.0
#ev02    9    9       9.0       8.0     43.5  0.825016      1.0
#ev03    8    7       9.0       6.0      NaN       NaN      NaN
#ev04    9    9       8.0       9.0     36.0  0.723932     -1.0
#ev05    8    8       9.0       8.0     36.0  0.713191      1.0
#ev06    9    9       5.0       5.0     42.0  0.929418      1.0
#ev07    7    6       8.0      10.0      NaN       NaN      NaN
#ev08    9    9       6.0       5.0     42.0  0.929455      1.0


#  Too few events
mwuRun = mwuCompute(dataf, grpdf, grpvar='grp12NaN' ,min_group_size=11 )
print(mwuRun)
#Should run and produce
#Mann-Whitney test not conducted: No events meet minimum group size criterion of  11 .
#      N_1.0  N_2.0  Median_1.0  Median_2.0  mwuStat  mwuPval  mwuSign
#UID                                                                  
#ev01      8      9         9.0         9.0      NaN      NaN      NaN
#ev02      8      9         9.0         9.0      NaN      NaN      NaN
#ev03      6      8         8.0         9.0      NaN      NaN      NaN
#ev04      8      9         8.0         8.0      NaN      NaN      NaN
#ev05      7      8         7.0         9.0      NaN      NaN      NaN
#ev06      8      9         5.5         5.0      NaN      NaN      NaN
#ev07      6      7        10.0         8.0      NaN      NaN      NaN
#ev08      8      9         4.5         6.0      NaN      NaN      NaN


      N_1.0  N_2.0  Median_1.0  Median_2.0  mwuStat   mwuPval  mwuSign
UID                                                                   
ev01      8      9         9.0         9.0     33.0  0.809894     -1.0
ev02      8      9         9.0         9.0     34.5  0.923201     -1.0
ev03      6      8         8.0         9.0      NaN       NaN      NaN
ev04      8      9         8.0         8.0     39.0  0.809894      1.0
ev05      7      8         7.0         9.0      NaN       NaN      NaN
ev06      8      9         5.5         5.0     36.5  1.000000      0.0
ev07      6      7        10.0         8.0      NaN       NaN      NaN
ev08      8      9         4.5         6.0     33.0  0.809320     -1.0
      N_1  N_2  Median_1  Median_2  mwuStat   mwuPval  mwuSign
UID                                                           
ev01    9    9       8.0       9.0     36.0  0.723932     -1.0
ev02    9    9       8.0       9.0     37.5  0.825016     -1.0
ev03    7    8       6.0       9.0    