### mwu6Testing.ipynb

#### May 3, 2023

#### Development and Testing of function to perform
#### Mann-Whitney U non-parametric (rank-based) test (aka Wilcoxon for 2 independent groups)

#### Added group means to the output df; verified it works with group labels longer than one character
#### Added difference of group means, which in some contexts will be dPSI or log-fold-change
#### Added fdr-adjusted p-values


In [1]:
import pandas as pd
import numpy as np

#import scipy.stats

####  Test Data and Sample grouping files

In [2]:
# Load the test data matrix and the sample-grouping table.
# Both are tab-delimited; a single blank (' ') marks a missing value, and the
# 'UID' column becomes the row index.

dataf = pd.read_csv('mwu6TestingData.txt', sep='\t', na_values=' ').set_index('UID')
print(dataf)

grpdf = pd.read_csv('mwu6TestingGrps.txt', sep='\t', na_values=' ').set_index('UID')
print(grpdf)
# grpdf carries several grouping columns so that different situations can be tested.
#  Note: the grouping column that contains a NaN holds float values.


        S1  S2  S3  S4    S5    S6    S7  S8   S9  S10   S11  S12  S13  S14  \
UID                                                                           
ev01   0.0   1   2   3   4.0   5.0   6.0   7  8.0  9.0  10.0   11   12   13   
ev02   0.0   0   2   2   4.0   4.0   6.0   7  8.0  9.0  10.0   11   12   13   
ev03   0.0   1   2   3   4.0   5.0   6.0   7  NaN  NaN   NaN   11   12   13   
ev04  17.0  16  15  14  13.0  12.0  11.0  10  9.0  8.0   7.0    6    5    4   
ev05   NaN  16  15  14  13.0  12.0  11.0  10  9.0  8.0   7.0    6    5    4   
ev06   1.0   3   2   4   5.0   6.0   3.0   1  4.0  2.0   6.0    5   12   13   
ev07  17.0   0   1   2   NaN   NaN   NaN   6  7.0  8.0   9.0   10   11   12   
ev08   0.0   1   3   2   4.0   5.0   7.0   6  8.0  9.0   9.0    6    5   12   

      S15   S16   S17   S18  
UID                          
ev01   14  15.0  16.0  17.0  
ev02   14  15.0  16.0  17.0  
ev03   14  15.0  16.0  17.0  
ev04    3   2.0   1.0   0.0  
ev05    3   2.0   1.0   NaN  

### Define function to perform some data checks before mwu computation

In [3]:
# Do not run, For Reference.  Import function below.

def mwuChecks( dataf, grpdf, *args, grpvar='grp',min_group_size=6 ): 
    """Run sanity checks on the inputs before calling mwuCompute().

    Parameters
    ----------
    dataf : pandas.DataFrame
        Samples as columns and events (features) as rows.
        Event names must be the row names (index), not in a column.
        Missing values (np.nan = 'NaN') are allowed.
    grpdf : pandas.DataFrame
        Holds the group membership column (``grpvar``).
        Sample names must be the row names (index), not in a column.
        Aside from ordering, there must be a 1-1 match between
        dataf.columns and grpdf.index.
    *args
        Accepted but not used.
    grpvar : str, default 'grp'
        Name of the group membership column in grpdf.
    min_group_size : int, default 6
        Minimum group size for the Mann-Whitney test to be computed.
        6 is based on a statistical rule-of-thumb for the Mann-Whitney
        2-group test.

    Returns
    -------
    int
        Code indicating the course of action:
          0 = Problem, do not run mwuCompute
          1 = Run mwuCompute
          2 = More than 2 groups present; Mann-Whitney test not conducted

    Notes
    -----
    Group sizes are counted with value_counts(), which leaves out samples
    whose group label is missing (NaN).
    """

    # --- Check for 1-1 match between sample names provided in grpdf and the data matrix

    if set(dataf.columns) != set(grpdf.index):
        print('Group information is inconsistent with names in data matrix')
        return 0

    # --- The grouping column itself must exist (otherwise the lookup below raises KeyError)

    if grpvar not in grpdf.columns:
        print('Grouping variable not found in group information: ', grpvar)
        return 0

    # Series of counts indexed by group label, decreasing by count
    grpCounts = grpdf[grpvar].value_counts()
    nGrps = len(grpCounts)

    # -- Handle groups <> 2 --
    # This is checked before taking the minimum group size, so that a grouping
    # column with no usable labels (nGrps == 0) is reported here rather than
    # failing on the minimum of an empty set of counts.

    if nGrps < 2:
        print('Number of sample groups is < 2; Mann-Whitney test not conducted')
        return 0

    if nGrps > 2:
        print('Number of sample groups is > 2; Mann-Whitney test not conducted')
        return 2

    # -- Don't proceed if already know a group size is < minimum --

    minGrpN = grpCounts.min()
    if minGrpN < min_group_size:
        print('Mann-Whitney test not conducted: A group has fewer samples than minimum group size: ',minGrpN,' < ',min_group_size)
        return 0

    return 1

In [3]:


from mwu6 import mwuChecks



###  Establish that the mwuChecks function catches errors as intended

In [4]:

# Each scenario below works on its own copy of the grouping table, so the shared
# grpdf is left untouched for the cells that follow, and each expectation is
# asserted rather than only described in a comment.

#  Everything OK
test_OK = mwuChecks( dataf, grpdf, grpvar='grp12',min_group_size=6 )
print(test_OK)
# Should return test_OK = 1
assert test_OK == 1

#  Everything OK
test_OK = mwuChecks( dataf, grpdf, grpvar='cellType',min_group_size=6 )
print(test_OK)
# Should return test_OK = 1
assert test_OK == 1

#  Grps file index does not match sample names
grpdf_sn = grpdf.copy()
grpdf_sn.at['S1','samp'] = 'S100'
grpdf_sn = grpdf_sn.rename(index={'S1':'S100'})
#display(grpdf_sn)
test_sn = mwuChecks( dataf, grpdf_sn, grpvar='grp12',min_group_size=6 )
print(test_sn)
# Should return test_sn = 0 & inconsistent names message
assert test_sn == 0

# Test 1 group situation
grpdf_1g = grpdf.assign(grp=1)
#display(grpdf_1g)
test_1g = mwuChecks( dataf, grpdf_1g, grpvar='grp',min_group_size=6 )
print(test_1g)
# Should return test_1g = 0 & groups < 2 message
assert test_1g == 0

# Test > 2 group situation (every sample is its own group)
grpdf_ng = grpdf.assign(grp=range(len(grpdf)))
#display(grpdf_ng)
test_g = mwuChecks( dataf, grpdf_ng, grpvar='grp',min_group_size=6 )
print(test_g)
# Should return test_g = 2 & groups > 2 message
assert test_g == 2

#  A group N is < min_group_size
test_gs = mwuChecks( dataf, grpdf, grpvar='grp12',min_group_size=10 )
print(test_gs)
# Should return test_gs = 0 & fewer samples than min message
assert test_gs == 0

1
1
Group information is inconsistent with names in data matrix
0
Number of sample groups is < 2; Mann-Whitney test not conducted
0
Number of sample groups is > 2; Mann-Whitney test not conducted
2
Mann-Whitney test not conducted: A group has fewer samples than minimum group size:  9  <  10
0


In [5]:
# Check that group formation works as intended with several grouping configurations:
# integer labels, labels with a NaN (float), single-letter labels, and longer string labels.

for grouping_column in ['grp12', 'grp12NaN', 'grpAB', 'cellType']:
    groups = dataf.T.groupby(grpdf[grouping_column])
    print(groups.groups)

{1: ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17'], 2: ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}
{1.0: ['S1', 'S3', 'S5', 'S9', 'S11', 'S13', 'S15', 'S17'], 2.0: ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}
{'A': ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18'], 'B': ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17']}
{'B': ['S1', 'S3', 'S5', 'S7', 'S9', 'S11', 'S13', 'S15', 'S17'], 'CD8': ['S2', 'S4', 'S6', 'S8', 'S10', 'S12', 'S14', 'S16', 'S18']}


### Define function to compute the mwu test and collect summary stats (Ns, medians, means, difference of means, FDR-adjusted p-values)

In [20]:
# Do not run, For Reference.  Import function below.

def mwuCompute(dataf, grpdf, *args, grpvar='grp' ,min_group_size=6 ): 
    """Mann-Whitney U (rank) test for two independent groups, one test per event.

    Also known as the Wilcoxon test for 2 independent groups.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Samples as columns and events (features) as rows.
        Event names must be the row names (index), not in a column.
        Missing values (np.nan = 'NaN') are allowed.
    grpdf : pandas.DataFrame
        Holds the group membership column (``grpvar``).
        Sample names must be the row names (index), not in a column.
        Aside from ordering, there must be a 1-1 match between
        dataf.columns and grpdf.index.
    *args
        Accepted but not used.
    grpvar : str, default 'grp'
        Name of the group membership column in grpdf.
    min_group_size : int, default 6
        Minimum number of non-missing values required in EACH group for the
        test to be computed for an event.  6 is based on a statistical
        rule-of-thumb for the Mann-Whitney 2-group test.

    Returns
    -------
    pandas.DataFrame
        One row per event (even when the test was not computed for it; the
        mwu columns are then NaN), with columns:
          N_<g1>, N_<g2>            non-missing counts per group
          Median_<g1>, Median_<g2>  group medians
          Mean_<g1>, Mean_<g2>      group means
          DiffMeans_<g1>_m_<g2>     mean of g1 minus mean of g2
          mwuStat                   test statistic returned by scipy
          mwuSign                   -1 / 0 / 1, see Notes
          mwuPval                   two-sided asymptotic p-value
          mwuAdjPval                FDR-adjusted p-value (method 'indep')

    Notes
    -----
    Direction of comparison is by sorted group levels, e.g. A vs. B, 1 vs 2;
    only the first two sorted levels are used.  This function does not itself
    verify that exactly two groups exist; mwuChecks() is meant to be run first.

    mwuSign is determined by comparing the test statistic with its mean under
    the null hypothesis, n1 * n2 / 2 (not by comparing group medians or
    means), so it need not match the sign of the difference of means or
    medians.
    """

    import pandas as pd
    import numpy as np

    import scipy.stats

    import statsmodels.stats.multitest as smm

    groups = dataf.T.groupby(grpdf[grpvar])

    # Group levels present in the grouping column, in sorted order
    SgrpLevels = sorted(grpdf[grpvar].value_counts().index.tolist())
    g1Level = SgrpLevels[0]
    g2Level = SgrpLevels[1]

    # Rows = samples of the group, columns = events
    g1 = groups.get_group(g1Level)
    g2 = groups.get_group(g2Level)

    g1Label = str(g1Level)
    g2Label = str(g2Level)
    n1Col = 'N_' + g1Label
    n2Col = 'N_' + g2Label

    # count() counts the number of non-missing entries per event
    g1N = g1.count()
    g2N = g2.count()
    eventsToRun = ((g1N >= min_group_size) & (g2N >= min_group_size)).values

    g1Mean = g1.mean()
    g2Mean = g2.mean()

    # Set up df for results; mwu columns start as NaN and are filled in below
    result = pd.DataFrame(
        {
            n1Col: g1N,
            n2Col: g2N,
            ('Median_' + g1Label): g1.median(),
            ('Median_' + g2Label): g2.median(),
            ('Mean_' + g1Label): g1Mean,
            ('Mean_' + g2Label): g2Mean,
            ('DiffMeans_' + g1Label + '_m_' + g2Label): g1Mean - g2Mean,
            'mwuStat': np.nan,
            'mwuSign': np.nan,
            'mwuPval': np.nan,
            'mwuAdjPval': np.nan,
        },
        index=dataf.index )

    # Handle situation of no events meeting min_group_size criterion
    if not eventsToRun.any():
        print('Mann-Whitney test not conducted: No events meet minimum group size criterion of ',min_group_size,'.')
        return result

    # Compute the mwu test statistic & p-value for eventsToRun.
    # The inputs are 2-D (samples x events); the test runs down each column,
    # leaving out missing values event by event.
    mwu = scipy.stats.mannwhitneyu(
        g1.iloc[:, eventsToRun].values,
        g2.iloc[:, eventsToRun].values,
        alternative="two-sided",
        method="asymptotic",
        use_continuity=True,
        nan_policy='omit'
    )

    # Populate stat & p-value columns
    result.loc[eventsToRun, "mwuStat"] = mwu.statistic
    result.loc[eventsToRun, "mwuPval"] = mwu.pvalue

    # Determine sign (direction) of test result based on the expected mean of the
    # test statistic under the null, n1 * n2 / 2.  No continuity offset belongs in
    # this mean: adding 0.5 would label a statistic exactly at the null mean as -1
    # and one half a unit above it as 0.
    # NOTE--this sign need not match the sign of difference of means or medians
    nullMean = (result[n1Col] * result[n2Col]) / 2.0
    result["mwuSign"] = np.select(
        [
            (result["mwuStat"] < nullMean),
            (result["mwuStat"] == nullMean),
            (result["mwuStat"] > nullMean),
        ],
        [-1, 0, 1],
        default=np.nan )   # events without a statistic stay NaN

    # Calculate FDR of mwuPval
    # Re-create mask in case any of the calculated p-values were NaN
    eventsForFDR = (eventsToRun & (~ result['mwuPval'].isnull().values ) )

    # Do not provide alpha cutoff value to function since want adjusted p-values, not significance calls
    FDRres = smm.fdrcorrection(result.loc[eventsForFDR, "mwuPval"].to_numpy(),
                               method="indep",is_sorted=False)
    result.loc[eventsForFDR,'mwuAdjPval'] = FDRres[1]

    return result

    
    

In [6]:


from mwu6 import mwuCompute



### Test the mwuCompute function

In [7]:
# Each entry is (grouping column, min_group_size); the runs are executed and
# printed in this order.
mwu_run_settings = [
    # NaN value in grouping variable (makes values float).
    # Should run and produce NaNs for evs 03, 05 & 07
    ('grp12NaN', 8),
    # Grouping variable has integer values.
    # Should run and produce NaNs for evs 03 & 07
    ('grp12', 8),
    # Grouping variable levels are character (strings).
    # Should run and produce NaNs for evs 03 & 07
    ('grpAB', 8),
    # More complex group names, reversal of A & B above.
    # Should run, produce NaNs for evs 03 & 07, MWUsign & diffMeans signs should reverse
    ('cellType', 8),
    # Too few events.
    # Should run and produce following message:
    # Mann-Whitney test not conducted: No events meet minimum group size criterion of  11 .
    # Should return NaNs for both p-value columns
    ('grp12NaN', 11),
]

mwu_run_results = []
for run_grpvar, run_min_size in mwu_run_settings:
    run_result = mwuCompute(dataf, grpdf, grpvar=run_grpvar, min_group_size=run_min_size)
    print(run_result)
    mwu_run_results.append(run_result)

# Keep the individual names used elsewhere in the notebook
mwuRun1, mwuRun2, mwuRun3, mwuRun4, mwuRun5 = mwu_run_results



      N_1.0  N_2.0  Median_1.0  Median_2.0  Mean_1.0  Mean_2.0  \
UID                                                              
ev01      8      9         9.0         9.0  8.250000  9.000000   
ev02      8      9         9.0         9.0  8.250000  8.666667   
ev03      6      8         8.0         9.0  8.000000  9.000000   
ev04      8      9         8.0         8.0  8.750000  8.000000   
ev05      7      8         7.0         9.0  7.571429  9.000000   
ev06      8      9         5.5         5.0  7.500000  7.333333   
ev07      6      7        10.0         8.0  9.666667  7.571429   
ev08      8      9         4.5         6.0  5.625000  6.333333   

      DiffMeans_1.0_m_2.0  mwuStat  mwuSign   mwuPval  mwuAdjPval  
UID                                                                
ev01            -0.750000     33.0     -1.0  0.809894         1.0  
ev02            -0.416667     34.5     -1.0  0.923201         1.0  
ev03            -1.000000      NaN      NaN       NaN         NaN  

###  WRITE OUT RESULTS FILE

In [8]:
# Save the first run's results as a tab-delimited text file (index 'UID' included)
mwuRun1.to_csv(path_or_buf='mwuRun1.txt', sep='\t')

In [9]:
# Read the written file back in and print it for comparison with mwuRun1
Check = pd.read_csv('mwuRun1.txt', sep='\t', na_values=' ').set_index('UID')
print(Check)

      N_1.0  N_2.0  Median_1.0  Median_2.0  Mean_1.0  Mean_2.0  \
UID                                                              
ev01      8      9         9.0         9.0  8.250000  9.000000   
ev02      8      9         9.0         9.0  8.250000  8.666667   
ev03      6      8         8.0         9.0  8.000000  9.000000   
ev04      8      9         8.0         8.0  8.750000  8.000000   
ev05      7      8         7.0         9.0  7.571429  9.000000   
ev06      8      9         5.5         5.0  7.500000  7.333333   
ev07      6      7        10.0         8.0  9.666667  7.571429   
ev08      8      9         4.5         6.0  5.625000  6.333333   

      DiffMeans_1.0_m_2.0  mwuStat  mwuSign   mwuPval  mwuAdjPval  
UID                                                                
ev01            -0.750000     33.0     -1.0  0.809894         1.0  
ev02            -0.416667     34.5     -1.0  0.923201         1.0  
ev03            -1.000000      NaN      NaN       NaN         NaN  