In [1]:
import altair as alt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import statsmodels.stats.multitest

import oscutils

# Turn off Altair's row-count limit so charts can be built from larger tables.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Build one quantification table per source (requested with clean=True),
# dropping the "sample"/"FileName" columns and indexing rows by sample metadata.
inputs = {}

for source in ("mm", "pd"):
    df = (
        oscutils.load_protein_table(source, "quant", clean=True)
        .drop(columns=["sample", "FileName"])
        .set_index(["sample_type", "sample_condition", "sample_num"])
    )
    inputs[source] = df

## Use Mann-Whitney U test to test for differential expression

In [3]:
def diff_expr_test(inputs_dict, alpha=0.05, default_min_count=15, min_count_overrides=None):
    """Test every protein for healthy-vs-unhealthy differential expression.

    For each source table and each sample type, a two-sided Mann-Whitney U
    test compares the non-missing healthy and unhealthy values of every
    protein column. The resulting p-values are then Benjamini-Hochberg
    corrected once per source, pooling all sample types together.

    Parameters
    ----------
    inputs_dict : dict
        Maps a source name to a DataFrame whose index has the levels
        "sample_type" and "sample_condition" (with the conditions "healthy"
        and "unhealthy") and whose columns are proteins.
    alpha : float, default 0.05
        Family-wise level passed to ``multipletests``.
    default_min_count : int, default 15
        Minimum number of non-missing values required in *each* condition
        for a protein to be tested.
    min_count_overrides : dict or None
        Per-sample-type replacements for ``default_min_count``. ``None``
        means ``{"boost": 2, "pbulk": 2}``.

    Returns
    -------
    dict
        ``results[source]["raw_pvals"]`` has one row per
        (sample type, protein) pair with the columns "sample_type",
        "protein", "p_uncorrected" (NaN when the protein was not tested) and
        "log2_fold_change" (log2 of unhealthy mean over healthy mean).
        ``results[source]["pvals"]`` is the subset of tested rows with an
        extra "p_corrected" column; it is empty when nothing could be tested.
    """
    if min_count_overrides is None:
        min_count_overrides = {"boost": 2, "pbulk": 2}

    condition_levels = ("sample_type", "sample_condition")
    results = {}

    for source, df in inputs_dict.items():
        records = []

        for sample_type in df.index.get_level_values("sample_type").unique():
            healthy = df.xs((sample_type, "healthy"), level=condition_levels, drop_level=True)
            unhealthy = df.xs((sample_type, "unhealthy"), level=condition_levels, drop_level=True)

            # The threshold depends only on the sample type, so look it up once.
            min_count = min_count_overrides.get(sample_type, default_min_count)

            for prot in healthy.columns:
                a = healthy[prot].dropna()
                b = unhealthy[prot].dropna()

                if len(a) >= min_count and len(b) >= min_count:
                    # Be explicit about the alternative: SciPy releases before
                    # 1.7 defaulted to a halved, one-sided p-value.
                    _, p = scipy.stats.mannwhitneyu(x=a, y=b, alternative="two-sided")
                else:
                    p = np.nan

                records.append({
                    "sample_type": sample_type,
                    "protein": prot,
                    "p_uncorrected": p,
                    "log2_fold_change": np.log2(b.mean() / a.mean()),
                })

        raw_pvals = pd.DataFrame(
            records,
            columns=["sample_type", "protein", "p_uncorrected", "log2_fold_change"],
        )

        tested = raw_pvals[raw_pvals["p_uncorrected"].notna()]

        if tested.empty:
            # multipletests divides by the number of tests and fails on empty
            # input, so short-circuit when no protein met the count threshold.
            p_corrected = np.array([], dtype=float)
        else:
            _, p_corrected, _, _ = statsmodels.stats.multitest.multipletests(
                pvals=tested["p_uncorrected"],
                alpha=alpha,
                method="fdr_bh",
            )

        results[source] = {
            "pvals": tested.assign(p_corrected=p_corrected),
            "raw_pvals": raw_pvals,
        }

    return results

# Run the tests on every loaded source; the result is keyed by source name.
results = diff_expr_test(inputs)

Below are the results of the tests. Considerably more proteins showed differential expression in the Proteome Discoverer data, which is expected because it identified more proteins overall (whether all of those additional abundances are valid is another question). A future step could be to look up these proteins and see whether there is anything interesting about them in the context of the experiment.

In [4]:
# Significance threshold applied to both the uncorrected and corrected p-values.
alpha = 0.05

for source in inputs:
    tested = results[source]["pvals"]
    n_rows = results[source]["raw_pvals"].shape[0]
    significant = tested["p_corrected"] <= alpha

    print(source)
    print("Number of proteins: ", n_rows)
    print("Number of proteins with significant differential expression before multiple testing correction: ", (tested["p_uncorrected"] <= alpha).sum())
    print("Number of proteins with significant differential expression after multiple testing correction: ", significant.sum())
    print("Smallest corrected p-value: ", tested["p_corrected"].min())
    print("Proteins with significant differential expression:\n", tested[significant])
    print()

mm
Number of proteins:  4335
Number of proteins with significant differential expression before multiple testing correction:  189
Number of proteins with significant differential expression after multiple testing correction:  12
Smallest corrected p-value:  0.0004060466094914959
Proteins with significant differential expression:
      sample_type protein  p_uncorrected  log2_fold_change  p_corrected
1681      hfl1sc  P02771   2.624690e-06         -0.919627     0.001656
1878      hfl1sc  P01024   2.480848e-06         -0.634641     0.001656
2170      hfl1sc  P05455   1.964795e-04          0.657920     0.045083
2199      hfl1sc  P19823   1.485271e-06         -0.854549     0.001656
2223      hfl1sc  O43854   8.894603e-05         -0.762977     0.024944
2235      hfl1sc  P36551   2.236042e-04         -0.592636     0.047031
2366      hfl1sc  Q9P0L0   1.274057e-05          0.347244     0.005360
2394      hfl1sc  Q6UW68   1.607333e-04          0.831721     0.040569
2395      hfl1sc  P01023   9.