In [1]:
import altair as alt
import numpy as np
import oscutils
import scipy
import statsmodels.stats.multitest

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
sources = ["mm", "pd"]

In [3]:
inputs = {}

for source in sources:
    df = oscutils.\
    load_protein_table(source, "quant", clean=True).\
    drop(columns="sample").\
    set_index(["sample_type", "sample_condition", "sample_num"])
    
    inputs[source] = df

In [4]:
def get_sample_counts(inputs_dict):
    
    sources = []
    sample_types = []
    sample_conditions = []
    counts = []
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]
        for sample_type in df.index.get_level_values("sample_type").unique():
            for sample_condition in df.index.get_level_values("sample_condition").unique():
                df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)
                
                sources.append(source)
                sample_types.append(sample_type)
                sample_conditions.append(sample_condition)
                counts.append(df_sel.shape[0])
    
    return pd.DataFrame({
        "source": sources,
        "sample_type": sample_types,
        "sample_condition": sample_conditions,
        "count": counts,
    })
                
get_sample_counts(inputs)

Unnamed: 0,source,sample_type,sample_condition,count
0,mm,boost,healthy,3
1,mm,boost,unhealthy,3
2,mm,hfl1,healthy,36
3,mm,hfl1,unhealthy,35
4,mm,pbulk,healthy,3
5,mm,pbulk,unhealthy,3
6,pd,boost,healthy,3
7,pd,boost,unhealthy,3
8,pd,hfl1,healthy,36
9,pd,hfl1,unhealthy,35


In [5]:
def get_protein_coverages(inputs_dict):
    
    sources = []
    sample_types = []
    sample_conditions = []
    samples_counts = []
    proteins = []
    samples_found_counts = [] 
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]
        for sample_type in df.index.get_level_values("sample_type").unique():
            for sample_condition in df.index.get_level_values("sample_condition").unique():
                df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)
                for protein in df_sel.columns:
                    
                    sources.append(source)
                    sample_types.append(sample_type)
                    sample_conditions.append(sample_condition)
                    samples_counts.append(df_sel.shape[0])
                    proteins.append(protein)
                    samples_found_counts.append(df_sel[protein].notna().sum())
    
    coverages_df = pd.DataFrame({
        "source": sources,
        "sample_type": sample_types,
        "sample_condition": sample_conditions,
        "samples_count": samples_counts,
        "protein": proteins,
        "samples_found_count": samples_found_counts,
    })
    
    coverages_df = coverages_df.assign(
        samples_found_prop=coverages_df["samples_found_count"] / coverages_df["samples_count"]
    )
    
    return coverages_df
                
coverages = get_protein_coverages(inputs)

coverages = coverages.assign(
    source_type_condition=coverages["source"] + "_" + coverages["sample_type"] + "_" + coverages["sample_condition"],
    source_condition=coverages["source"] + "_" + coverages["sample_condition"],
)

In [6]:
alt.vconcat(*[
    alt.Chart(coverages[coverages["sample_type"] == sample_type]).mark_bar().encode(
        x=alt.X(
            "samples_found_count",
            bin=True,
        ),
        y="count()",
        color="source_condition",
    ).facet(
        facet=alt.Facet(
            "source_type_condition",
            header=alt.Header(
                labelOrient="bottom",
            ),
        ),
        columns=4
    ).resolve_scale(
        x="independent",
        y="shared",
    )
    
    for sample_type in coverages["sample_type"].unique()
])

In [7]:
def diff_expr_ttest(inputs_dict):
    
    results = {}
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]

        sample_types = []
        prots = []
        pvals = []
        fcs = []

        for sample_type in df.index.get_level_values("sample_type").unique():
            healthy = df.xs((sample_type, "healthy"), level=("sample_type", "sample_condition"), drop_level=True)
            unhealthy = df.xs((sample_type, "unhealthy"), level=("sample_type", "sample_condition"), drop_level=True)

            for prot in healthy.columns:
                a = healthy[prot].dropna()
                b = unhealthy[prot].dropna()

                if sample_type in ["boost", "pbulk"]:
                    min_count = 2
                else:
                    min_coun = 15
                    
                if len(a) >= min_count and len(b) >= min_count:
                    t, p = scipy.stats.mannwhitneyu(x=a, y=b)#, equal_var=True)
                else:
                    p = np.nan
                    
                fc = np.log2(b.mean() / a.mean())

                sample_types.append(sample_type)
                prots.append(prot)
                pvals.append(p)
                fcs.append(fc)

        raw_pvals = pd.DataFrame({
            "sample_type": sample_types,
            "protein": prots,
            "p_uncorrected": pvals,
            "log2_fold_change": fcs,
        })

        pvals = raw_pvals[raw_pvals["p_uncorrected"].notna()]

        # Correct the p values
        reject, pvals_corrected, alphacSidak, alphacBonf = statsmodels.stats.multitest.multipletests(
            pvals=pvals["p_uncorrected"].dropna(),
            alpha=0.05,
            method="fdr_bh",
        )

        pvals = pvals.assign(p_corrected=pvals_corrected) 
        
        results[source] = {
            "pvals": pvals,
            "raw_pvals": raw_pvals,
        }
    
    return results

results = diff_expr_ttest(inputs)

In [8]:
alpha = 0.05

for source in sources:
    print(source)
    print(results[source]["raw_pvals"].shape)
    print((results[source]["pvals"]["p_uncorrected"] <= alpha).sum())
    print((results[source]["pvals"]["p_corrected"] <= alpha).sum())
    print(results[source]["pvals"]["p_corrected"].min())
    print(results[source]["pvals"][results[source]["pvals"]["p_corrected"] <= alpha])
    print()

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

pd
(8694, 4)
270
22
0.004525293464946904
     sample_type protein  p_uncorrected  log2_fold_change  p_corrected
3057        hfl1  P02788   1.413384e-05         -0.616684     0.011179
3067        hfl1  P05089   2.033733e-05         -0.889077     0.012690
3081        hfl1  P01024   7.371807e-07         -1.333245     0.004525
3218        hfl1  P04114   1.612294e-05         -1.338347     0.011179
3243        hfl1  Q96TA1   1.450415e-06         -1.012678     0.004525
3260        hfl1  P54136   1.374440e-05         -0.694078     0.011179
3416        hfl1  P19823   3.056961e-05         -1.030339     0.017341
3440        hfl1  P62081   3.553998e-05         -0.903409     0.018481
3557        hfl1  Q9Y4W6   8.536390e-05         -1.319288     0.029593
3858        hfl1  Q9UHB9   5.427059e-05         -0.910543     0.021166
3861        hfl1  P02771   4.10042