In [1]:
import altair as alt
import numpy as np
import oscutils
import scipy
import statsmodels.stats.multitest

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
inputs = {}

for source in ["mm", "pd"]:
    df = oscutils.\
    load_protein_table(source, "quant", clean=True).\
    drop(columns="sample").\
    set_index(["sample_type", "sample_condition", "sample_num"])
    
    inputs[source] = df

## Sample counts

See how many samples there are in each condition, so we have an idea of what might be a good cutoff for how many samples a protein has to be found in in each condition before we do a t-test.

In [3]:
def get_sample_counts(inputs_dict):
    
    sources = []
    sample_types = []
    sample_conditions = []
    counts = []
    
    df = inputs_dict["mm"] # Doesn't matter which we use, sample counts are the same for both
    for sample_type in df.index.get_level_values("sample_type").unique():
        for sample_condition in df.index.get_level_values("sample_condition").unique():
            df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)

            sources.append(source)
            sample_types.append(sample_type)
            sample_conditions.append(sample_condition)
            counts.append(df_sel.shape[0])
    
    return pd.DataFrame({
        "sample_type": sample_types,
        "sample_condition": sample_conditions,
        "count": counts,
    })
                
get_sample_counts(inputs)

Unnamed: 0,sample_type,sample_condition,count
0,boost,healthy,3
1,boost,unhealthy,3
2,hfl1,healthy,36
3,hfl1,unhealthy,35
4,pbulk,healthy,3
5,pbulk,unhealthy,3


## Histogram of proportion of proteins with different coverages

Make a histogram showing, for each number of samples in each condition, what proportion of proteins are found in that number of samples

In [4]:
def get_protein_coverages(inputs_dict):
    
    sources = []
    sample_types = []
    sample_conditions = []
    samples_counts = []
    proteins = []
    samples_found_counts = [] 
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]
        for sample_type in df.index.get_level_values("sample_type").unique():
            for sample_condition in df.index.get_level_values("sample_condition").unique():
                df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)
                for protein in df_sel.columns:
                    
                    sources.append(source)
                    sample_types.append(sample_type)
                    sample_conditions.append(sample_condition)
                    samples_counts.append(df_sel.shape[0])
                    proteins.append(protein)
                    samples_found_counts.append(df_sel[protein].notna().sum())
    
    coverages_df = pd.DataFrame({
        "source": sources,
        "sample_type": sample_types,
        "sample_condition": sample_conditions,
        "samples_count": samples_counts,
        "protein": proteins,
        "samples_found_count": samples_found_counts,
    })
    
    coverages_df = coverages_df.assign(
        samples_found_prop=coverages_df["samples_found_count"] / coverages_df["samples_count"],
        source_type_condition=coverages_df["source"] + "_" + coverages_df["sample_type"] + "_" + coverages_df["sample_condition"],
        source_condition=coverages_df["source"] + "_" + coverages_df["sample_condition"],
    )
    
    return coverages_df
                
coverages = get_protein_coverages(inputs)

In [5]:
alt.vconcat(*[
    alt.hconcat(*[
        alt.Chart(coverages[
            (coverages["sample_type"] == sample_type) &
            (coverages["source_condition"] == source_condition)
        ]).transform_joinaggregate(
            total="count(*)",
        ).transform_calculate(
            pct="1 / datum.total",
        ).mark_bar().encode(
            x=alt.X(
                "samples_found_count",
                bin=alt.Bin(
                    extent=[0, 37] if sample_type == "hfl1" else [0, 4], 
                    step=1
                ),
                title=[
                    "Number of samples each protein is found in",
                    f"{sample_type}: {source_condition}",
                ],
            ),
            y=alt.Y(
                "sum(pct):Q",
                title="Percentage of all proteins",
                axis=alt.Axis(
                    format="%",
                ),
            ),
            color="sample_condition",
        )
        
        for source_condition in coverages["source_condition"].unique()
    ]).resolve_scale(
        x="independent",
        y="shared",
    )
    
    for sample_type in coverages["sample_type"].unique()
])

## Distributions of abundances for each protein

For each protein in each sample, show what the distribution of abundances looks like.

In [6]:
def individual_distributions_plot(df):
    
    df = df.sample(
        n=25,
        replace=False,
        random_state=0,
        axis=1,
        ignore_index=False,
    )
    
    df = pd.melt(
        frame=df,
        var_name="protein",
        value_name="abundance",
        ignore_index=False,
    ).\
    reset_index(drop=False).\
    sort_values(by=["sample_type", "protein"]).\
    dropna(
        axis=0,
        how="all",
        subset="abundance",
    )
    
    return alt.vconcat(*[
        alt.hconcat(*[
            alt.hconcat(*[
            
                alt.Chart(df[
                    (df["sample_type"] == sample_type) &
                    (df["sample_condition"] == sample_condition) &
                    (df["protein"] == protein)
                ]).mark_bar().encode(
                    x=alt.X(
                        "abundance:Q",
                        title=f"{sample_condition} - {protein}",
                        bin=alt.Bin(
                        ),
                    ),
                    y="count()",
                    color="sample_type:N"
                )

                for sample_condition in df["sample_condition"].unique()
            ])
            for protein in df["protein"].unique()
        ]) 
        for sample_type in df["sample_type"].unique()
    ])

In [7]:
individual_distributions_plot(inputs["mm"])

In [8]:
individual_distributions_plot(inputs["pd"])

## Distribution of normalized abundances across all proteins

To see overall.

In [26]:
def overall_distribution_plot(df):
    
    normalized = pd.DataFrame()
    
    for sample_type in df.index.get_level_values("sample_type").unique():
        for sample_condition in df.index.get_level_values("sample_condition").unique():
            df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)
            return df_sel
            
            for prot in df_sel.columns:
    
overall_distribution_plot(inputs["mm"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,P35527,Q86YZ3,P35908,P13645,P52895,P06744,P08779,P04264,P05783,P05787,...,Q13435,Q9H8Y8,Q02750,Q9H2W6,Q9BT78,O75494,P28065,Q10471,Q9UFN0,Q9H1E3
sample_type,sample_condition,sample_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
boost,healthy,1,1417.371918,292.380529,713.336812,736.157197,31689.766316,21811.110329,887.233857,692.219512,143415.484212,73847.458113,...,767.764444,,392.346056,93.247225,247.585252,226.344955,138.582771,125.449957,179.655155,586.356464
boost,healthy,2,1849.764874,370.206344,801.212991,1139.691981,67849.718922,26846.567481,2267.416618,919.579117,189844.42289,96010.992274,...,1332.025899,,432.03976,93.120013,217.697011,340.840642,108.673042,169.807851,172.994538,262.028479
boost,healthy,3,1916.17998,371.187075,656.111779,1462.419836,38341.627927,35403.896737,1768.365745,1176.936873,327893.44647,161267.222935,...,1075.292231,,439.90876,70.304042,345.491007,376.029042,166.155977,108.783478,160.777618,835.479672


In [9]:
# Plan: Test for normality, try with log transform if fail, MWU if still fail; check variance if succeed, Welch if different variance
# Confirm PD data already normalized

## Shapro-Wilk test for normality

In [10]:
import warnings
warnings.simplefilter("always")

def shapiro_wilk(inputs_dict):
    
    results = {}
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]

        sample_types = []
        sample_conditions = []
        prots = []
        pvals = []

        for sample_type in df.index.get_level_values("sample_type").unique():
            for sample_condition in df.index.get_level_values("sample_condition").unique():
                df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)
                for prot in df_sel.columns:
                    
                    abundances = df_sel[prot].dropna()

                    if sample_type in ["boost", "pbulk"]:
                        min_count = 3
                    else:
                        min_coun = 15

                    if len(abundances) >= min_count:
                        stat, p = scipy.stats.shapiro(abundances)
                    else:
                        p = np.nan

                    sample_types.append(sample_type)
                    sample_conditions.append(sample_condition)
                    prots.append(prot)
                    pvals.append(p)
                    
        raw_pvals = pd.DataFrame({
            "sample_type": sample_types,
            "sample_condition": sample_conditions,
            "protein": prots,
            "p_uncorrected": pvals,
        })

        pvals = raw_pvals[raw_pvals["p_uncorrected"].notna()]

        # Correct the p values
        reject, pvals_corrected, alphacSidak, alphacBonf = statsmodels.stats.multitest.multipletests(
            pvals=pvals["p_uncorrected"].dropna(),
            alpha=0.05,
            method="fdr_bh",
        )

        pvals = pvals.assign(p_corrected=pvals_corrected) 
        
        results[source] = pvals

    return results

sw = shapiro_wilk(inputs)



In [11]:
sources = []
sample_types = []
sample_conditions = []
counts = []
props_uncorrected_sig = []
props_corrected_sig = []

for source in sw.keys():
    df = sw[source]
    for sample_type in df["sample_type"].unique():
        for sample_condition in df["sample_condition"].unique():
            
            df_sel = df[(df["sample_type"] == sample_type) & (df["sample_condition"] == sample_condition)]
            
            sources.append(source)
            sample_types.append(sample_type)
            sample_conditions.append(sample_condition)
            
            if df_sel.shape[0] > 0:
                props_uncorrected_sig.append((df_sel["p_uncorrected"] <= 0.05).sum() / df.shape[0])
                props_corrected_sig.append((df_sel["p_corrected"] <= 0.05).sum() / df.shape[0])
            else:
                props_uncorrected_sig.append(np.nan)
                props_corrected_sig.append(np.nan)
            
pd.DataFrame({
    "source": sources,
    "sample_type": sample_types,
    "sample_condition": sample_conditions,
    "prop_uncorrected_sig": props_uncorrected_sig,
    "prop_corrected_sig": props_corrected_sig,
})

Unnamed: 0,source,sample_type,sample_condition,prop_uncorrected_sig,prop_corrected_sig
0,mm,boost,healthy,0.00902,0.002197
1,mm,boost,unhealthy,0.007633,0.002313
2,mm,hfl1,healthy,0.135885,0.124899
3,mm,hfl1,unhealthy,0.158668,0.151151
4,mm,pbulk,healthy,0.008442,0.002313
5,mm,pbulk,unhealthy,0.006823,0.002197
6,pd,boost,healthy,0.012839,0.002842
7,pd,boost,unhealthy,0.009311,0.001568
8,pd,hfl1,healthy,0.106439,0.070862
9,pd,hfl1,unhealthy,0.129668,0.093012


In [17]:
sw["pd"][sw["pd"]["protein"] == "O75323"]

Unnamed: 0,sample_type,sample_condition,protein,p_uncorrected,p_corrected
995,boost,healthy,O75323,0.357152,0.612661
3893,boost,unhealthy,O75323,0.753336,0.885786
6791,hfl1,healthy,O75323,0.000109,0.001582
9689,hfl1,unhealthy,O75323,0.025394,0.11426
12587,pbulk,healthy,O75323,0.019047,0.092893


## t-test for differential expression

In [13]:
def diff_expr_ttest(inputs_dict):
    
    results = {}
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]

        sample_types = []
        prots = []
        pvals = []
        fcs = []

        for sample_type in df.index.get_level_values("sample_type").unique():
            healthy = df.xs((sample_type, "healthy"), level=("sample_type", "sample_condition"), drop_level=True)
            unhealthy = df.xs((sample_type, "unhealthy"), level=("sample_type", "sample_condition"), drop_level=True)

            for prot in healthy.columns:
                a = healthy[prot].dropna()
                b = unhealthy[prot].dropna()

                if sample_type in ["boost", "pbulk"]:
                    min_count = 2
                else:
                    min_coun = 15
                    
                if len(a) >= min_count and len(b) >= min_count:
                    t, p = scipy.stats.mannwhitneyu(x=a, y=b)#, equal_var=True)
                else:
                    p = np.nan
                    
                fc = np.log2(b.mean() / a.mean())

                sample_types.append(sample_type)
                prots.append(prot)
                pvals.append(p)
                fcs.append(fc)

        raw_pvals = pd.DataFrame({
            "sample_type": sample_types,
            "protein": prots,
            "p_uncorrected": pvals,
            "log2_fold_change": fcs,
        })

        pvals = raw_pvals[raw_pvals["p_uncorrected"].notna()]

        # Correct the p values
        reject, pvals_corrected, alphacSidak, alphacBonf = statsmodels.stats.multitest.multipletests(
            pvals=pvals["p_uncorrected"].dropna(),
            alpha=0.05,
            method="fdr_bh",
        )

        pvals = pvals.assign(p_corrected=pvals_corrected) 
        
        results[source] = {
            "pvals": pvals,
            "raw_pvals": raw_pvals,
        }
    
    return results

results = diff_expr_ttest(inputs)

In [14]:
alpha = 0.05

for source in sources:
    print(source)
    print(results[source]["raw_pvals"].shape)
    print((results[source]["pvals"]["p_uncorrected"] <= alpha).sum())
    print((results[source]["pvals"]["p_corrected"] <= alpha).sum())
    print(results[source]["pvals"]["p_corrected"].min())
    print(results[source]["pvals"][results[source]["pvals"]["p_corrected"] <= alpha])
    print()

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

mm
(5151, 4)
192
0
0.11005467532360468
Empty DataFrame
Columns: [sample_type, protein, p_uncorrected, log2_fold_change, p_corrected]
Index: []

pd
(8694, 4)
270
22
0.004525293464946904
     sample_type protein  p_uncorrected  log2_fold_change  p_corrected
3057        hfl1  P02788