In [1]:
import altair as alt
import numpy as np
import oscutils
import scipy
import statsmodels.stats.multitest

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
sources = ["mm", "pd"]

In [3]:
inputs = {}

for source in sources:
    df = oscutils.\
    load_protein_table(source, "quant", clean=True).\
    drop(columns="sample").\
    set_index(["sample_type", "sample_condition", "sample_num"])
    
    inputs[source] = df

## Sample counts

See how many samples there are in each condition, so we have an idea of what might be a good cutoff for how many samples a protein has to be found in in each condition before we do a t-test.

In [4]:
def get_sample_counts(inputs_dict):
    
    sources = []
    sample_types = []
    sample_conditions = []
    counts = []
    
    df = inputs_dict["mm"] # Doesn't matter which we use, sample counts are the same for both
    for sample_type in df.index.get_level_values("sample_type").unique():
        for sample_condition in df.index.get_level_values("sample_condition").unique():
            df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)

            sources.append(source)
            sample_types.append(sample_type)
            sample_conditions.append(sample_condition)
            counts.append(df_sel.shape[0])
    
    return pd.DataFrame({
        "sample_type": sample_types,
        "sample_condition": sample_conditions,
        "count": counts,
    })
                
get_sample_counts(inputs)

Unnamed: 0,sample_type,sample_condition,count
0,boost,healthy,3
1,boost,unhealthy,3
2,hfl1,healthy,36
3,hfl1,unhealthy,35
4,pbulk,healthy,3
5,pbulk,unhealthy,3


## Histogram of proportion of proteins with different coverages

Make a histogram showing, for each number of samples in each condition, what proportion of proteins are found in that number of samples

In [5]:
def get_protein_coverages(inputs_dict):
    
    sources = []
    sample_types = []
    sample_conditions = []
    samples_counts = []
    proteins = []
    samples_found_counts = [] 
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]
        for sample_type in df.index.get_level_values("sample_type").unique():
            for sample_condition in df.index.get_level_values("sample_condition").unique():
                df_sel = df.xs((sample_type, sample_condition), level=("sample_type", "sample_condition"), drop_level=False)
                for protein in df_sel.columns:
                    
                    sources.append(source)
                    sample_types.append(sample_type)
                    sample_conditions.append(sample_condition)
                    samples_counts.append(df_sel.shape[0])
                    proteins.append(protein)
                    samples_found_counts.append(df_sel[protein].notna().sum())
    
    coverages_df = pd.DataFrame({
        "source": sources,
        "sample_type": sample_types,
        "sample_condition": sample_conditions,
        "samples_count": samples_counts,
        "protein": proteins,
        "samples_found_count": samples_found_counts,
    })
    
    coverages_df = coverages_df.assign(
        samples_found_prop=coverages_df["samples_found_count"] / coverages_df["samples_count"],
        source_type_condition=coverages_df["source"] + "_" + coverages_df["sample_type"] + "_" + coverages_df["sample_condition"],
        source_condition=coverages_df["source"] + "_" + coverages_df["sample_condition"],
    )
    
    return coverages_df
                
coverages = get_protein_coverages(inputs)

In [6]:
alt.vconcat(*[
    alt.hconcat(*[
        alt.Chart(coverages[
            (coverages["sample_type"] == sample_type) &
            (coverages["source_condition"] == source_condition)
        ]).transform_joinaggregate(
            total="count(*)",
        ).transform_calculate(
            pct="1 / datum.total",
        ).mark_bar().encode(
            x=alt.X(
                "samples_found_count",
                bin=alt.Bin(
                    extent=[0, 37] if sample_type == "hfl1" else [0, 4], 
                    step=1
                ),
                title=[
                    "Number of samples each protein is found in",
                    f"{sample_type}: {source_condition}",
                ],
            ),
            y=alt.Y(
                "sum(pct):Q",
                title="Percentage of all proteins",
                axis=alt.Axis(
                    format="%",
                ),
            ),
            color="sample_condition",
        )
        
        for source_condition in coverages["source_condition"].unique()
    ]).resolve_scale(
        x="independent",
        y="shared",
    )
    
    for sample_type in coverages["sample_type"].unique()
])

## Distributions of abundances

For each protein in each sample, show what the distribution of abundances looks like.

In [16]:
def distributions_plot(df):
    
    df.index = df.index.get_level_values("sample_type")
    df = df.sample(
        n=50,
        replace=False,
        random_state=0,
        axis=1,
        ignore_index=False,
    )
    
    df = pd.melt(
        frame=df,
        var_name="protein",
        value_name="abundance",
        ignore_index=False,
    ).\
    reset_index(drop=False).\
    sort_values(by=["sample_type", "protein"]).\
    dropna(
        axis=0,
        how="all",
        subset="abundance",
    )
    
    return alt.vconcat(*[
        alt.hconcat(*[
            
            alt.Chart(df[
                (df["sample_type"] == sample_type) &
                (df["protein"] == protein)
            ]).mark_bar().encode(
                x=alt.X(
                    "abundance:Q",
                    title=protein,
                    bin=alt.Bin(
                    ),
                ),
                y="count()",
                color="sample_type:N"
            )
            
            for protein in df["protein"].unique()
        ]) 
        for sample_type in df["sample_type"].unique()
    ])

distributions_plot(inputs["mm"])

In [None]:
# Plan: Test for normality, try with log transform if fail, MWU if still fail; check variance if succeed, Welch if different variance
# Confirm PD data already normalized

## t-test for differential expression

In [8]:
def diff_expr_ttest(inputs_dict):
    
    results = {}
    
    for source in inputs_dict.keys():
        df = inputs_dict[source]

        sample_types = []
        prots = []
        pvals = []
        fcs = []

        for sample_type in df.index.get_level_values("sample_type").unique():
            healthy = df.xs((sample_type, "healthy"), level=("sample_type", "sample_condition"), drop_level=True)
            unhealthy = df.xs((sample_type, "unhealthy"), level=("sample_type", "sample_condition"), drop_level=True)

            for prot in healthy.columns:
                a = healthy[prot].dropna()
                b = unhealthy[prot].dropna()

                if sample_type in ["boost", "pbulk"]:
                    min_count = 2
                else:
                    min_coun = 15
                    
                if len(a) >= min_count and len(b) >= min_count:
                    t, p = scipy.stats.mannwhitneyu(x=a, y=b)#, equal_var=True)
                else:
                    p = np.nan
                    
                fc = np.log2(b.mean() / a.mean())

                sample_types.append(sample_type)
                prots.append(prot)
                pvals.append(p)
                fcs.append(fc)

        raw_pvals = pd.DataFrame({
            "sample_type": sample_types,
            "protein": prots,
            "p_uncorrected": pvals,
            "log2_fold_change": fcs,
        })

        pvals = raw_pvals[raw_pvals["p_uncorrected"].notna()]

        # Correct the p values
        reject, pvals_corrected, alphacSidak, alphacBonf = statsmodels.stats.multitest.multipletests(
            pvals=pvals["p_uncorrected"].dropna(),
            alpha=0.05,
            method="fdr_bh",
        )

        pvals = pvals.assign(p_corrected=pvals_corrected) 
        
        results[source] = {
            "pvals": pvals,
            "raw_pvals": raw_pvals,
        }
    
    return results

results = diff_expr_ttest(inputs)

TypeError: Index must be a MultiIndex

In [None]:
alpha = 0.05

for source in sources:
    print(source)
    print(results[source]["raw_pvals"].shape)
    print((results[source]["pvals"]["p_uncorrected"] <= alpha).sum())
    print((results[source]["pvals"]["p_corrected"] <= alpha).sum())
    print(results[source]["pvals"]["p_corrected"].min())
    print(results[source]["pvals"][results[source]["pvals"]["p_corrected"] <= alpha])
    print()