In [115]:
import pandas as pd
import numpy as np
from local.constants import WORKSPACE_ROOT

In [116]:
# Read the tab-separated protein table from the workspace data directory,
# then report its dimensions and preview the first two rows.
protein_tsv = WORKSPACE_ROOT / "data/proteomics/epi300.protein.tsv"
df = pd.read_csv(protein_tsv, sep="\t")
print(df.shape)
df.head(2)

(2467, 60)


Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Length,Organism,Protein Existence,Description,Protein Probability,Top Peptide Probability,...,76_Pic1011_4_F4_7G_100ng MaxLFQ Intensity,77_Pic1011_5_F5_8G_100ng MaxLFQ Intensity,78_Pic1011_6_F6_9G_100ng MaxLFQ Intensity,79_Pic1011_7_F7_10G_EP1300FOS_AA_2_100ng MaxLFQ Intensity,80_Pic1011_8_F8_11G_EP1300FOS_AA_2_100ng MaxLFQ Intensity,81_Pic1011_9_F9_12G_EP1300FOS_100ng MaxLFQ Intensity,82_Pic1011_1_D1_1GL_EP1300_PCC1_3_100ng MaxLFQ Intensity,83_Pic1011_2_D2_2GL_EP1300_PCC1_1_100ng MaxLFQ Intensity,84_Pic1011_3_D3_3GL_EP1300_PCC1_2_100ng MaxLFQ Intensity,Indistinguishable Proteins
0,C1_10,C1_10,C1_10,,69,,,C1_10,1.0,0.999,...,80808260.0,168583200.0,115466600.0,176202600.0,135864100.0,132182100.0,75039190.0,90813260.0,92232700.0,
1,C1_1001,C1_1001,C1_1001,,302,,,C1_1001,1.0,0.999,...,1331771000.0,1086445000.0,1342404000.0,1503298000.0,1393202000.0,1383123000.0,2536239000.0,2525745000.0,2103290000.0,


In [117]:
# Per-sample "Intensity" columns, leaving out every column that mentions "MaxLFQ".
cols = [name for name in df.columns if "Intensity" in name and "MaxLFQ" not in name]
cols

['76_Pic1011_4_F4_7G_100ng Intensity',
 '77_Pic1011_5_F5_8G_100ng Intensity',
 '78_Pic1011_6_F6_9G_100ng Intensity',
 '79_Pic1011_7_F7_10G_EP1300FOS_AA_2_100ng Intensity',
 '80_Pic1011_8_F8_11G_EP1300FOS_AA_2_100ng Intensity',
 '81_Pic1011_9_F9_12G_EP1300FOS_100ng Intensity',
 '82_Pic1011_1_D1_1GL_EP1300_PCC1_3_100ng Intensity',
 '83_Pic1011_2_D2_2GL_EP1300_PCC1_1_100ng Intensity',
 '84_Pic1011_3_D3_3GL_EP1300_PCC1_2_100ng Intensity']

In [118]:
# Rows whose "Protein" value contains "sp" make up the control set.
is_control = df["Protein"].str.contains("sp")
df_controls = df[is_control]
print(df_controls.shape)
print(df_controls["Entry Name"])

(8, 60)
2459    K1C15_SHEEP
2460     TRY1_BOVIN
2461       TRYP_PIG
2462     K2C1_HUMAN
2463     CATG_HUMAN
2464    K1C10_HUMAN
2465     K1C9_HUMAN
2466     K22E_HUMAN
Name: Entry Name, dtype: object


In [119]:
# Every row whose "Protein" value does not contain "sp" goes into df_exp.
has_sp = df["Protein"].str.contains("sp")
df_exp = df[~has_sp]
print(df_exp.shape)

(2459, 60)


In [120]:
# Intensity columns of df_exp as a plain 2-D array (one row per protein, one column per entry of `cols`).
mat_exp = df_exp.loc[:, cols].to_numpy()
mat_exp.shape

(2459, 9)

In [121]:
# Intensity columns of df_controls as a plain 2-D array (one row per control, one column per entry of `cols`).
mat_control = df_controls.loc[:, cols].to_numpy()
mat_control.shape

(8, 9)

In [122]:
# For every pair of sample columns (i, j) with i < j, collect the intensity
# pairs (a, b) from each control row in which both intensities are non-zero.
pairs_by_columns = {}
for row in mat_control:
    n_samples = len(row)
    for i in range(n_samples):
        a = row[i]
        if a == 0:
            continue
        # Start at i + 1 so only i < j is visited.
        for j in range(i + 1, n_samples):
            b = row[j]
            if b == 0:
                continue
            # Append in place: rebuilding the list with `+` on every hit
            # copies it each time and is quadratic in the number of rows.
            pairs_by_columns.setdefault((i, j), []).append((a, b))
# Final shape: list of ((i, j), [(a, b), ...]) sorted by column-index pair.
pairs = sorted(pairs_by_columns.items(), key=lambda item: item[0])
len(pairs) # expect 9 choose 2 = 36

36

In [123]:
from local.figures.template import BaseFigure, ApplyTemplate, go
from local.figures.colors import Color, COLORS

In [124]:
# Box plot of pairwise intensity ratios between samples: one box per sample
# pair, a red tick at each pair's median, and a reference line at ratio 1.
# NOTE: this reads `pairs` as built from mat_control; the same name is rebound
# further down the notebook for the non-control proteins, so run cells in order.
fig = BaseFigure()
medians = []
xlabels = []
BLACK = Color.Hex("212121")
for k, v in pairs:
    k1, k2 = k
    ratios = np.array([b/a for a, b in v])
    median = np.median(ratios)
    if median < 1:
        # Flip the pair so that the plotted median is >= 1.
        ratios = 1/ratios
        k = (k2, k1)
        # Recompute instead of using 1/median: for an even number of ratios
        # np.median averages the two middle values, so the reciprocal of the
        # median is not the median of the reciprocals and the red marker
        # would not line up with the box's own median.
        median = np.median(ratios)
    xlabels.append(k)
    medians.append(median)

    fig.add_trace(go.Box(
        y=ratios,
        marker=dict(
            size=5,
            color=BLACK.Fade(0.5).color_value,
        ),
        line=dict(
            width=0.5,
            color=BLACK.color_value,
        ),
        fillcolor = COLORS.TRANSPARENT,
        pointpos = 0,
        boxpoints='all',
        jitter=0.5,
        name=f"{k}",
        showlegend=False,
    ))

# Category labels shared by the overlay traces; they match the box names above.
pair_labels = [f"{k}" for k in xlabels]

# Red horizontal tick at each pair's median.
fig.add_trace(go.Scatter(
    x=pair_labels,
    y=medians,
    mode='markers',
    marker=dict(
        size=15,
        symbol="line-ew",
        line=dict(
            width=2,
            color=COLORS.RED,
        ),
    ),
    showlegend=False,
))

# Reference line at ratio = 1 (identical intensity in both samples).
fig.add_trace(go.Scatter(
    x=pair_labels,
    y=[1] * len(pair_labels),
    mode='lines',
    line=dict(
        width=1,
        color=BLACK.Fade(0.5).color_value,
    ),
    showlegend=False,
))
fig = ApplyTemplate(
    fig,
    axis={
        "1 1 y": dict(type="log")
    }
)
fig.show()

### controls only
Pairwise ratios of control intensities between samples, using only controls where both samples have non-zero intensity.\
Red marks the median ratio for each sample pair.\
Where the median ratio is < 1, the reciprocal ratios are plotted and the pair label is flipped.

In [125]:
# For every pair of sample columns (i, j) with i < j, collect the intensity
# pairs (a, b) from each non-control row in which both intensities are non-zero.
pairs_by_columns = {}
for row in mat_exp:
    n_samples = len(row)
    for i in range(n_samples):
        a = row[i]
        if a == 0:
            continue
        # Start at i + 1 so only i < j is visited.
        for j in range(i + 1, n_samples):
            b = row[j]
            if b == 0:
                continue
            # Append in place: rebuilding the list with `+` on every hit
            # copies it each time, which is quadratic in the number of rows
            # and noticeable with thousands of proteins.
            pairs_by_columns.setdefault((i, j), []).append((a, b))
# Final shape: list of ((i, j), [(a, b), ...]) sorted by column-index pair.
pairs = sorted(pairs_by_columns.items(), key=lambda item: item[0])
print(len(pairs)) # expect 9 choose 2 = 36

36


In [140]:
# Box plot of pairwise intensity ratios between samples for the non-control
# proteins: one box per sample pair, a red tick at each pair's median, and a
# reference line at ratio 1.
# NOTE: this reads `pairs` as built from mat_exp; the same name is also used
# earlier in the notebook for the controls, so run cells in order.
fig = BaseFigure()
medians_exp = []
xlabels_exp = []
BLACK = Color.Hex("212121")
SUBSAMPLE = 256  # max points drawn per pair; 0 disables subsampling
np.random.seed(42)  # fixed seed so the subsample is the same on every run
for k, v in pairs:
    k1, k2 = k
    ratios = np.array([b/a for a, b in v])
    median = np.median(ratios)
    if median < 1:
        # Flip the pair so that the plotted median is >= 1.
        ratios = 1/ratios
        k = (k2, k1)
        # Recompute instead of using 1/median: for an even number of ratios
        # np.median averages the two middle values, so the reciprocal of the
        # median is not the median of the reciprocals.
        median = np.median(ratios)
    xlabels_exp.append(k)
    medians_exp.append(median)
    # The median above uses every ratio; only the plotted points are thinned.
    if SUBSAMPLE > 0 and len(ratios) > SUBSAMPLE:
        ratios = np.random.choice(ratios, size=SUBSAMPLE, replace=False)

    fig.add_trace(go.Box(
        y=ratios,
        marker=dict(
            size=5,
            color=BLACK.Fade(0.5).color_value,
        ),
        line=dict(
            width=0.5,
            color=BLACK.color_value,
        ),
        fillcolor = COLORS.TRANSPARENT,
        pointpos = 0,
        boxpoints='all',
        jitter=0.5,
        name=f"{k}",
        showlegend=False,
    ))

# Category labels shared by the overlay traces; they match the box names above.
pair_labels_exp = [f"{k}" for k in xlabels_exp]

# Red horizontal tick at each pair's median (computed before subsampling).
fig.add_trace(go.Scatter(
    x=pair_labels_exp,
    y=medians_exp,
    mode='markers',
    marker=dict(
        size=15,
        symbol="line-ew",
        line=dict(
            width=2,
            color=COLORS.RED,
        ),
    ),
    showlegend=False,
))

# Reference line at ratio = 1 (identical intensity in both samples).
fig.add_trace(go.Scatter(
    x=pair_labels_exp,
    y=[1] * len(pair_labels_exp),
    mode='lines',
    line=dict(
        width=1,
        color=BLACK.Fade(0.5).color_value,
    ),
    showlegend=False,
))
fig = ApplyTemplate(
    fig,
    axis={
        "1 1 y": dict(type="log")
    }
)
fig.show()

### non controls
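Pairwise ratios of protein intensities between samples, using only proteins where both samples have non-zero intensity.\
Red marks the median ratio for each sample pair.\
Where the median ratio is < 1, the reciprocal ratios are plotted and the pair label is flipped.

Note that the plot is thinned out by randomly subsampling each pair to at most 256 points (~25%); the red medians are computed before subsampling.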

In [135]:
# Scatter of, per sample pair, the median ratio among the controls (x) against
# the median ratio among the non-control proteins (y).
#
# The box-plot cells store each median under a label that may have been flipped
# to (j, i) so that the stored value is >= 1. The controls and the proteins can
# be flipped differently for the same pair, so both are first brought back to
# the same direction (label (i, j) with i < j). Without this, a pair whose
# controls and proteins shift in opposite directions would look like agreement.
def _low_high_orientation(label, median):
    """Return ((i, j), ratio) with i < j, taking the reciprocal of `median` if `label` was flipped."""
    k1, k2 = label
    if k1 < k2:
        return (k1, k2), median
    return (k2, k1), 1/median

controls = dict(_low_high_orientation(k, v) for k, v in zip(xlabels, medians))
controls_vs_norm = []
for k, v in zip(xlabels_exp, medians_exp):
    k, v_exp = _low_high_orientation(k, v)
    if k not in controls:
        # No control row had non-zero intensity in both samples of this pair.
        print(f"Skipping {k}")
        continue
    # Entries are ((i, j), control median, protein median) with i < j.
    controls_vs_norm.append((k, controls[k], v_exp))

fig = BaseFigure()
fig.add_trace(go.Scatter(
    x=[x for k, x, y in controls_vs_norm],
    y=[y for k, x, y in controls_vs_norm],
    mode='markers',
    marker=dict(
        size=15,
        color = COLORS.TRANSPARENT,
        line = dict(
            width=2,
            color= Color.Hex("212121").color_value,
        ),
    ),
    showlegend=False,
))
fig = ApplyTemplate(fig)
fig.show()

Sample-to-sample ratios from the controls alone disagree with those from the median expression levels across all proteins.

## Verdict: use MaxLFQ intensity
Reference: https://doi.org/10.1074/mcp.M113.031591