In [75]:
import pandas as pd
import numpy as np

from local.constants import WORKSPACE_ROOT
from local.figures.template import BaseFigure, ApplyTemplate, go
from local.figures.colors import COLORS, Color, Palettes

In [4]:
# Load the tab-separated protein table from the workspace data directory.
protein_table_path = WORKSPACE_ROOT/"data/proteomics/epi300.protein.tsv"
df = pd.read_csv(protein_table_path, sep="\t")
print(df.shape)
df.head(2)

(2467, 60)


Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Length,Organism,Protein Existence,Description,Protein Probability,Top Peptide Probability,...,76_Pic1011_4_F4_7G_100ng MaxLFQ Intensity,77_Pic1011_5_F5_8G_100ng MaxLFQ Intensity,78_Pic1011_6_F6_9G_100ng MaxLFQ Intensity,79_Pic1011_7_F7_10G_EP1300FOS_AA_2_100ng MaxLFQ Intensity,80_Pic1011_8_F8_11G_EP1300FOS_AA_2_100ng MaxLFQ Intensity,81_Pic1011_9_F9_12G_EP1300FOS_100ng MaxLFQ Intensity,82_Pic1011_1_D1_1GL_EP1300_PCC1_3_100ng MaxLFQ Intensity,83_Pic1011_2_D2_2GL_EP1300_PCC1_1_100ng MaxLFQ Intensity,84_Pic1011_3_D3_3GL_EP1300_PCC1_2_100ng MaxLFQ Intensity,Indistinguishable Proteins
0,C1_10,C1_10,C1_10,,69,,,C1_10,1.0,0.999,...,80808260.0,168583200.0,115466600.0,176202600.0,135864100.0,132182100.0,75039190.0,90813260.0,92232700.0,
1,C1_1001,C1_1001,C1_1001,,302,,,C1_1001,1.0,0.999,...,1331771000.0,1086445000.0,1342404000.0,1503298000.0,1393202000.0,1383123000.0,2536239000.0,2525745000.0,2103290000.0,


In [10]:
# Per-sample intensity columns: every column whose name mentions both tags below.
required_tags = ("MaxLFQ", "Intensity")
cols = [name for name in df.columns if all(tag in name for tag in required_tags)]
cols

['76_Pic1011_4_F4_7G_100ng MaxLFQ Intensity',
 '77_Pic1011_5_F5_8G_100ng MaxLFQ Intensity',
 '78_Pic1011_6_F6_9G_100ng MaxLFQ Intensity',
 '79_Pic1011_7_F7_10G_EP1300FOS_AA_2_100ng MaxLFQ Intensity',
 '80_Pic1011_8_F8_11G_EP1300FOS_AA_2_100ng MaxLFQ Intensity',
 '81_Pic1011_9_F9_12G_EP1300FOS_100ng MaxLFQ Intensity',
 '82_Pic1011_1_D1_1GL_EP1300_PCC1_3_100ng MaxLFQ Intensity',
 '83_Pic1011_2_D2_2GL_EP1300_PCC1_1_100ng MaxLFQ Intensity',
 '84_Pic1011_3_D3_3GL_EP1300_PCC1_2_100ng MaxLFQ Intensity']

In [12]:
# https://docs.google.com/spreadsheets/d/1pc1L5qF5SErQMXlbOnYmGvAAvPalSgh2/edit?gid=2054370611#gid=2054370611
# Human-readable sample names (presumably copied from the spreadsheet linked above).
# They are paired with `cols` purely by position, so the order below must match
# the order of the intensity columns in the protein table.
# NOTE(review): the column starting with "81_" has no "AA" tag in its name but is
# mapped to "iAAl_3", and the columns "82_"/"83_"/"84_" end in PCC1_3/_1/_2 while
# being mapped to _1/_2/_3 -- confirm the positional pairing against the sample sheet.
original_names = """
LB CM_EPFOS_C_water_1
LB CM_EPFOS_C_water_2
LB CM_EPFOS_C_water_3
LB CM_EPFOS_C_iAAl_1
LB CM_EPFOS_C_iAAl_2
LB CM_EPFOS_C_iAAl_3
LB CM_EPPCC1_C_1
LB CM_EPPCC1_C_2
LB CM_EPPCC1_C_3
""".split("\n")[1:-1]
# zip() stops at the shorter input, so a length mismatch would silently leave
# columns unmapped; fail loudly instead.
assert len(cols) == len(original_names), (
    f"expected {len(original_names)} intensity columns, found {len(cols)}"
)
name_map = dict(zip(cols, original_names))
name_map

{'76_Pic1011_4_F4_7G_100ng MaxLFQ Intensity': 'LB CM_EPFOS_C_water_1',
 '77_Pic1011_5_F5_8G_100ng MaxLFQ Intensity': 'LB CM_EPFOS_C_water_2',
 '78_Pic1011_6_F6_9G_100ng MaxLFQ Intensity': 'LB CM_EPFOS_C_water_3',
 '79_Pic1011_7_F7_10G_EP1300FOS_AA_2_100ng MaxLFQ Intensity': 'LB CM_EPFOS_C_iAAl_1',
 '80_Pic1011_8_F8_11G_EP1300FOS_AA_2_100ng MaxLFQ Intensity': 'LB CM_EPFOS_C_iAAl_2',
 '81_Pic1011_9_F9_12G_EP1300FOS_100ng MaxLFQ Intensity': 'LB CM_EPFOS_C_iAAl_3',
 '82_Pic1011_1_D1_1GL_EP1300_PCC1_3_100ng MaxLFQ Intensity': 'LB CM_EPPCC1_C_1',
 '83_Pic1011_2_D2_2GL_EP1300_PCC1_1_100ng MaxLFQ Intensity': 'LB CM_EPPCC1_C_2',
 '84_Pic1011_3_D3_3GL_EP1300_PCC1_2_100ng MaxLFQ Intensity': 'LB CM_EPPCC1_C_3'}

In [6]:
# Keep only rows whose "Protein" value does not match the pattern "sp".
# NOTE(review): presumably meant to drop SwissProt-style "sp|..." entries; the test
# matches "sp" anywhere in the ID, so any protein containing "sp" is dropped too -- confirm.
has_sp = df["Protein"].str.contains("sp")
df_exp = df[~has_sp]
print(df_exp.shape)

(2459, 60)


In [15]:
# Intensity matrix: one row per row of `df_exp`, one column per entry of `cols`.
mat = df_exp.loc[:, cols].to_numpy()
mat.shape

(2459, 9)

In [103]:
# Protein identifiers taken from the filtered table `df_exp`.
proteins = df_exp["Protein"].values
# One separate, initially empty annotation dict per protein identifier.
annotation_map = {protein_id: {} for protein_id in proteins}
len(proteins)

2459

In [108]:
# Display the protein identifier array as a quick visual check.
proteins

array(['C1_10', 'C1_1001', 'C1_1002', ..., 'C1_99', 'C1_993', 'C1_994'],
      dtype=object)

In [120]:
# Attach SwissProt hits to the proteins already registered in `annotation_map`.
df_swissprot = pd.read_csv(WORKSPACE_ROOT/"data/annotations/epi300/epi300.swissprot.csv")
print(df_swissprot.shape)
for r in df_swissprot.itertuples(index=False):
    # Annotation keys are "<contig>_<orf>", e.g. "C1_10".
    k = f"{r.contig}_{r.orf}"
    # Skip hits for ORFs that are not in `annotation_map` before doing any parsing.
    if k not in annotation_map: continue
    desc = r.description

    # Collect the space-separated "KEY=value" tokens of the description.
    # Splitting on the first "=" only keeps a value that itself contains "="
    # from breaking dict(); a non-string description (e.g. NaN) yields no tokens.
    tokens = desc.split(" ") if isinstance(desc, str) else []
    meta = dict(t.split("=", 1) for t in tokens if "=" in t)
    gene = meta.get("GN", None)
    # Stored as [gene name or None, full description]; a later hit for the same key overwrites.
    annotation_map[k]["swissprot"] = [gene, desc]

df_swissprot.head(2)

(3796, 7)


Unnamed: 0,contig,orf,ref_id,description,bsr,evalue,percent_identity
0,C1,1,sp|P16917|RHSB_ECOLI,Protein RhsB OS=Escherichia coli (strain K12) ...,0.88135,0.0,98.0
1,C1,10,sp|P0C266|YIBT_SHIFL,Uncharacterized protein YibT OS=Shigella flexn...,1.0,1.3437e-45,100.0


In [16]:
# Split `mat` into three groups of three columns each (column order follows `cols`).
control_idx, atf1_idx, blank_idx = [0, 1, 2], [3, 4, 5], [6, 7, 8]
mat_control = mat[:, control_idx]
mat_atf1 = mat[:, atf1_idx]
mat_blank = mat[:, blank_idx]

In [19]:
from scipy.stats import ttest_ind

In [44]:
def ttest(matA: np.ndarray, matB: np.ndarray, labels: list[str]) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Row-wise unequal-variance t-test between two groups of replicate columns.

    Rows in which any value of either group is missing are excluded before
    testing. A value counts as missing when it is exactly 0 or not finite
    (NaN / +-inf); non-finite values would otherwise propagate into the fold
    change and p-value of that row.

    Args:
        matA: 2-D array, one row per feature, one column per replicate of group A.
        matB: 2-D array with the same number of rows as ``matA`` (group B).
        labels: one label per row (any sequence or array accepted by ``np.asarray``).

    Returns:
        A tuple ``(fold_change, pvals, kept, removed)`` where
        ``fold_change`` is ``mean(matB) / mean(matA)`` per tested row,
        ``pvals`` are the p-values from ``scipy.stats.ttest_ind(..., equal_var=False)``,
        ``kept`` are the labels of the tested rows, and
        ``removed`` are the labels of the excluded rows.
    """
    combined = np.hstack([matA, matB])
    # Flag a row as soon as a single replicate in either group is missing.
    is_missing = (combined == 0) | ~np.isfinite(np.asarray(combined, dtype=float))
    missing_vals = is_missing.any(axis=1)
    matA_nonz = matA[~missing_vals]
    matB_nonz = matB[~missing_vals]
    labels = np.asarray(labels)
    removed = labels[missing_vals]
    kept = labels[~missing_vals]

    # axis=1: each row is tested independently across its replicate columns.
    res = ttest_ind(matA_nonz, matB_nonz, axis=1, equal_var=False)
    pvals = res.pvalue

    valA = matA_nonz.mean(axis=1)
    valB = matB_nonz.mean(axis=1)
    # Ratio of group means, B over A (not log-transformed here).
    fold_change = valB/valA

    return fold_change, pvals, kept, removed

# Compare each sample group against the blank group. `mat_blank` is passed first,
# and each result is the tuple returned by `ttest` for that comparison.
blank_v_control, blank_v_atf1 = (
    ttest(mat_blank, other_group, proteins)
    for other_group in (mat_control, mat_atf1)
)

In [126]:
fig = BaseFigure(shape=(2, 1))

# Significance cut-offs, shared by the point selection and the dashed guide lines
# so the two can never drift apart.
P_THRESHOLD = 0.001
LOG2_FC_THRESHOLD = 1

def _add(results, col):
    """Draw one volcano panel (log2 fold change vs -log10 p-value) at row 1, column `col`.

    `results` is a `(fold_change, pvals, kept, removed)` tuple as returned by `ttest`.
    Points passing both cut-offs are drawn in red, all others in grey, and dashed
    guide lines mark the cut-offs.
    """
    fold_change, pvals, kept, removed = results
    kept = np.asarray(kept)
    log2_fc = np.log2(fold_change)
    neg_log10_p = -np.log10(pvals)
    fc_good = np.abs(log2_fc) > LOG2_FC_THRESHOLD
    pval_good = pvals < P_THRESHOLD
    selected = fc_good & pval_good

    # Hover text is matched to points by position, so it must be built from the
    # same subset of `kept` as x/y; otherwise labels land on the wrong points.
    fig.add_trace(go.Scatter(
        x=log2_fc[~selected],
        y=neg_log10_p[~selected],
        mode="markers",
        marker=dict(
            color=Color.Hex("AAAAAA").color_value,
            size=4,
        ),
        # Gene name when a SwissProt annotation exists, otherwise the protein ID.
        text=[annotation_map.get(k, {}).get("swissprot", [k, None])[0] for k in kept[~selected]],
        showlegend=False,
    ), row=1, col=col)

    fig.add_trace(go.Scatter(
        x=log2_fc[selected],
        y=neg_log10_p[selected],
        mode="markers",
        marker=dict(
            color=COLORS.RED,
            size=4,
        ),
        # NOTE(review): unlike the grey trace, this passes the whole
        # [gene, description] pair as hover text -- confirm that is intended.
        text=[annotation_map.get(k, {}).get("swissprot", [None, k]) for k in kept[selected]],
        showlegend=False,
    ), row=1, col=col)

    # Dashed guides: two vertical lines at +-log2 fold-change cut-off and one
    # horizontal line at the p-value cut-off; `None` breaks the path between segments.
    pv, fc = float(-np.log10(P_THRESHOLD)), LOG2_FC_THRESHOLD
    fig.add_trace(go.Scatter(
        x=[-fc, -fc, None, fc, fc, None, -100, 100, None],
        y=[-100, 100, None, -100, 100, None, pv, pv, None],
        mode="lines",
        line=dict(
            color=Color.Hex("212121").color_value,
            dash="dash",
            width=0.5,
        ),
        showlegend=False,
    ), row=1, col=col)

_add(blank_v_control, 1)
_add(blank_v_atf1, 2)

# NOTE(review): `xlabels` is not used by the code visible here; kept in case a later cell reads it.
xlabels = [f"{2**v}" for v in range(-6, 7)]
fig = ApplyTemplate(
    fig,
    default_yaxis=dict(range=[-0.1, 5.1]),
    default_xaxis=dict(range=[-5.5, 5.5]),
    # NOTE(review): axis keys look like "<col> <row> <axis>" -- confirm against ApplyTemplate.
    axis={
        "2 1 y": dict(showticklabels=False, showline=False, ticks=None),
        "1 1 y": dict(title="-log10 p-value"),
        "1 1 x": dict(title="control vs blank"),
        # The second panel plots `blank_v_atf1`, so label it to match the first panel's wording.
        "2 1 x": dict(title="atf1 vs blank"),
    },
    layout=dict(
        width=1200, height=600,
    ),
)

# NOTE(review): presumably requires ./cache to exist relative to the working directory -- confirm.
fig.write_html("./cache/volcano.html")
fig.show()