# Find proteins within the event (cis) that are commonly equivalent between samples with and without the event

## Setup

In [1]:
import cnvutils
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Load the shared analysis parameters and pull out the chromosome arm of interest
params = cnvutils.load_params(os.path.join("..", "data", "params.json"))
CHROMOSOME, ARM = params["CHROMOSOME"], params["ARM"]

# This notebook only looks at the cis results
CIS_OR_TRANS = "cis"

# Path to the equivalence test results for this chromosome arm
equiv_results_file = os.path.join(
    "..",
    "data",
    f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_equiv.tsv",
)

In [3]:
# Read the tab-separated equivalence results, then rename the "Name" column to
# "protein", which is the column name the rest of the notebook uses
equiv_results = pd.read_csv(equiv_results_file, sep="\t")
equiv_results = equiv_results.rename(columns={"Name": "protein"})

In [4]:
# Display the loaded equivalence results table
equiv_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p
0,brca,ADAMDEC1,NP_055294.1|NP_001138744.1,0.016401
1,brca,AGPAT5,NP_060831.2,0.690930
2,brca,ANGPT2,NP_001138.1|NP_001112360.1|NP_001112359.1,0.067187
3,brca,ARHGEF10,NP_001295082.1|NP_055444.2|NP_001295081.1,0.325628
4,brca,ASAH1,NP_004306.3|NP_808592.2,0.204879
5,brca,ATP6V1B2,NP_001684.2,0.982561
6,brca,BIN3,NP_061158.1,0.236663
7,brca,BLK,NP_001706.2|NP_001317394.1,0.217724
8,brca,BMP1,NP_006120.1|NP_001190.1,0.011296
9,brca,BNIP3L,NP_004322.1|NP_001317420.1,0.297309


## Select the equivalent proteins, and take a detour to make some plots

In [5]:
# Split the results at the 0.05 adjusted p value cutoff: rows at or below it are
# treated as equivalent, rows above it as not equivalent. Rows with a missing
# adj_p satisfy neither comparison and are left out of both tables.
prots = equiv_results[equiv_results["adj_p"] <= 0.05].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

fail_prots = equiv_results[equiv_results["adj_p"] > 0.05].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

# Label each set of per-cancer counts so both can live in one long-format table
prots_cts.insert(0, "count_type", "Equivalent")
fail_cts.insert(0, "count_type", "Not equivalent")

# Stack the two count tables. DataFrame.append was removed in pandas 2.0;
# pd.concat produces the same stacked result.
counts = pd.concat([prots_cts, fail_cts]).sort_index().reset_index(drop=False)

# One facet per cancer type, with one bar per count type
# NOTE(review): the title hard-codes "Chr 8p" even though CHROMOSOME and ARM are
# loaded from params -- confirm whether it should be built from them instead.
alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(
            title=None,
            labels=False
        ),
        sort=["Equivalent"]
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(
            title="Number of proteins"
        )
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=["Equivalent"],
        scale=alt.Scale(
            domain=["Equivalent", "Not equivalent"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
).facet(
    column=alt.Column(
        "cancer_type",
        title=None
    )
).properties(
    title="Chr 8p cis equivalence"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was equivalent in

In [6]:
def _sorted_unique_cancers(cancer_types):
    """Return the distinct values of a cancer_type column as a sorted list."""
    return cancer_types.drop_duplicates().sort_values().tolist()


# For every equivalent protein, collect the cancer types it was equivalent in
prots_summary = prots.groupby("protein").agg(
    cancers=("cancer_type", _sorted_unique_cancers)
)

# Order by number of cancers (most first); break ties alphabetically using the
# concatenated cancer names as a temporary sort key
cancer_lists = prots_summary["cancers"]
prots_summary = (
    prots_summary
    .assign(
        num_cancers=cancer_lists.apply(len),
        tmp_sort=cancer_lists.apply("".join),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
MSR1,"[brca, colon, hnscc, lscc, luad, ovarian]",6
CLU,"[brca, colon, hnscc, lscc]",4
DEFA4,"[brca, colon, hnscc, lscc]",4
TTI2,"[brca, colon, hnscc, lscc]",4
POLR3D,"[brca, colon, lscc, luad]",4
PDLIM2,"[brca, colon, lscc, ovarian]",4
ADAMDEC1,"[brca, hnscc, lscc, ovarian]",4
LOXL2,"[brca, colon, lscc]",3
DOK2,"[brca, colon, ovarian]",3
LPL,"[brca, hnscc, lscc]",3


## Check that there isn't overlap between proteins found as different and proteins found as equivalent

If there is overlap, we probably were too lenient in our bounds for the TOST test.

In [7]:
# Load the summary of proteins found to be different, and turn the
# underscore-joined cancer names back into lists
diff_summary_path = os.path.join("..", "data", f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_common_diff.tsv")
df = pd.read_csv(diff_summary_path, sep="\t")
df = df.assign(cancers=df["cancers"].apply(lambda x: x.split("_")))

# Keep only proteins that also appear in the equivalent summary, and put the
# "different" and "equivalent" columns side by side
shared = df[df["protein"].isin(prots_summary.index)]
shared = shared.merge(
    right=prots_summary.reset_index(drop=False),
    on="protein",
    how="left",
    suffixes=("_diff", "_equiv"),
)

# For each row, find the cancers where the protein was called both different and
# equivalent. Walking the two columns in lockstep avoids re-filtering the whole
# table for every protein and always uses the row's own cancer lists.
ints = [
    set(diff_cancers).intersection(equiv_cancers)
    for diff_cancers, equiv_cancers in zip(shared["cancers_diff"], shared["cancers_equiv"])
]
cts = [len(intersection) for intersection in ints]

shared = shared.assign(
    intersection=ints,
    intersection_size=cts
)

# Any non-zero intersection_size sorts to the top and signals an overlap
shared.sort_values(by="intersection_size", ascending=False)

Unnamed: 0,protein,cancers_diff,mean_simp_change,num_cancers_diff,cancers_equiv,num_cancers_equiv,intersection,intersection_size
0,MTMR9,"[brca, hnscc, lscc, luad, ovarian]",-1.0,5,[colon],1,{},0
16,KBTBD11,"[hnscc, luad]",-1.0,2,"[brca, lscc, ovarian]",3,{},0
29,PNMA2,[luad],-1.0,1,[hnscc],1,{},0
28,PBK,[luad],-1.0,1,[lscc],1,{},0
27,LPL,[luad],-1.0,1,"[brca, hnscc, lscc]",3,{},0
26,KCTD9,[luad],-1.0,1,"[hnscc, lscc, ovarian]",3,{},0
25,DEFA4,[luad],-1.0,1,"[brca, colon, hnscc, lscc]",4,{},0
24,BLK,[luad],-1.0,1,[ovarian],1,{},0
23,TDRP,[hnscc],-1.0,1,[brca],1,{},0
22,SORBS3,[hnscc],-1.0,1,"[colon, lscc]",2,{},0


## Out of curiosity, look at the t-test p values for difference, for the equivalent proteins

In [8]:
ttest_diff_res_path = os.path.join("..", "data", f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_ttest.tsv")

ttres = pd.read_csv(ttest_diff_res_path, sep="\t")

# Get the difference p values for all proteins that passed.
# A single inner merge on (protein, cancer_type) replaces the row-by-row
# DataFrame.append loop: append was removed in pandas 2.0 and copied the growing
# table on every iteration. The merge also keeps ttres's columns when nothing
# matches, so the chart below still has an "adj_p" field to bin.
diffs = prots[["protein", "cancer_type"]].merge(
    ttres,
    on=["protein", "cancer_type"],
    how="inner",
)[ttres.columns]

# Plot the distribution of the difference p values for the equivalent proteins
alt.Chart(diffs).mark_bar().encode(
    x=alt.X(
        "adj_p",
        bin=alt.Bin(step=0.05),
        scale=alt.Scale(
            domain=[0, 1]
        )
    ),
    y=alt.Y(
        "count()"
    )
).properties(
    title="Distribution of difference p values for equivalent proteins"
)

In [9]:
# Plot the distribution of all the difference p values
alt.Chart(ttres).mark_bar().encode(
    x=alt.X(
        "adj_p",
        bin=alt.Bin(step=0.05),
        scale=alt.Scale(
            domain=[0, 1]
        )
    ),
    y=alt.Y(
        "count()"
    )
).properties(
    title="Distribution of difference p values for all proteins"
)

## Save results

In [10]:
# Flatten each list of cancers into one underscore-joined string so it fits in a
# single TSV field.
# NOTE: this overwrites prots_summary, so re-running the cell would join the
# characters of the already-joined strings.
joined_cancers = prots_summary["cancers"].apply("_".join)
prots_summary = prots_summary.assign(cancers=joined_cancers)

# Write the summary, keeping the protein index as the first column
output_file = os.path.join(
    "..",
    "data",
    f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_common_equiv.tsv",
)
prots_summary.to_csv(output_file, sep="\t")