# Find proteins within the event (cis) that are commonly different between samples with and without the event

## Setup

In [1]:
import cnvutils
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Read the general, chromosome-level, and arm-level parameter files
gen_params_path = os.path.join("..", "..", "..", "data", "gen_params.json")
gen_params = cnvutils.load_params(gen_params_path)
PANCAN = gen_params["PANCAN"]

chr_params_path = os.path.join("..", "..", "data", "chr_params.json")
chr_params = cnvutils.load_params(chr_params_path)
CHROMOSOME = chr_params["CHROMOSOME"]

arm_params_path = os.path.join("..", "data", "arm_params.json")
arm_params = cnvutils.load_params(arm_params_path)
ARM = arm_params["ARM"]

# This notebook looks at cis effects; the value is also embedded in file names
CIS_OR_TRANS = "cis"

# Tab-separated t-test results for this chromosome arm
read_path = os.path.join(
    "..",
    "data",
    f"chr{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_02_ttest_{'harmonized' if PANCAN else 'AWG'}.tsv",
)

# Load the table and rename the "Name" column to "protein"
ttest_results = (
    pd.read_csv(read_path, sep="\t")
    .rename(columns={"Name": "protein"})
)

In [3]:
# Show the loaded t-test results table
ttest_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,AARD,ENSP00000367528.3,5.345104e-01,-0.213243
1,brca,ADCK5,ENSP00000310547.6,2.427739e-01,0.111281
2,brca,ADHFE1,ENSP00000379865.3,2.003101e-01,
3,brca,AGO2,ENSP00000220592.5,9.647512e-05,0.241091
4,brca,ANKRD46,ENSP00000335287.3,2.721848e-01,0.205505
5,brca,ARFGEF1,ENSP00000262215.3,9.401769e-06,0.462920
6,brca,ARHGAP39,ENSP00000276826.5,5.480157e-02,
7,brca,ARMC1,ENSP00000276569.3,1.210775e-05,0.282442
8,brca,ASAP1,ENSP00000429900.1,9.952948e-03,
9,brca,ATAD2,ENSP00000287394.5,1.077610e-09,0.798935


## Select the proteins with a significant change, and take a detour to make some plots

In [4]:
# Split the t-test results at the 0.05 adjusted-p cutoff and count proteins per
# cancer type on each side. Rows whose adj_p is NaN satisfy neither comparison,
# so they are counted in neither group.
prots = ttest_results[ttest_results["adj_p"] <= 0.05].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

fail_prots = ttest_results[ttest_results["adj_p"] > 0.05].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

# Label each count table so the two groups can be told apart once stacked
prots_cts.insert(0, "count_type", "p <= 0.05")
fail_cts.insert(0, "count_type", "p > 0.05")

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
counts = (
    pd.concat([prots_cts, fail_cts])
    .sort_index()
    .reset_index(drop=False)
)

# Relabel two cancer types with shorter codes and upper-case all of them
counts = counts.assign(
    cancer_type=counts["cancer_type"].replace({
        "colon": "coad",
        "ovarian": "ov",
    }).str.upper(),
)

# Define each encoding channel on its own, then assemble the bar chart
x_encoding = alt.X(
    "cancer_type",
    axis=alt.Axis(title="Cancer type", labelAngle=40),
)
y_encoding = alt.Y(
    "protein",
    axis=alt.Axis(title="Number of proteins"),
)
color_encoding = alt.Color(
    "count_type",
    title=None,
    scale=alt.Scale(
        domain=["p <= 0.05", "p > 0.05"],
        range=["#2d3da4", "#d1d1d1"],
    ),
)
# Order the bar segments by significance group
segment_order = alt.Order("count_type", sort="ascending")

chart_title = f"Chr {CHROMOSOME}{ARM} {CIS_OR_TRANS} effects"

cis_effects_barchart = (
    alt.Chart(counts)
    .mark_bar()
    .encode(
        x=x_encoding,
        y=y_encoding,
        color=color_encoding,
        order=segment_order,
    )
    .properties(title=chart_title)
    .configure_title(anchor="middle")
)

cis_effects_barchart

In [5]:
# Save the chart, using the output format and scale from the general parameters
chart_fmt = gen_params["CHART_FORMAT"]
chart_scale = gen_params["CHART_SCALE"]

chart_dir = os.path.join("..", "data", "charts_img")
# Create the output directory if it is missing so the save below does not fail
# on a fresh checkout
os.makedirs(chart_dir, exist_ok=True)

cis_effects_barchart_path = os.path.join(
    chart_dir,
    f"chr{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_05_sig_effects_barchart_{'harmonized' if PANCAN else 'AWG'}_altair.{chart_fmt}"
)

cis_effects_barchart.save(cis_effects_barchart_path, scale_factor=chart_scale)

INFO:tornado.access:200 GET / (::1) 4.56ms
INFO:tornado.access:200 GET / (::1) 4.56ms
INFO:tornado.access:200 GET /vega.js (::1) 3.00ms
INFO:tornado.access:200 GET /vega.js (::1) 3.00ms
INFO:tornado.access:200 GET /vega-lite.js (::1) 1.74ms
INFO:tornado.access:200 GET /vega-lite.js (::1) 1.74ms
INFO:tornado.access:200 GET /vega-embed.js (::1) 5.32ms
INFO:tornado.access:200 GET /vega-embed.js (::1) 5.32ms


## Find how many cancers each protein was different in

In [6]:
def make_simple_change(change_val):
    """Reduce a change value to its sign.

    Parameters:
    change_val: A numeric change value; may be missing (NaN or None).

    Returns:
    0 if the value is zero, 1 if it is positive, -1 if it is negative, and
    NaN if it is missing.
    """
    # A missing value fails every comparison below. Return NaN explicitly
    # instead of falling off the end of the function and returning None, so a
    # column made up entirely of missing values still comes out numeric.
    if pd.isna(change_val):
        return np.nan
    if change_val == 0:
        return 0
    if change_val > 0:
        return 1
    return -1

# Add a column holding just the direction of each protein's change
change_direction = prots["change"].apply(make_simple_change)
prots = prots.assign(simplified_change=change_direction)

In [7]:
# Show the significant proteins along with their simplified change direction
prots

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change,simplified_change
0,brca,AGO2,ENSP00000220592.5,9.647512e-05,0.241091,1.0
1,brca,ARFGEF1,ENSP00000262215.3,9.401769e-06,0.462920,1.0
2,brca,ARMC1,ENSP00000276569.3,1.210775e-05,0.282442,1.0
3,brca,ASAP1,ENSP00000429900.1,9.952948e-03,,
4,brca,ATAD2,ENSP00000287394.5,1.077610e-09,0.798935,1.0
5,brca,ATP6V1C1,ENSP00000379203.3,1.575602e-04,0.250548,1.0
6,brca,BOP1,ENSP00000455106.1,9.887669e-05,0.321610,1.0
7,brca,C8orf33,ENSP00000330361.6,5.632266e-05,0.513392,1.0
8,brca,C8orf82,ENSP00000436621.1,4.452831e-04,0.373747,1.0
9,brca,CHCHD7,ENSP00000306425.3,5.890290e-04,0.388467,1.0


In [8]:
# For each protein, list the sorted, de-duplicated cancer types in which it was
# significant, and average the direction of change across those rows.
prots_summary = prots.groupby("protein").agg(
    cancers=(
        "cancer_type",
        lambda x: x.sort_values().drop_duplicates(keep="first").tolist(),
    ),
    # Use the string alias: passing np.mean to groupby.agg is deprecated in
    # pandas >= 2.1, and "mean" keeps the current behavior
    mean_simp_change=("simplified_change", "mean"),
)

# Rank proteins by how many cancers they appear in (most first), breaking ties
# by the concatenated cancer names
prots_summary = (
    prots_summary
    .assign(
        num_cancers=prots_summary["cancers"].apply(len),
        # Temporary sort key only; dropped again below
        tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x)),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARFGEF1,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
CPNE3,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
GGH,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
LACTB2,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
MTERF3,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
MTFR1,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
OTUD6B,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
POP1,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
RIDA,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6
STK3,"[brca, colon, hnscc, lscc, luad, ovarian]",1.0,6


In [9]:
# Tally how many proteins fall at each num_cancers value, largest value first
prots_summary["num_cancers"].value_counts().sort_index(ascending=False)

6    15
5    16
4    21
3    25
2    29
1    48
Name: num_cancers, dtype: int64

## Save results

In [10]:
# Flatten each list of cancer types into one underscore-separated string for
# the TSV. The isinstance check keeps this cell idempotent: without it,
# re-running the cell would join the individual characters of the strings
# produced by the first run.
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(
        lambda x: x if isinstance(x, str) else "_".join(x)
    )
)

output_file = os.path.join(
    "..",
    "data",
    f"chr{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_05_common_diff_{'harmonized' if PANCAN else 'AWG'}.tsv",
)
prots_summary.to_csv(output_file, sep="\t")

## Info for most common proteins

- [AGPAT5](https://www.uniprot.org/uniprot/Q9NUQ2)  (colon, hnscc, lscc, luad, ovarian)     5
    - Converts 1-acyl-sn-glycerol-3-phosphate (lysophosphatidic acid or LPA) into 1,2-diacyl-sn-glycerol-3-phosphate (phosphatidic acid or PA) by incorporating an acyl moiety at the sn-2 position of the glycerol backbone (PubMed:21173190). Acts on LPA containing saturated or unsaturated fatty acids C15:0-C20:4 at the sn-1 position using C18:1-CoA as the acyl donor (PubMed:21173190). Also acts on lysophosphatidylethanolamine using oleoyl-CoA, but not arachidonoyl-CoA, and lysophosphatidylinositol using arachidonoyl-CoA, but not oleoyl-CoA (PubMed:21173190). Activity toward lysophosphatidylglycerol not detectable (PubMed:21173190).
- ATP6V1B2 	(colon, hnscc, lscc, luad, ovarian) 	5
- CCDC25 	(colon, hnscc, lscc, luad, ovarian) 	5
- CHMP7 	(colon, hnscc, lscc, luad, ovarian) 	5
- [ERI1](https://www.uniprot.org/uniprot/Q8IV48)    (colon, hnscc, lscc, luad, ovarian)     5
    - RNA exonuclease that binds to the 3'-end of histone mRNAs and degrades them, suggesting that it plays an essential role in histone mRNA decay after replication. 2'- and 3'-hydroxyl groups at the last nucleotide of the histone 3'-end are required for efficient degradation of RNA substrates. Also able to degrade the 3'-overhangs of short interfering RNAs (siRNAs) in vitro, suggesting a possible role as regulator of RNA interference (RNAi). Binding requires the 5'-ACCCA-3' sequence present in the stem-loop structure. Able to bind other mRNAs. Required for 5.8S rRNA 3'-end processing. Also binds to 5.8S ribosomal RNA. Binds with high affinity to the stem-loop structure of replication-dependent histone pre-mRNAs.
- [PPP2CB](https://www.uniprot.org/uniprot/P62714)  (colon, hnscc, lscc, luad, ovarian)     5
    - PP2A can modulate the activity of phosphorylase B kinase, casein kinase 2, mitogen-stimulated S6 kinase, and MAP-2 kinase.
- [PPP2R2A](https://www.uniprot.org/uniprot/P63151)     (colon, hnscc, lscc, luad, ovarian)     5
    - The B regulatory subunit might modulate substrate selectivity and catalytic activity, and also might direct the localization of the catalytic enzyme to a particular subcellular compartment.
- [VPS37A](https://www.uniprot.org/uniprot/Q8NEZ2)  (colon, hnscc, lscc, luad, ovarian)     5
    - Component of the ESCRT-I complex, a regulator of vesicular trafficking process. Required for the sorting of endocytic ubiquitinated cargos into multivesicular bodies. May be involved in cell growth and differentiation.
- [XPO7](https://www.uniprot.org/uniprot/Q9UIA9)    (colon, hnscc, lscc, luad, ovarian)  5
    - Mediates the nuclear export of proteins (cargos) with broad substrate specificity. In the nucleus binds cooperatively to its cargo and to the GTPase Ran in its active GTP-bound form. Docking of this trimeric complex to the nuclear pore complex (NPC) is mediated through binding to nucleoporins. Upon transit of a nuclear export complex into the cytoplasm, disassembling of the complex and hydrolysis of Ran-GTP to Ran-GDP (induced by RANBP1 and RANGAP1, respectively) cause release of the cargo from the export receptor. XPO7 then returns to the nuclear compartment and mediates another round of transport. The directionality of nuclear export is thought to be conferred by an asymmetric distribution of the GTP- and GDP-bound forms of Ran between the cytoplasm and nucleus.