# Find proteins outside the event (trans) that are commonly different between samples with and without the event

## Setup

In [1]:
import cnvutils
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Read the three parameter files. Each sits in a "data" directory at a
# different level above this notebook (three, two and one level up).
gen_params_path = os.path.join("..", "..", "..", "data", "gen_params.json")
gen_params = cnvutils.load_params(gen_params_path)
PANCAN = gen_params["PANCAN"]

chr_params_path = os.path.join("..", "..", "data", "chr_params.json")
chr_params = cnvutils.load_params(chr_params_path)
CHROMOSOME = chr_params["CHROMOSOME"]

arm_params_path = os.path.join("..", "data", "arm_params.json")
arm_params = cnvutils.load_params(arm_params_path)
ARM = arm_params["ARM"]

# This notebook handles the trans case; the value is embedded in file names
# and in the chart title below.
CIS_OR_TRANS = "trans"

# Tab-separated t-test results for this chromosome arm. The file name encodes
# the chromosome, arm, cis/trans setting and which data source was used.
read_path = os.path.join(
    "..",
    "data",
    f"chr{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_00_ttest_{'harmonized' if PANCAN else 'AWG'}.tsv",
)

# The rest of the notebook refers to the "Name" column as "protein".
ttest_results = pd.read_csv(read_path, sep="\t")
ttest_results = ttest_results.rename(columns={"Name": "protein"})

In [3]:
# Display the loaded t-test table as the cell output.
ttest_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,A1BG,ENSP00000263100.2,0.974239,-0.065965
1,brca,A2M,ENSP00000323929.8,0.945808,0.114934
2,brca,A2ML1,ENSP00000299698.7,0.974559,0.151738
3,brca,AAAS,ENSP00000209873.4,0.815264,0.135414
4,brca,AACS,ENSP00000324842.6,0.974239,0.078964
5,brca,AADAT,ENSP00000226840.4,0.993179,
6,brca,AAGAB,ENSP00000261880.5,0.995042,0.020828
7,brca,AAK1,ENSP00000386456.3,0.986692,-0.019177
8,brca,AAMDC,ENSP00000377078.2,0.945808,-0.126203
9,brca,AAMP,ENSP00000416394.1,0.996273,-0.007667


## Select the proteins with a significant change, and take a detour to make some plots

In [4]:
# Proteins whose adjusted p-value passes the 0.05 cutoff. `prots` is reused by
# later cells; `prots_cts` holds the number of non-null protein names per
# cancer type.
prots = ttest_results[ttest_results["adj_p"] <= 0.05].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

# Proteins that fail the cutoff. A row with a missing adj_p satisfies neither
# comparison, so it is counted in neither group.
fail_prots = ttest_results[ttest_results["adj_p"] > 0.05].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

# Tag each count table so the two can be stacked into one long-form table.
prots_cts.insert(0, "count_type", "Significant difference")
fail_cts.insert(0, "count_type", "No significant difference")

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the same rows in the same order on every pandas version.
counts = pd.concat([prots_cts, fail_cts]).sort_index().reset_index(drop=False)

# Build each encoding on its own, then assemble the chart step by step.

# x: one bar per count type, with the axis title and tick labels turned off.
count_type_x = alt.X(
    "count_type",
    axis=alt.Axis(title=None, labels=False),
    sort=["Significant difference"],
)

# y: the per-cancer protein count held in the "protein" column of `counts`.
protein_count_y = alt.Y(
    "protein",
    axis=alt.Axis(title="Number of proteins"),
)

# color: fixed colors for the two count types.
count_type_color = alt.Color(
    "count_type",
    title=None,
    sort=["Significant difference"],
    scale=alt.Scale(
        domain=["Significant difference", "No significant difference"],
        range=["#2d3da4", "#d1d1d1"],
    ),
)

# One facet column per cancer type.
cancer_type_column = alt.Column("cancer_type", title=None)

bars = alt.Chart(counts).mark_bar().encode(
    x=count_type_x,
    y=protein_count_y,
    color=count_type_color,
)
faceted_bars = bars.facet(column=cancer_type_column)
titled_bars = faceted_bars.properties(
    title=f"Chr {CHROMOSOME}{ARM} {CIS_OR_TRANS} effects"
)
trans_effects_barchart = titled_bars.configure_title(anchor="middle")

trans_effects_barchart

In [5]:
# Save the chart, using the output format and scale factor from the general
# parameters.
chart_fmt = gen_params["CHART_FORMAT"]
chart_scale = gen_params["CHART_SCALE"]

chart_dir = os.path.join("..", "data", "charts_img")
# Create the output directory if it is missing so that saving does not fail on
# a fresh checkout; this is a no-op when the directory already exists.
os.makedirs(chart_dir, exist_ok=True)

trans_effects_barchart_path = os.path.join(
    chart_dir,
    f"chr{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_05_sig_effects_barchart_{'harmonized' if PANCAN else 'AWG'}_altair.{chart_fmt}"
)

trans_effects_barchart.save(trans_effects_barchart_path, scale_factor=chart_scale)

INFO:tornado.access:200 GET / (::1) 13.76ms
INFO:tornado.access:200 GET / (::1) 13.76ms
INFO:tornado.access:200 GET /vega.js (::1) 10.10ms
INFO:tornado.access:200 GET /vega.js (::1) 10.10ms
INFO:tornado.access:200 GET /vega-lite.js (::1) 15.38ms
INFO:tornado.access:200 GET /vega-lite.js (::1) 15.38ms
INFO:tornado.access:200 GET /vega-embed.js (::1) 16.09ms
INFO:tornado.access:200 GET /vega-embed.js (::1) 16.09ms


## Count the number of cancer types in which each protein differed significantly

In [6]:
def make_simple_change(change_val):
    """Reduce a change value to its direction.

    Returns 1 for a positive value, -1 for a negative value and 0 for zero.
    A value that is none of these (NaN compares false against everything)
    yields None.
    """
    if change_val > 0:
        return 1
    if change_val < 0:
        return -1
    if change_val == 0:
        return 0
    # Reached only when every comparison above is false, e.g. for NaN.
    return None

# Add the direction (-1, 0 or 1) of each significant change as a new column.
change_direction = prots["change"].apply(make_simple_change)
prots = prots.assign(simplified_change=change_direction)

In [7]:
# Display the significant proteins, now including the simplified_change column.
prots

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change,simplified_change
0,brca,CNOT8,ENSP00000285896.6,1.927575e-08,,
1,colon,AGO2,ENSP00000220592.5,3.322784e-02,0.179126,1.0
2,colon,ATP6V1A,ENSP00000273398.3,1.242824e-03,-0.214406,-1.0
3,colon,ATP6V1D,ENSP00000216442.7,2.816767e-02,-0.190343,-1.0
4,colon,ATP6V1E1,ENSP00000253413.5,1.242824e-03,-0.255901,-1.0
5,colon,ATP6V1H,ENSP00000347359.3,8.030057e-03,-0.220439,-1.0
6,colon,COQ8A,ENSP00000355739.3,4.589173e-02,0.320267,1.0
7,colon,CRELD2,ENSP00000332223.4,3.427443e-02,-0.470108,-1.0
8,colon,CSNK2A2,ENSP00000262506.3,9.966115e-03,0.163760,1.0
9,colon,DICER1,ENSP00000343745.3,3.427443e-02,-0.221849,-1.0


In [8]:
# Collapse the significant hits to one row per protein:
#   cancers          - sorted, de-duplicated list of cancer types with a hit
#   mean_simp_change - mean of the simplified_change values across those hits
# The string "mean" replaces np.mean: pandas already substitutes its own
# grouped mean for np.mean here, and passing the NumPy callable raises a
# FutureWarning from pandas 2.1 onward.
prots_summary = prots.groupby("protein").agg(
    cancers=(
        "cancer_type",
        lambda x: x.sort_values().drop_duplicates(keep="first").tolist(),
    ),
    mean_simp_change=("simplified_change", "mean"),
)

# Order proteins by how many cancer types they appear in (most first), using
# the concatenated cancer names as a temporary tie-breaker column.
prots_summary = (
    prots_summary
    .assign(
        num_cancers=prots_summary["cancers"].apply(len),
        tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x)),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATP6V1E1,"[colon, hnscc, lscc, luad]",-1.0,4
CNOT8,"[brca, lscc, luad]",,3
ATP6V1H,"[colon, lscc, luad]",-1.0,3
AGO2,"[colon, lscc]",1.0,2
ATP6V1D,"[colon, lscc]",-1.0,2
COQ8A,"[colon, lscc]",1.0,2
CRELD2,"[colon, lscc]",-1.0,2
CSNK2A2,"[colon, lscc]",1.0,2
DNAJC19,"[colon, lscc]",1.0,2
FNDC3A,"[colon, lscc]",-1.0,2


In [9]:
# Tally how many proteins fall at each num_cancers value, listed from the
# largest number of cancer types down to the smallest.
num_cancers_tally = prots_summary["num_cancers"].value_counts()
num_cancers_tally.sort_index(ascending=False)

4       1
3       2
2      21
1    3221
Name: num_cancers, dtype: int64

## Save results

In [10]:
# Flatten each protein's list of cancer types into a single
# underscore-delimited string before writing the table out.
joined_cancers = prots_summary["cancers"].apply("_".join)
prots_summary = prots_summary.assign(cancers=joined_cancers)

# The output file name follows the same pattern as the input t-test file.
output_name = f"chr{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_02_common_diff_{'harmonized' if PANCAN else 'AWG'}.tsv"
output_file = os.path.join("..", "data", output_name)

prots_summary.to_csv(output_file, sep="\t")