# Investigate normal expression of cis proteins that show equivalence between samples with and without the event

We want to know why these proteins don't show a significant change even when the arm is deleted. It's possible that they have low expression to begin with, so the change from the deletion isn't noticeable.

Unfortunately, we can only get relative expression levels from the CPTAC data, not absolute expression levels. So instead of using CPTAC data, we're going to be working with tissue-specific absolute expression data from this paper: Wang D, Eraslan B, Wieland T, et al. A deep proteome and transcriptome abundance atlas of 29 healthy human 
tissues. Mol Syst Biol. 2019;15(2):e8503. Published 2019 Feb 18. doi:10.15252/msb.20188503

We downloaded the specific table we need, Table EV5, from https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.20188503&file=msb188503-sup-0007-TableEV5.zip

In [1]:
import pandas as pd
import numpy as np
import os
import cnvutils
import scipy.stats
import altair as alt
from toolz import pipe

In [2]:
alt.data_transformers.disable_max_rows()


def json_dir(data, data_dir):
    """Altair data transformer that stores chart data as JSON files under `data_dir`.

    The directory is created if it does not exist yet; `data` is then handed to
    `alt.to_json` with a filename template located inside that directory.
    """
    os.makedirs(data_dir, exist_ok=True)
    filename_template = os.path.join(data_dir, "{prefix}-{hash}.{extension}")
    write_json = alt.to_json(filename=filename_template)
    return pipe(data, write_json)


# Make the transformer available under the name "json_dir" and switch to it,
# writing the data files into ./plot_data
alt.data_transformers.register("json_dir", json_dir)
alt.data_transformers.enable("json_dir", data_dir="plot_data")

DataTransformerRegistry.enable('json_dir')

In [3]:
# Which chromosome and arm we're working on comes from the shared params file;
# this notebook always looks at cis effects
params_path = os.path.join("..", "data", "params.json")
params = cnvutils.load_params(params_path)
CHROMOSOME, ARM = params["CHROMOSOME"], params["ARM"]
CIS_OR_TRANS = "cis"

## Transform to log(x + 1), and exclude zeros

Based on the plots in the `00_setup/00_normal_dist_exploration` notebook, it looks like our best option is to use a log10(x + 1) scale and exclude zeros. From a biological perspective, I'm fine with excluding the zeros, because proteins that aren't expressed seem to be a different class from proteins that just have low expression. It is important to remember, though, that some proteins with zero copies may simply have been expressed at levels too low to be detected. Nevertheless, the huge number of proteins with zero copies suggests that they aren't just a few proteins escaping detection.

In [4]:
expr = cnvutils.get_normal_expr_table()

# Reshape to long format: one row per (Gene_name, tissue_type) combination.
# Every column left after dropping the ID columns is melted into tissue_type.
expr_long = (
    expr
    .drop(columns=["Gene_ID", "Protein_ID"])
    .melt(
        id_vars="Gene_name",
        var_name="tissue_type",
        value_name="prot_copy_count",
    )
)

# Apply log10(x + 1), then keep only strictly positive values. A raw count of
# zero maps to log10(0 + 1) == 0, so this is what excludes the zeros.
log10_counts = np.log10(expr_long["prot_copy_count"] + 1)
expr_plus1_log10 = expr_long.assign(prot_copy_count=log10_counts)
is_nonzero = expr_plus1_log10["prot_copy_count"] > 0
expr_transf = expr_plus1_log10[is_nonzero]

## Calculate "low" cutoff for each tissue type

In [5]:
# The "low" cutoff for a tissue is the 25th percentile of its transformed
# protein copy counts. The numeric column is selected explicitly because
# expr_transf also carries the string column Gene_name, which has no quantile:
# older pandas dropped it with a warning, newer pandas raises a TypeError.
low_cutoffs = expr_transf.groupby("tissue_type")[["prot_copy_count"]].quantile(0.25)
low_cutoffs

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,prot_copy_count
tissue_type,Unnamed: 1_level_1
Adrenal gland,3.639864
Appendix,3.580872
Brain,3.593914
Colon,3.513825
Duodenum,3.851583
Endometrium,3.617557
Esophagus,3.797054
Fallopian tube,3.501528
Fat,3.843306
Gallbladder,3.787582


## Compare expression of equivalent genes

In [6]:
# Tab-separated table of equivalent proteins for this arm and effect type
common_equiv_path = os.path.join(
    "..", "data", f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_common_equiv.tsv"
)
equiv_prots = pd.read_csv(common_equiv_path, sep="\t")
equiv_prots

Unnamed: 0,protein,cancers,num_cancers
0,MSR1,brca_colon_hnscc_lscc_luad_ovarian,6
1,CLU,brca_colon_hnscc_lscc,4
2,DEFA4,brca_colon_hnscc_lscc,4
3,TTI2,brca_colon_hnscc_lscc,4
4,POLR3D,brca_colon_lscc_luad,4
5,PDLIM2,brca_colon_lscc_ovarian,4
6,ADAMDEC1,brca_hnscc_lscc_ovarian,4
7,LOXL2,brca_colon_lscc,3
8,DOK2,brca_colon_ovarian,3
9,LPL,brca_hnscc_lscc,3


In [7]:
# Keep the proteins that are listed for at least 3 cancer types
commonly_equiv = equiv_prots[equiv_prots["num_cancers"] >= 3]

# Pull the raw expression of those proteins in the tissues of interest, and
# reshape so that rows are tissues and columns are (Gene_name, Protein_ID) pairs
tissue_cols = ["Colon", "Esophagus", "Lung", "Ovary"]
is_commonly_equiv = expr["Gene_name"].isin(commonly_equiv["protein"])
sel_raw = (
    expr
    .loc[is_commonly_equiv, ["Gene_name", "Protein_ID"] + tissue_cols]
    .set_index(["Gene_name", "Protein_ID"])
    .transpose()
)

# Flatten the column MultiIndex into plain tuples before joining the cutoffs table
sel_raw.columns = sel_raw.columns.to_flat_index()

# Transform the data to log10(x + 1), join in cutoffs column (matched on tissue name)
sel_with_cutoffs = np.log10(sel_raw + 1).join(low_cutoffs, how="left")

# Flag every value that is at or below the cutoff of its tissue, then flip back
# so that rows are proteins and columns are tissues
tissue_cutoffs = sel_with_cutoffs["prot_copy_count"]
sel_expr = (
    sel_with_cutoffs
    .drop(columns="prot_copy_count")
    .le(tissue_cutoffs, axis="index")
    .transpose()
)

# Rebuild the (Gene_name, Protein_ID) index and keep only the gene name level
sel_expr.index = pd.MultiIndex.from_tuples(sel_expr.index).droplevel(1)

sel_expr

Unnamed: 0,Colon,Esophagus,Lung,Ovary
ADAMDEC1,False,True,False,True
CLU,False,False,False,False
CLU,True,True,True,True
DEFA4,True,False,False,True
DLC1,True,True,True,True
DOK2,False,False,False,False
KBTBD11,False,False,False,False
KCTD9,True,False,True,True
LOXL2,False,False,False,False
LPL,False,False,True,True


In [8]:
# Dimensions of the below-cutoff table: one row per protein entry, one column per tissue
sel_expr.shape

(19, 4)

In [9]:
# Per tissue, count the protein entries at or below that tissue's cutoff (each True counts as 1)
sel_expr.sum(axis="index")

Colon        8
Esophagus    6
Lung         7
Ovary        9
dtype: int64

### Plot on the distribution

In [10]:
def plot_cancer_type_normal_dist(cancer_type, tissue_type, expr_data, sel_genes):
    """Plot one tissue's expression distribution and mark the selected genes on it.

    Parameters
    ----------
    cancer_type : str
        Used as the title of the returned chart.
    tissue_type : str
        Value of the `tissue_type` column used to filter `expr_data`.
    expr_data : pandas.DataFrame
        Long-format table with `Gene_name`, `tissue_type` and
        `prot_copy_count` columns.
    sel_genes : iterable of str
        Gene names to mark with a vertical rule. A name that occurs more than
        once is drawn once, and names with no row in the filtered tissue data
        are skipped.

    Returns
    -------
    altair.LayerChart
        Histogram of `prot_copy_count` for the tissue with one red rule per
        marked gene.
    """

    # Select the expression data for our chosen tissue type
    expr_data = expr_data[expr_data["tissue_type"] == tissue_type]

    dist = alt.Chart().mark_bar().encode(
        x=alt.X(
            "prot_copy_count",
            bin=alt.Bin(step=0.25),
            title="Protein copy counts (log transformed)"
        ),
        y=alt.Y(
            "count()",
            title="Number of proteins"
        )
    ).properties(
        width=800
    )

    # Get the expression levels for the selected proteins in this tissue. The
    # values are turned into strings because they are passed to
    # transform_calculate below as the expressions of new fields named after
    # the genes. If a gene has several rows, to_dict keeps the last one.
    expr_no_cis = expr_data.\
    loc[expr_data["Gene_name"].isin(sel_genes)].\
    set_index("Gene_name")["prot_copy_count"].\
    astype(str).\
    to_dict()

    # Only genes that got a calculated field can be drawn. dict.fromkeys
    # removes repeated names while keeping their original order, so each gene
    # yields exactly one rule layer.
    genes_to_mark = [
        gene
        for gene in dict.fromkeys(sel_genes)
        if gene in expr_no_cis
    ]

    # Add vertical lines at the expression levels of the proteins that didn't show a cis effect
    lines_no_cis = [
        alt.Chart().mark_rule(color=alt.Value("#d10000")).encode(
            x=f"{gene}:Q"
        )

        for gene in genes_to_mark
    ]

    # Layer all the charts together
    chart = alt.layer(
        dist,
        *lines_no_cis,
        data=expr_data
    ).transform_calculate(
        **expr_no_cis
    ).properties(
        title=cancer_type
    )

    return chart

In [11]:
# Normal tissue used as the reference for each cancer type
cancer_tissue = {
    "colon": "Colon",
    "hnscc": "Esophagus",
    "luad": "Lung",
    "lscc": "Lung",
    "ovarian": "Ovary"
}

# One distribution chart per cancer type, each marking the same selected genes
genes_to_mark = sel_expr.index.tolist()
tissue_charts = [
    plot_cancer_type_normal_dist(
        cancer_type=cancer,
        tissue_type=tissue,
        expr_data=expr_transf,
        sel_genes=genes_to_mark,
    )
    for cancer, tissue in cancer_tissue.items()
]

# Stack the charts vertically under a shared, centered title
alt.vconcat(*tissue_charts).properties(
    title=f"Normal expression of proteins that showed no cis effect for {CHROMOSOME}{ARM}"
).configure_title(
    anchor="middle",
    fontSize=14
)

## Look at cancers individually

In [12]:
# Tab-separated table of equivalent proteins, with a cancer_type column used below
each_cancer_equiv_path = os.path.join(
    "..", "data", f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_equiv.tsv"
)
each_cancer_equiv = pd.read_csv(each_cancer_equiv_path, sep="\t")

In [13]:
# Normal tissue used as the reference for each cancer type
cancer_tissue = {
    "colon": "Colon",
    "hnscc": "Esophagus",
    "luad": "Lung",
    "lscc": "Lung",
    "ovarian": "Ovary"
}

# Maps each cancer type to its table of below-cutoff flags
cancer_expr = {}

for cancer_type, tissue_type in cancer_tissue.items():

    # Proteins listed for this cancer type, and their raw expression in the
    # matching normal tissue. Loop-local names are used on purpose so that the
    # sel_expr table built earlier in the notebook is not overwritten.
    cancer_equiv = each_cancer_equiv[each_cancer_equiv["cancer_type"] == cancer_type]
    tissue_raw = expr.loc[
        expr["Gene_name"].isin(cancer_equiv["protein"]),
        ["Gene_name", "Protein_ID", tissue_type]
    ].\
    set_index(["Gene_name", "Protein_ID"]).\
    transpose()

    tissue_raw.columns = tissue_raw.columns.to_flat_index()

    # Transform the data to log10(x + 1), join in cutoffs column
    tissue_log = tissue_raw.\
    apply(lambda x: np.log10(x + 1), axis="columns").\
    join(low_cutoffs, how="left")

    # True where the value is at or below the cutoff of this tissue
    below_cutoff = tissue_log.apply(lambda x: x <= tissue_log["prot_copy_count"]).\
    drop(columns="prot_copy_count").\
    transpose()

    below_cutoff.index = pd.MultiIndex.from_tuples(below_cutoff.index).droplevel(1)

    # Select the tissue column by label; positional [0] on a label-indexed
    # Series is deprecated in recent pandas
    num_below = below_cutoff[tissue_type].sum()
    num_total = below_cutoff[tissue_type].size
    print(f"{cancer_type}: {num_below}/{num_total} ({num_below / num_total:.2f}) below cutoff")

    cancer_expr[cancer_type] = below_cutoff

colon: 12/53 (0.23) below cutoff
hnscc: 24/76 (0.32) below cutoff
luad: 20/74 (0.27) below cutoff
lscc: 19/76 (0.25) below cutoff
ovarian: 22/74 (0.30) below cutoff


In [14]:
# Below-cutoff flags in Colon tissue for the proteins listed for colon cancer
cancer_expr["colon"]

Unnamed: 0,Colon
ADAMDEC1,False
AGPAT5,False
ARHGEF10,False
ASAH1,False
ATP6V1B2,False
BIN3,False
CHMP7,False
CLU,False
CLU,True
CNOT7,False


In [15]:
# Below-cutoff flags in Esophagus tissue for the proteins listed for hnscc
cancer_expr["hnscc"]

Unnamed: 0,Esophagus
ADAMDEC1,True
AGPAT5,False
ANGPT2,False
ARHGEF10,True
ASAH1,False
ATP6V1B2,False
BIN3,False
BLK,True
BMP1,True
BNIP3L,True


In [16]:
# Below-cutoff flags in Lung tissue for the proteins listed for lscc
cancer_expr["lscc"]

Unnamed: 0,Lung
ADAMDEC1,False
AGPAT5,False
ANGPT2,True
ARHGEF10,True
ASAH1,False
ATP6V1B2,False
BIN3,False
BMP1,False
BNIP3L,False
CDCA2,True


In [17]:
# Below-cutoff flags in Lung tissue for the proteins listed for luad
cancer_expr["luad"]

Unnamed: 0,Lung
ADAMDEC1,False
AGPAT5,False
ANGPT2,True
ARHGEF10,True
ASAH1,False
ATP6V1B2,False
BIN3,False
BLK,True
BMP1,False
BNIP3L,False


In [18]:
# Below-cutoff flags in Ovary tissue for the proteins listed for ovarian cancer
cancer_expr["ovarian"]

Unnamed: 0,Ovary
ADAMDEC1,True
AGPAT5,False
ANGPT2,True
ARHGEF10,False
ASAH1,False
ATP6V1B2,False
BIN3,False
BLK,True
BMP1,True
BNIP3L,False
