# Find proteins within the 8p loss event (cis) that are commonly equivalent between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Which chromosome-arm event, and which proteins relative to it (cis/trans), this notebook analyses
CHROMOSOME = "8"
ARM = "p"
TRANS_OR_CIS = "cis"

# Input table of equivalence-test results; its name is derived from the settings above
equiv_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}_equiv.tsv"

# Read the tab-separated results, call the "Name" column "protein", and index the table by it
equiv_results = (
    pd.read_csv(equiv_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [3]:
# Preview the wide input table (indexed by protein, with per-cancer-type columns)
equiv_results

Unnamed: 0_level_0,luad_Database_ID,brca_Database_ID,lscc_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ADAMDEC1,NP_055294.1|NP_001138743.1,NP_055294.1|NP_001138744.1,NP_055294.1|NP_001138743.1,NP_001138744,0.001354,0.006969,0.007044,0.001102,0.071167,0.011597
AGPAT5,NP_060831.2,NP_060831.2,NP_060831.2,NP_060831,0.366590,0.448896,0.952008,0.665325,0.892635,0.267639
ANGPT2,NP_001112359.1|NP_001138.1|NP_001112360.1,NP_001138.1|NP_001112360.1|NP_001112359.1,NP_001112359.1|NP_001138.1,NP_001112359,0.002926,,0.006749,0.001102,0.013633,0.031824
ARHGEF10,NP_001295082.1|NP_055444.2|NP_001295081.1,NP_001295082.1|NP_055444.2|NP_001295081.1,NP_001295082.1,NP_001295081,0.452544,0.005955,0.203088,0.006497,0.398274,0.032587
ASAH1,NP_808592.2|NP_004306.3|NP_001350672.1|NP_0011...,NP_004306.3|NP_808592.2,NP_004306.3|NP_808592.2|NP_001350672.1|NP_0011...,NP_004306,0.064974,0.245433,0.134137,0.001102,0.049744,0.062068
ATP6V1B2,NP_001684.2,NP_001684.2,NP_001684.2,NP_001684,0.762998,0.999964,0.328423,0.097147,0.999909,0.158119
BIN3,NP_061158.1|NP_001349975.1,NP_061158.1,NP_061158.1|NP_001349975.1,NP_061158,0.082042,0.076491,0.006749,0.005178,0.015125,0.010347
BLK,NP_001706.2|NP_001317394.1,NP_001706.2|NP_001317394.1,,NP_001317394,0.011716,,0.039119,,0.189800,0.003461
BMP1,NP_006120.1|NP_001190.1,NP_006120.1|NP_001190.1,NP_006120.1|NP_001190.1,NP_006120,0.030219,,0.134137,0.009623,0.049418,0.027416
BNIP3L,NP_001317420.1|NP_004322.1,NP_004322.1|NP_001317420.1,NP_001317420.1|NP_004322.1,NP_001317420,0.152779,,0.007044,0.001102,0.049193,0.062267


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- adj_p

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
# The cancer type is whatever precedes the first underscore in each column name
# (e.g. "brca_pvalue" -> "brca").
cancer_types = sorted(equiv_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Build one long-format frame per cancer type and concatenate them once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is quadratic.)
cancer_frames = []

for cancer_type in cancer_types:
    # Match on "<cancer_type>_" so that one cancer type's prefix can never pick up
    # columns belonging to another type whose name merely starts with the same letters.
    is_cancer_col = equiv_results.columns.str.startswith(f"{cancer_type}_")

    # Keep only proteins that have at least one value for this cancer type
    cancer_df = (
        equiv_results
        .loc[:, is_cancer_col]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue column to not have the cancer type, and record the cancer type
    # in its own column instead
    cancer_df = (
        cancer_df
        .rename(columns={f"{cancer_type}_pvalue": "adj_p"})
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p"]]

    cancer_frames.append(cancer_df)

if cancer_frames:
    long_results = pd.concat(cancer_frames)
else:
    # No cancer-type columns at all: keep the expected schema so later cells still run
    long_results = pd.DataFrame(columns=["cancer_type", "protein", "Database_ID", "adj_p"])

# Remove rows that have an exact duplicate and reset the index.
# NOTE(review): keep=False discards *every* copy of a duplicated row, not just the extras;
# confirm that dropping such proteins entirely is intended.
long_results = long_results[~long_results.duplicated(keep=False)].reset_index(drop=True)

In [5]:
# Preview the reshaped long-format table
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p
0,brca,ADAMDEC1,NP_055294.1|NP_001138744.1,0.001354
1,brca,AGPAT5,NP_060831.2,0.366590
2,brca,ANGPT2,NP_001138.1|NP_001112360.1|NP_001112359.1,0.002926
3,brca,ARHGEF10,NP_001295082.1|NP_055444.2|NP_001295081.1,0.452544
4,brca,ASAH1,NP_004306.3|NP_808592.2,0.064974
5,brca,ATP6V1B2,NP_001684.2,0.762998
6,brca,BIN3,NP_061158.1,0.082042
7,brca,BLK,NP_001706.2|NP_001317394.1,0.011716
8,brca,BMP1,NP_006120.1|NP_001190.1,0.030219
9,brca,BNIP3L,NP_004322.1|NP_001317420.1,0.152779


## Select the equivalent proteins, and take a detour to make some plots

In [6]:
# Adjusted p-value cutoff: at or below it a protein is counted as equivalent
EQUIV_ALPHA = 0.05

# Proteins that pass the cutoff, and how many there are per cancer type.
# Rows with a missing adj_p satisfy neither comparison and so land in neither group.
prots = long_results[long_results["adj_p"] <= EQUIV_ALPHA].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

# Proteins that do not pass the cutoff, counted the same way
fail_prots = long_results[long_results["adj_p"] > EQUIV_ALPHA].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

# Label each set of counts so the two can be stacked into a single table for plotting
prots_cts.insert(0, "count_type", "Equivalent")
fail_cts.insert(0, "count_type", "Not equivalent")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
counts = pd.concat([prots_cts, fail_cts]).sort_index().reset_index(drop=False)

# One facet per cancer type, with a bar for each of the two count types
alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(
            title=None,
            labels=False
        ),
        sort=["Equivalent"]
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(
            title="Number of proteins"
        )
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=["Equivalent"],
        scale=alt.Scale(
            domain=["Equivalent", "Not equivalent"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
).facet(
    column=alt.Column(
        "cancer_type",
        title=None
    )
).properties(
    title="Chr 8p cis equivalence"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was equivalent in

In [7]:
# For each protein that passed the cutoff, list the distinct cancer types it passed in,
# in alphabetical order
cancers_by_protein = prots.groupby("protein").agg(
    cancers=("cancer_type", lambda col: sorted(col.unique())),
)

# Add the number of cancer types, plus a throwaway key (the type names glued together)
# that gives proteins with the same count a deterministic order
ranking_keys = cancers_by_protein.assign(
    num_cancers=cancers_by_protein["cancers"].map(len),
    tmp_sort=cancers_by_protein["cancers"].map("".join),
)

# Most cancer types first; ties ordered by the glued-together names; then drop the helper key
prots_summary = (
    ranking_keys
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
LOXL2,"[brca, colon, hnscc, lscc, luad, ovarian]",6
MSR1,"[brca, colon, hnscc, lscc, luad, ovarian]",6
POLR3D,"[brca, colon, hnscc, lscc, luad, ovarian]",6
CLU,"[brca, colon, hnscc, lscc, luad]",5
ADAMDEC1,"[brca, colon, hnscc, lscc, ovarian]",5
RBPMS,"[brca, colon, hnscc, lscc, ovarian]",5
DOK2,"[brca, colon, lscc, luad, ovarian]",5
PDLIM2,"[brca, colon, lscc, luad, ovarian]",5
ANGPT2,"[brca, hnscc, lscc, luad, ovarian]",5
LZTS1,"[brca, hnscc, lscc, luad, ovarian]",5


## Save results

In [8]:
# NOTE(review): the underscore-join below is disabled, so the "cancers" column is written
# as it is held in prots_summary (lists) — presumably serialized as their string form; confirm
# what downstream readers of this file expect. Enabling the block as written would also
# replace the lists in prots_summary["cancers"] with strings for any later cell.
# prots_summary = prots_summary.assign(
#     cancers=prots_summary["cancers"].apply(lambda x: "_".join(x))
# )

# Write the per-protein summary to a tab-separated file named after the event settings
output_file = f"pancancer_summary_equiv_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")

In [9]:
# Load the companion summary for the same event (the file without "equiv" in its name).
# Its "cancers" column holds underscore-joined cancer types, so split it back into lists.
df = pd.read_csv(f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv", sep="\t")
df = df.assign(cancers=df["cancers"].apply(lambda x: x.split("_")))

# Keep only proteins that also appear in the equivalence summary, and put both summaries
# side by side. Overlapping columns from the loaded file get "_diff"; those from
# prots_summary get "_equiv".
shared = df[df["protein"].isin(prots_summary.index)]
shared = shared.merge(
    right=prots_summary.reset_index(drop=False),
    on="protein",
    how="left",
    suffixes=("_diff", "_equiv"),
)

# For every row, the cancer types present in both lists. Working row by row avoids
# re-filtering the whole table per protein, and uses each row's own values even if a
# protein name were to appear more than once.
ints = [
    set(diff_cancers).intersection(equiv_cancers)
    for diff_cancers, equiv_cancers in zip(shared["cancers_diff"], shared["cancers_equiv"])
]
cts = [len(intersection) for intersection in ints]

shared = shared.assign(
    intersection=ints,
    intersection_size=cts
)

# Show proteins with the largest overlap first
shared.sort_values(by="intersection_size", ascending=False)

Unnamed: 0,protein,cancers_diff,mean_simp_change,num_cancers_diff,cancers_equiv,num_cancers_equiv,intersection,intersection_size
32,PCM1,[lscc],-1,1,[lscc],1,{lscc},1
0,FAM160B2,"[brca, colon, hnscc, lscc, luad]",-1,5,[ovarian],1,{},0
29,DOK2,[hnscc],-1,1,"[brca, colon, lscc, luad, ovarian]",5,{},0
22,MTMR7,"[brca, luad]",-1,2,[lscc],1,{},0
23,DCTN6,"[colon, luad]",-1,2,"[brca, lscc, ovarian]",3,{},0
24,TDRP,"[hnscc, lscc]",-1,2,"[brca, luad, ovarian]",3,{},0
25,KBTBD11,"[hnscc, luad]",-1,2,"[brca, colon, lscc, ovarian]",4,{},0
26,BIN3,[brca],-1,1,"[hnscc, lscc, luad, ovarian]",4,{},0
27,BNIP3L,[brca],-1,1,"[hnscc, lscc, luad]",3,{},0
28,SLC7A2,[brca],-1,1,"[hnscc, ovarian]",2,{},0
