# Find proteins within the 8p loss event (cis) that are commonly equivalent between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Analysis parameters: the chromosome arm event and the effect type to load
CHROMOSOME = "8"
ARM = "p"
TRANS_OR_CIS = "cis"

# Name of the tab-separated equivalence results table for this event
equiv_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}_equiv.tsv"

# Read the table, call the "Name" column "protein", and index the rows by it
equiv_results = (
    pd.read_csv(equiv_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [3]:
# Display the wide-format results table (one row per protein)
equiv_results

Unnamed: 0_level_0,lscc_Database_ID,luad_Database_ID,brca_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ADAMDEC1,NP_055294.1|NP_001138743.1,NP_055294.1|NP_001138743.1,NP_055294.1|NP_001138744.1,NP_001138744,0.012576,0.027877,0.028263,0.006516,0.176926,0.056115
AGPAT5,NP_060831.2,NP_060831.2,NP_060831.2,NP_060831,0.708783,0.727818,0.999902,0.363398,0.999991,0.547255
ANGPT2,NP_001112359.1|NP_001138.1,NP_001112359.1|NP_001138.1|NP_001112360.1,NP_001138.1|NP_001112360.1|NP_001112359.1,NP_001112359,0.020605,,0.025181,0.013959,0.059177,0.100698
ARHGEF10,NP_001295082.1,NP_001295082.1|NP_055444.2|NP_001295081.1,NP_001295082.1|NP_055444.2|NP_001295081.1,NP_001295081,0.787799,0.032912,0.352783,0.048191,0.629034,0.137225
ASAH1,NP_004306.3|NP_808592.2|NP_001350672.1|NP_0011...,NP_808592.2|NP_004306.3|NP_001350672.1|NP_0011...,NP_004306.3|NP_808592.2,NP_004306,0.254359,0.570605,0.245731,0.006516,0.156887,0.216140
ATP6V1B2,NP_001684.2,NP_001684.2,NP_001684.2,NP_001684,0.978609,0.999997,0.492331,0.938964,0.999991,0.390438
BIN3,NP_061158.1|NP_001349975.1,NP_061158.1|NP_001349975.1,NP_061158.1,NP_061158,0.294580,0.227093,0.025181,0.006516,0.061991,0.056716
BLK,,NP_001706.2|NP_001317394.1,NP_001706.2|NP_001317394.1,NP_001317394,0.070929,,0.144195,,0.396812,0.022442
BMP1,NP_006120.1|NP_001190.1,NP_006120.1|NP_001190.1,NP_006120.1|NP_001190.1,NP_006120,0.140264,,0.223697,0.117558,0.150507,0.117063
BNIP3L,NP_001317420.1|NP_004322.1,NP_001317420.1|NP_004322.1,NP_004322.1|NP_001317420.1,NP_001317420,0.425157,,0.025915,0.026081,0.138173,0.215351


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- adj_p (the p value, taken from each cancer type's `*_pvalue` column)

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
# Cancer types are the column-name prefixes (the text before the first "_")
cancer_types = sorted(equiv_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Collect one long-format table per cancer type and concatenate them once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame inside a loop copies it on every iteration.
cancer_dfs = []

for cancer_type in cancer_types:
    # Select this cancer type's columns. The trailing "_" keeps one cancer type from
    # also matching the columns of another whose name merely starts with the same text.
    cancer_df = (
        equiv_results
        .loc[:, equiv_results.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue column to not have the cancer type, and tag every row
    # with the cancer type it came from
    cancer_df = (
        cancer_df
        .rename(columns={f"{cancer_type}_pvalue": "adj_p"})
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p"]]

    cancer_dfs.append(cancer_df)

# Stack the per-cancer tables; fall back to an empty frame with the expected
# columns so that downstream cells still find them when there is no input
if cancer_dfs:
    long_results = pd.concat(cancer_dfs)
else:
    long_results = pd.DataFrame(columns=["cancer_type", "protein", "Database_ID", "adj_p"])

# Drop duplicate rows and reset the index.
# NOTE(review): keep=False discards every copy of a fully duplicated row, not just
# the extra copies -- confirm that is intended.
long_results = long_results[~long_results.duplicated(keep=False)].reset_index(drop=True)

In [5]:
# Display the reshaped long-format table (one row per cancer type and protein)
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p
0,brca,ADAMDEC1,NP_055294.1|NP_001138744.1,0.012576
1,brca,AGPAT5,NP_060831.2,0.708783
2,brca,ANGPT2,NP_001138.1|NP_001112360.1|NP_001112359.1,0.020605
3,brca,ARHGEF10,NP_001295082.1|NP_055444.2|NP_001295081.1,0.787799
4,brca,ASAH1,NP_004306.3|NP_808592.2,0.254359
5,brca,ATP6V1B2,NP_001684.2,0.978609
6,brca,BIN3,NP_061158.1,0.294580
7,brca,BLK,NP_001706.2|NP_001317394.1,0.070929
8,brca,BMP1,NP_006120.1|NP_001190.1,0.140264
9,brca,BNIP3L,NP_004322.1|NP_001317420.1,0.425157


## Select the equivalent proteins, and take a detour to make some plots

In [6]:
# Cutoff applied to adj_p: at or below it a protein counts as equivalent.
# Rows with a missing adj_p fall into neither group.
EQUIV_ALPHA = 0.05

# Proteins that passed the cutoff, and how many there are per cancer type
prots = long_results[long_results["adj_p"] <= EQUIV_ALPHA].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

# Proteins that did not pass, and how many there are per cancer type
fail_prots = long_results[long_results["adj_p"] > EQUIV_ALPHA].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

# Label each set of counts so they can be told apart once stacked
prots_cts.insert(0, "count_type", "Equivalent")
fail_cts.insert(0, "count_type", "Not equivalent")

# Stack the two count tables. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
counts = pd.concat([prots_cts, fail_cts]).sort_index().reset_index(drop=False)

# Bar chart of the two counts, with one facet column per cancer type
alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(
            title=None,
            labels=False
        ),
        sort=["Equivalent"]
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(
            title="Number of proteins"
        )
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=["Equivalent"],
        scale=alt.Scale(
            domain=["Equivalent", "Not equivalent"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
).facet(
    column=alt.Column(
        "cancer_type",
        title=None
    )
).properties(
    title="Chr 8p cis equivalence"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was equivalent in

In [7]:
# For each equivalent protein, collect the sorted, de-duplicated list of cancer
# types in which it passed
prots_summary = prots.groupby("protein").agg(
    cancers=(
        "cancer_type",
        lambda types: types.sort_values().drop_duplicates(keep="first").tolist(),
    )
)

# Order by number of cancers (most first); ties are broken alphabetically on the
# concatenated cancer names, using a temporary sort key that is dropped afterwards
prots_summary = (
    prots_summary
    .assign(
        num_cancers=prots_summary["cancers"].apply(len),
        tmp_sort=prots_summary["cancers"].apply(lambda names: "".join(names)),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
MSR1,"[brca, colon, hnscc, lscc, luad, ovarian]",6
ADAMDEC1,"[brca, colon, hnscc, lscc]",4
CLU,"[brca, colon, hnscc, lscc]",4
DEFA4,"[brca, colon, hnscc, lscc]",4
POLR3D,"[brca, colon, lscc, luad]",4
KCTD9,"[colon, hnscc, lscc, ovarian]",4
DOK2,"[brca, colon, ovarian]",3
ANGPT2,"[brca, hnscc, lscc]",3
LPL,"[brca, hnscc, lscc]",3
MFHAS1,"[brca, hnscc, lscc]",3


## Check that there isn't overlap between proteins found as different and proteins found as equivalent

If there is overlap, we probably were too lenient in our bounds for the TOST test.

In [8]:
# Load the summary table that this cell compares against (the proteins found as
# different) and split its "cancers" strings on "_" into lists
df = pd.read_csv(f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv", sep="\t")
df = df.assign(cancers=df["cancers"].apply(lambda x: x.split("_")))

# Keep only the proteins that also appear in the equivalence summary, and put the
# two summaries side by side (columns suffixed "_diff" and "_equiv")
shared = df[df["protein"].isin(prots_summary.index)]
shared = shared.merge(
    right=prots_summary.reset_index(drop=False),
    on="protein",
    how="left",
    suffixes=("_diff", "_equiv"),
)

# For each row, find the cancer types present in both lists. Pairing the two
# columns row by row avoids re-filtering the whole table for every protein, and
# uses each row's own values even if a protein name appears more than once.
ints = [
    set(diff_cancers).intersection(equiv_cancers)
    for diff_cancers, equiv_cancers in zip(shared["cancers_diff"], shared["cancers_equiv"])
]
cts = [len(intersection) for intersection in ints]

shared = shared.assign(
    intersection=ints,
    intersection_size=cts
)

# Show the rows with the largest overlap first
shared.sort_values(by="intersection_size", ascending=False)

Unnamed: 0,protein,cancers_diff,mean_simp_change,num_cancers_diff,cancers_equiv,num_cancers_equiv,intersection,intersection_size
0,MTMR9,"[brca, hnscc, lscc, luad, ovarian]",-1,5,[colon],1,{},0
1,PINX1,"[brca, colon, hnscc, ovarian]",-1,4,[lscc],1,{},0
26,PNMA2,[luad],-1,1,"[hnscc, lscc]",2,{},0
25,LPL,[luad],-1,1,"[brca, hnscc, lscc]",3,{},0
24,DEFA4,[luad],-1,1,"[brca, colon, hnscc, lscc]",4,{},0
23,BLK,[luad],-1,1,[ovarian],1,{},0
22,TDRP,[hnscc],-1,1,[brca],1,{},0
21,SORBS3,[hnscc],-1,1,[colon],1,{},0
20,DOK2,[hnscc],-1,1,"[brca, colon, ovarian]",3,{},0
19,TNKS,[brca],-1,1,"[lscc, ovarian]",2,{},0


## Out of curiosity, look at the t-test p values for difference, for the equivalent proteins

In [9]:
# Load the t-test results table and index it by protein name
ttres = (
    pd.read_csv(f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_ttest.tsv", sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

# Cancer types are the column-name prefixes (the text before the first "_")
cancer_types = sorted(ttres.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Collect one long-format table per cancer type and concatenate them once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame inside a loop copies it on every iteration.
cancer_dfs = []

for cancer_type in cancer_types:
    # Select this cancer type's columns. The trailing "_" keeps one cancer type from
    # also matching the columns of another whose name merely starts with the same text.
    cancer_df = (
        ttres
        .loc[:, ttres.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue column to not have the cancer type, and tag every row
    # with the cancer type it came from
    cancer_df = (
        cancer_df
        .rename(columns={f"{cancer_type}_pvalue": "adj_p"})
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p"]]

    cancer_dfs.append(cancer_df)

# Stack the per-cancer tables; fall back to an empty frame with the expected
# columns when there is no input
if cancer_dfs:
    ttres_long = pd.concat(cancer_dfs)
else:
    ttres_long = pd.DataFrame(columns=["cancer_type", "protein", "Database_ID", "adj_p"])

# Drop duplicate rows and reset the index.
# NOTE(review): keep=False discards every copy of a fully duplicated row, not just
# the extra copies -- confirm that is intended.
ttres_long = ttres_long[~ttres_long.duplicated(keep=False)].reset_index(drop=True)

# Get the difference p values for all proteins that passed: an inner merge on
# (cancer_type, protein) keeps the order of the rows in prots and returns every
# matching t-test row for each of them, in a single pass.
diffs = prots[["cancer_type", "protein"]].merge(
    ttres_long,
    on=["cancer_type", "protein"],
    how="inner",
)

# Plot the distribution of the difference p values for the equivalent proteins
alt.Chart(diffs).mark_bar().encode(
    x=alt.X(
        "adj_p",
        bin=alt.Bin(step=0.05),
        scale=alt.Scale(
            domain=[0, 1]
        )
    ),
    y=alt.Y(
        "count()"
    )
).properties(
    title="Distribution of difference p values for equivalent proteins"
)

In [10]:
# Histogram of the difference p values for every protein, in bins of width 0.05
# over the range 0 to 1
all_p_axis = alt.X(
    "adj_p",
    bin=alt.Bin(step=0.05),
    scale=alt.Scale(domain=[0, 1]),
)
all_count_axis = alt.Y("count()")

(
    alt.Chart(ttres_long)
    .mark_bar()
    .encode(x=all_p_axis, y=all_count_axis)
    .properties(title="Distribution of difference p values for all proteins")
)

## Save results

In [11]:
# Flatten each list of cancer types into an underscore-joined string for the TSV.
# Values that are already strings are passed through unchanged, so re-running this
# cell does not join the individual characters of an already flattened value.
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(lambda x: x if isinstance(x, str) else "_".join(x))
)

# Write the summary (indexed by protein) as a tab-separated file
output_file = f"pancancer_summary_equiv_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")