# Find proteins within the 8q gain event (cis) that are commonly equivalent between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
CHROMOSOME = "8"
ARM = "q"
TRANS_OR_CIS = "cis"

equiv_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}_equiv.tsv"

equiv_results = pd.\
read_csv(equiv_results_file, sep="\t").\
rename(columns={"Name": "protein"}).\
set_index("protein")

In [3]:
equiv_results

Unnamed: 0_level_0,brca_Database_ID,lscc_Database_ID,luad_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ANKRD46,NP_001257308.1|NP_001257307.1,NP_001257308.1|NP_001257306.1,NP_001257308.1|NP_001257306.1,NP_001257307,0.049869,,0.002131,0.002797,0.029985,0.032647
ASAP1,NP_060952.2|NP_001234925.1,NP_060952.2|NP_001234925.1|NP_001349854.1|NP_0...,NP_060952.2|NP_001234925.1|NP_001349854.1|NP_0...,NP_001234925,0.027263,0.002539,0.001589,0.005436,0.003312,0.018906
ATAD2,NP_054828.2,NP_054828.2,NP_054828.2|NP_001341036.1|NP_060022.2|NP_0012...,NP_054828,0.999758,0.017157,0.013088,0.003363,0.539404,0.001669
ATP6V1C1,NP_001686.1,NP_001686.1,NP_001686.1,NP_001686,0.664311,0.004435,0.012249,0.003363,0.016822,0.730693
C8orf37,NP_808880.1,NP_808880.1|NP_001350189.1,NP_808880.1|NP_001350189.1,NP_808880,0.018550,,0.003032,0.020490,0.109227,0.004773
CA1,NP_001158302.1|NP_001278896.1,NP_001122301.1|NP_001278896.1,NP_001122301.1|NP_001278896.1,NP_001158302,0.022621,0.003350,0.001380,0.002716,0.003312,0.001904
CA1,NP_001158302.1|NP_001278896.1,NP_001122301.1|NP_001278896.1,NP_001278897.1,NP_001158302,0.022621,0.003350,0.001380,0.002716,0.003312,0.001904
CA1,NP_001158302.1|NP_001278896.1,NP_001278897.1,NP_001122301.1|NP_001278896.1,NP_001158302,0.022621,0.003350,0.001380,0.002705,0.003312,0.001904
CA1,NP_001158302.1|NP_001278896.1,NP_001278897.1,NP_001278897.1,NP_001158302,0.022621,0.003350,0.001380,0.002705,0.003312,0.001904
CA13,NP_940986.1,NP_940986.1,NP_940986.1,NP_940986,0.018550,0.076230,0.004464,0.082083,0.142302,0.022882


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- p_value

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
cancer_types = sorted(equiv_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

long_results = pd.DataFrame()

for cancer_type in cancer_types:
    cancer_df = equiv_results.\
    loc[:, equiv_results.columns.str.startswith(cancer_type)].\
    dropna(axis="index", how="all").\
    reset_index(drop=False)
    
    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)
        
    # Rename the pvalue column to not have the cancer type
    cancer_df = cancer_df.\
    rename(columns={f"{cancer_type}_pvalue": "adj_p"}).\
    assign(cancer_type=cancer_type)
    
    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p"]]
    
    # Append to the overall dataframe
    long_results = long_results.append(cancer_df)

# Drop duplicate rows and reset the index
long_results = long_results[~long_results.duplicated(keep=False)].\
reset_index(drop=True)

In [5]:
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p
0,brca,ANKRD46,NP_001257308.1|NP_001257307.1,0.049869
1,brca,ASAP1,NP_060952.2|NP_001234925.1,0.027263
2,brca,ATAD2,NP_054828.2,0.999758
3,brca,ATP6V1C1,NP_001686.1,0.664311
4,brca,C8orf37,NP_808880.1,0.018550
5,brca,CA13,NP_940986.1,0.018550
6,brca,CA3,NP_005172.1,0.014583
7,brca,CHMP4C,NP_689497.1,0.018550
8,brca,CNGB3,NP_061971.3,0.132031
9,brca,COL14A1,NP_066933.1,0.739474


## Select the equivalent proteins, and take a detour to make some plots

In [6]:
prots = long_results[long_results["adj_p"] <= 0.05].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

fail_prots = long_results[long_results["adj_p"] > 0.05].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

prots_cts.insert(0, "count_type", "Equivalent")
fail_cts.insert(0, "count_type", "Not equivalent")

counts = prots_cts.append(fail_cts).sort_index().reset_index(drop=False)

alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(
            title=None,
            labels=False
        ),
        sort=["Equivalent"]
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(
            title="Number of proteins"
        )
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=["Equivalent"],
        scale=alt.Scale(
            domain=["Equivalent", "Not equivalent"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
).facet(
    column=alt.Column(
        "cancer_type",
        title=None
    )
).properties(
    title="Chr 8p cis equivalence"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was different in

In [7]:
prots_summary = prots.groupby("protein").agg(**{
    "cancers": ("cancer_type", lambda x: x.sort_values().drop_duplicates(keep="first").tolist())
})

prots_summary = prots_summary.\
assign(
    num_cancers=prots_summary["cancers"].apply(len),
    tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x))
).\
sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True]).\
drop(columns="tmp_sort")

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
ASAP1,"[brca, colon, hnscc, lscc, luad, ovarian]",6
CA3,"[brca, colon, hnscc, lscc, luad, ovarian]",6
EMC2,"[brca, colon, hnscc, lscc, luad, ovarian]",6
INTS8,"[brca, colon, hnscc, lscc, luad, ovarian]",6
PAG1,"[brca, colon, hnscc, lscc, luad, ovarian]",6
RPL30,"[brca, colon, hnscc, lscc, luad, ovarian]",6
CPQ,"[brca, colon, hnscc, lscc, luad]",5
DEPTOR,"[brca, colon, hnscc, lscc, ovarian]",5
NCALD,"[brca, colon, hnscc, lscc, ovarian]",5
SNTB1,"[brca, colon, hnscc, lscc, ovarian]",5


## Save results

In [8]:
# prots_summary = prots_summary.assign(
#     cancers=prots_summary["cancers"].apply(lambda x: "_".join(x))
# )

output_file = f"pancancer_summary_equiv_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")

In [9]:
df = pd.read_csv(f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv", sep="\t")
df = df.assign(cancers=df["cancers"].apply(lambda x: x.split("_")))

shared = df[df["protein"].isin(prots_summary.index)]
shared = shared.merge(
    right=prots_summary.reset_index(drop=False),
    on="protein",
    how="left", 
    suffixes=["_diff", "_equiv"],
)

ints = []
cts = []

for prot in shared["protein"]:
    diff_cancers = set(shared.loc[shared["protein"] == prot, "cancers_diff"].iloc[0])
    equiv_cancers = set(shared.loc[shared["protein"] == prot, "cancers_equiv"].iloc[0])
    
    intersection = diff_cancers.intersection(equiv_cancers)
    
    ints.append(intersection)
    cts.append(len(intersection))
    
shared = shared.assign(
    intersection=ints,
    intersection_size=cts
)

shared.sort_values(by="intersection_size", ascending=False)

Unnamed: 0,protein,cancers_diff,num_cancers_diff,cancers_equiv,num_cancers_equiv,intersection,intersection_size
0,FAM91A1,"[brca, colon, hnscc, lscc, luad]",5,[ovarian],1,{},0
46,NDUFB9,[brca],1,"[colon, hnscc, lscc, luad, ovarian]",5,{},0
33,ATP6V1C1,"[brca, ovarian]",2,"[colon, hnscc, lscc, luad]",4,{},0
34,IMPA1,"[brca, ovarian]",2,"[colon, hnscc, lscc]",3,{},0
35,CA13,"[colon, luad]",2,"[brca, hnscc, ovarian]",3,{},0
36,CHMP4C,"[colon, luad]",2,"[brca, hnscc, lscc, ovarian]",4,{},0
37,FABP5,"[hnscc, lscc]",2,"[brca, colon, luad, ovarian]",4,{},0
38,RNF139,"[luad, ovarian]",2,"[brca, hnscc, lscc]",3,{},0
39,CNGB3,[brca],1,"[lscc, luad]",2,{},0
40,COL14A1,[brca],1,"[colon, hnscc, lscc, luad, ovarian]",5,{},0
