# Find proteins that are commonly different between samples with and without a chromosome arm event (the arm and cis/trans mode are set in the Setup cell)

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Which event to analyse; these three values pick the input file below and are
# reused in the chart title and the output file name.
CHROMOSOME = "8"
ARM = "q"
TRANS_OR_CIS = "trans"

ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_ttest.tsv"

# Load the tab-separated t-test table and index it by protein name.
ttest_results = (
    pd.read_csv(ttest_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [3]:
# Preview the wide t-test table (indexed by protein, with per-cancer-type columns).
ttest_results

Unnamed: 0_level_0,brca_Database_ID,lscc_Database_ID,luad_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue,brca_diff,colon_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1BG,NP_570602.2,NP_570602.2,NP_570602.2,NP_570602,0.131677,0.079431,0.976312,0.927643,0.909141,0.992171,-0.533259,-0.428771,-0.008962,0.066905,-0.078772,-0.057824
A2M,NP_000005.2,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334353.1|NP_001334354.1|K4JD...,NP_000005,0.877474,0.428456,0.927672,0.693427,0.980958,0.975926,0.084998,-0.183967,-0.037379,-0.305739,0.024023,0.116565
A2ML1,NP_653271.2|NP_001269353.1,NP_653271.2|NP_001269353.1,,NP_653271,0.071926,,0.231914,0.805748,,0.999630,1.612571,,0.471799,0.689950,,-0.019953
AAAS,NP_056480.1|NP_001166937.1,NP_056480.1|NP_001166937.1,NP_056480.1|NP_001166937.1,NP_056480,0.391388,0.543239,0.891341,0.574088,0.438142,0.987260,0.140291,0.096396,0.020821,0.130807,0.194825,0.023217
AACS,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417,0.212549,0.913075,0.380856,0.336538,0.214353,0.993143,-0.547616,0.022432,0.180175,0.586550,0.800036,-0.045828
AADAT,NP_057312.1|NP_001273611.1,,NP_001273611.1|NP_001273612.1,,0.969351,,,,0.628508,,-0.048311,,-0.041260,,0.676167,
AAED1,NP_714542.1,NP_714542.1,NP_714542.1,,0.761070,,0.811712,0.961436,0.984537,,-0.158518,,0.115157,0.040241,-0.020807,
AAGAB,NP_078942.3|NP_001258815.1,NP_078942.3|NP_001258814.1,NP_078942.3|NP_001258814.1,NP_078942,0.085504,0.317664,0.936622,0.938040,0.800233,0.757591,-0.415548,-0.135984,0.021428,0.044889,0.115646,-0.204098
AAK1,NP_055726.3,NP_055726.3,,NP_055726,0.494125,0.228099,0.580395,0.073746,,0.943728,-0.127223,-0.100817,-0.061994,-0.423898,,-0.102736
AAMDC,NP_078960.1|NP_001303886.1|NP_001303887.1,NP_001303889.1|NP_001350493.1,NP_001303889.1|NP_001350493.1|NP_001303886.1|N...,NP_078960,0.137811,0.378058,0.981265,0.970932,0.629879,0.969393,-0.688753,0.150737,-0.009407,0.032567,-0.352333,-0.122755


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- adj_p (the p-value)
- change

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
# A column's cancer type is the part of its name before the first underscore.
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Build one long-format frame per cancer type and concatenate them once at the
# end. (DataFrame.append was removed in pandas 2.0, and appending inside the
# loop re-copies the accumulated frame on every iteration.)
cancer_frames = []

for cancer_type in cancer_types:
    # Match on "<cancer_type>_" rather than the bare prefix, so that a cancer
    # type whose name is a prefix of another's cannot pick up that one's columns.
    cancer_df = (
        ttest_results
        .loc[:, ttest_results.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue and diff columns to not have the cancer type
    cancer_df = (
        cancer_df
        .rename(columns={
            f"{cancer_type}_pvalue": "adj_p",
            f"{cancer_type}_diff": "change",
        })
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p", "change"]]

    cancer_frames.append(cancer_df)

long_results = pd.concat(cancer_frames)

# Remove rows that are exact duplicates of another row, then reset the index.
# NOTE(review): keep=False removes EVERY copy of a duplicated row, not just the
# extras, so a protein whose row appears twice for a cancer type disappears
# entirely. If the intent was to keep one copy, use long_results.drop_duplicates()
# instead -- confirm before changing, since it alters downstream counts.
long_results = long_results[~long_results.duplicated(keep=False)].reset_index(drop=True)

In [5]:
# Preview the reshaped long-format table (one row per cancer type and protein entry).
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,A1BG,NP_570602.2,0.131677,-0.533259
1,brca,A2M,NP_000005.2,0.877474,0.084998
2,brca,A2ML1,NP_653271.2|NP_001269353.1,0.071926,1.612571
3,brca,AAAS,NP_056480.1|NP_001166937.1,0.391388,0.140291
4,brca,AACS,NP_076417.2|NP_001306769.1|NP_001306768.1,0.212549,-0.547616
5,brca,AADAT,NP_057312.1|NP_001273611.1,0.969351,-0.048311
6,brca,AAED1,NP_714542.1,0.761070,-0.158518
7,brca,AAGAB,NP_078942.3|NP_001258815.1,0.085504,-0.415548
8,brca,AAK1,NP_055726.3,0.494125,-0.127223
9,brca,AAMDC,NP_078960.1|NP_001303886.1|NP_001303887.1,0.137811,-0.688753


## Select the proteins with a significant change, and take a detour to make some plots

In [6]:
# Cutoff on the adj_p column: at or below it, a protein counts as significantly different.
SIGNIFICANCE_CUTOFF = 0.05

# Split the long table into significant / not significant rows and count the
# proteins in each group per cancer type. Rows whose adj_p is NaN satisfy
# neither comparison and so are left out of both groups.
prots = long_results[long_results["adj_p"] <= SIGNIFICANCE_CUTOFF].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

fail_prots = long_results[long_results["adj_p"] > SIGNIFICANCE_CUTOFF].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

prots_cts.insert(0, "count_type", "Significant difference")
fail_cts.insert(0, "count_type", "No significant difference")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
counts = pd.concat([prots_cts, fail_cts]).sort_index().reset_index(drop=False)

# One facet per cancer type, with a bar for each of the two groups.
alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(
            title=None,
            labels=False
        ),
        sort=["Significant difference"]
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(
            title="Number of proteins"
        )
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=["Significant difference"],
        scale=alt.Scale(
            domain=["Significant difference", "No significant difference"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
).facet(
    column=alt.Column(
        "cancer_type",
        title=None
    )
).properties(
    title=f"Chr {CHROMOSOME}{ARM} {TRANS_OR_CIS} effects"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was different in

In [7]:
def make_simple_change(change_val):
    """Reduce a change value to its direction.

    Returns 1 for a positive value, -1 for a negative value and 0 for zero.
    A value that compares as none of these (e.g. NaN) yields None.
    """
    if change_val > 0:
        return 1
    if change_val < 0:
        return -1
    if change_val == 0:
        return 0
    # Not greater than, less than, or equal to zero (e.g. NaN).
    return None

# Add a column holding only the direction of each protein's change.
prots = prots.assign(
    simplified_change=lambda frame: frame["change"].apply(make_simple_change)
)

In [8]:
# Inspect the significant rows together with their simplified_change column.
prots

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change,simplified_change
0,brca,AASS,NP_005754.2,1.850564e-04,-1.191925,-1
1,brca,AATF,NP_036270.1,4.430556e-02,0.461839,1
2,brca,ABAT,NP_065737.2,3.676510e-02,-1.442132,-1
3,brca,ABCA13,NP_689914.3,1.464544e-02,-0.876727,-1
4,brca,ABHD3,NP_612213.2|NP_001295185.1|NP_001295186.1,4.240049e-02,-1.092605,-1
5,brca,ABT1,NP_037507.1,1.635870e-02,0.604259,1
6,brca,ACO1,NP_001265281.1,4.264670e-03,-0.805504,-1
7,brca,ACTR1A,NP_005727.1,2.789881e-02,-0.314172,-1
8,brca,ACTR1B,NP_005726.1,1.378409e-02,-0.537328,-1
9,brca,ACY1,NP_001185824.1|NP_001303260.1|NP_001185827.1|N...,2.314318e-02,-0.790663,-1


In [9]:
# For every protein, list the cancer types it was significant in (sorted,
# without repeats) and average the direction of its change across those rows.
# The aggregation is given as the string "mean" rather than np.mean: pandas
# >= 2.1 warns when a NumPy reducer is passed to groupby.agg, because a future
# version will call the function directly instead of mapping it to the built-in
# groupby mean.
prots_summary = prots.groupby("protein").agg(
    cancers=("cancer_type", lambda cancer_col: sorted(cancer_col.unique())),
    mean_simp_change=("simplified_change", "mean"),
)

# Order by number of cancer types (most first), then by the concatenated cancer
# names so that proteins sharing the same set of cancers sit together. The
# helper sort column is dropped again afterwards.
prots_summary = (
    prots_summary
    .assign(
        num_cancers=prots_summary["cancers"].apply(len),
        tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x)),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARFGEF1,"[brca, colon, hnscc, lscc, luad]",1,5
HGH1,"[brca, colon, hnscc, lscc, luad]",1,5
STAU2,"[brca, colon, hnscc, lscc, luad]",1,5
YTHDF3,"[brca, colon, hnscc, lscc, luad]",1,5
GGH,"[brca, colon, hnscc, luad]",1,4
PRKDC,"[brca, colon, lscc, luad]",1,4
MTFR1,"[brca, hnscc, lscc, luad]",1,4
ANKFY1,"[brca, colon, lscc]",-1,3
BOP1,"[brca, colon, luad]",1,3
C8orf82,"[brca, colon, luad]",1,3


In [10]:
# Tally how many proteins were significant in each number of cancer types, highest number first.
prots_summary["num_cancers"].value_counts().sort_index(ascending=False)

5       4
4       3
3      18
2     122
1    1508
Name: num_cancers, dtype: int64

## Save results

In [11]:
# Flatten each list of cancer types into a single "_"-separated string for the
# TSV. Values that are already strings are left alone, so re-running this cell
# does not join the individual characters of an already-flattened value.
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(
        lambda cancers: cancers if isinstance(cancers, str) else "_".join(cancers)
    )
)

output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")