# Compare scaled to binary

This notebook compares the two methods of defining which samples have the event. The binary method counts the number of genes within the region that are amplified or deleted, and says that the sample has the event if 80% or more of the genes have variation in the direction we're looking for. The scaled method does the same thing, but weights the genes by their length, so that it looks not at the proportion of genes that have the variation we're looking for, but rather at the proportion of the coding length that has the desired variation.

In [12]:
import cnvutils
import pandas as pd
import os
import altair as alt

In [2]:
# Cancer types compared in this notebook; each name is the prefix of a
# "<name>_has_event.tsv" table.
cancer_types = ["brca", "colon", "hnscc", "lscc", "luad", "ovarian"]

In [3]:
def read_event_tables(dir_path, types=None):
    """Load the per-cancer-type event tables stored in one directory.

    Parameters
    ----------
    dir_path : str
        Directory containing one "<cancer_type>_has_event.tsv" file per
        cancer type.
    types : iterable of str, optional
        Cancer type names to load. When omitted, the notebook-level
        `cancer_types` list is used, which is the original single-argument
        behavior.

    Returns
    -------
    dict
        Maps each cancer type name to the DataFrame read from its file
        (tab-separated, first column used as the index).
    """
    if types is None:
        # Resolved at call time so edits to the notebook-level list are seen.
        types = cancer_types

    return {
        cancer_type: pd.read_csv(
            os.path.join(dir_path, f"{cancer_type}_has_event.tsv"),
            sep="\t",
            index_col=0,
        )
        for cancer_type in types
    }

In [4]:
# Binary-method tables are read from the sibling "chromosome_8" directory and
# scaled-method tables from the current working directory.
bin_event_tables = read_event_tables(os.path.join(os.pardir, "chromosome_8"))
scl_event_tables = read_event_tables(os.curdir)

## Summarize counts
First we'll compare how many samples were counted as having or not having the event with each method.

In [5]:
# Column sums of every table, kept as (binary sums, scaled sums) pairs in
# `cancer_types` order. Summing the event columns is used as the number of
# samples with the event (assumes those columns are boolean/0-1 -- TODO confirm).
event_totals = [
    (bin_event_tables[name].sum(), scl_event_tables[name].sum())
    for name in cancer_types
]

bin_gain_ct = [bin_total["gain_event"] for bin_total, _scl_total in event_totals]
scl_gain_ct = [scl_total["gain_event"] for _bin_total, scl_total in event_totals]
bin_loss_ct = [bin_total["loss_event"] for bin_total, _scl_total in event_totals]
scl_loss_ct = [scl_total["loss_event"] for _bin_total, scl_total in event_totals]

# One row per cancer type, one column per (method, event) combination.
events_summary = pd.DataFrame({
    "cancer_type": cancer_types,
    "bin_gain_ct": bin_gain_ct,
    "scl_gain_ct": scl_gain_ct,
    "bin_loss_ct": bin_loss_ct,
    "scl_loss_ct": scl_loss_ct,
})

In [9]:
# Wide -> long: one row per (cancer type, original count column).
summ = events_summary.melt(id_vars="cancer_type", value_name="count")

# Column names look like "bin_gain_ct"; piece 0 is the method, piece 1 the event.
label_split = summ["variable"].str.split("_", expand=True)

summ = (
    summ
    .assign(bin_or_scl=label_split[0], gain_or_loss=label_split[1])
    .drop(columns="variable")
    .set_index(["cancer_type", "gain_or_loss", "bin_or_scl"])
    .sort_index()
)

summ

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
cancer_type,gain_or_loss,bin_or_scl,Unnamed: 3_level_1
brca,gain,bin,56
brca,gain,scl,60
brca,loss,bin,56
brca,loss,scl,56
colon,gain,bin,47
colon,gain,scl,47
colon,loss,bin,39
colon,loss,scl,38
hnscc,gain,bin,35
hnscc,gain,scl,39


In [20]:
# Move the index levels back into ordinary columns, then split the rows by event type.
chart_sum = summ.reset_index()
gain_df = chart_sum.loc[chart_sum["gain_or_loss"].eq("gain")]
loss_df = chart_sum.loc[chart_sum["gain_or_loss"].eq("loss")]

In [21]:
# Summary rows for gain events only.
gain_df

Unnamed: 0,cancer_type,gain_or_loss,bin_or_scl,count
0,brca,gain,bin,56
1,brca,gain,scl,60
4,colon,gain,bin,47
5,colon,gain,scl,47
8,hnscc,gain,bin,35
9,hnscc,gain,scl,39
12,lscc,gain,bin,34
13,lscc,gain,scl,36
16,luad,gain,bin,28
17,luad,gain,scl,27


In [22]:
# Summary rows for loss events only.
loss_df

Unnamed: 0,cancer_type,gain_or_loss,bin_or_scl,count
2,brca,loss,bin,56
3,brca,loss,scl,56
6,colon,loss,bin,39
7,colon,loss,scl,38
10,hnscc,loss,bin,29
11,hnscc,loss,scl,28
14,lscc,loss,bin,50
15,lscc,loss,scl,50
18,luad,loss,bin,34
19,luad,loss,scl,32


In [24]:
def plot_gain_or_loss(gain_or_loss_table):
    """Build a bar chart of the counts in a gain-only or loss-only summary table.

    Parameters
    ----------
    gain_or_loss_table
        Table passed straight to `alt.Chart`; the encoding refers to its
        "bin_or_scl", "count" and "cancer_type" fields.

    Returns
    -------
    The encoded Altair chart: "bin_or_scl" on x and as the bar color, "count"
    on y, and one column facet per "cancer_type".
    """
    bars = alt.Chart(gain_or_loss_table).mark_bar()
    channels = {
        "x": "bin_or_scl",
        "y": "count",
        "color": "bin_or_scl",
        "column": "cancer_type",
    }
    return bars.encode(**channels)

In [25]:
# Gain events: counts per method, one facet per cancer type.
plot_gain_or_loss(gain_df)

In [26]:
# Loss events: counts per method, one facet per cancer type.
plot_gain_or_loss(loss_df)

## Which samples switched?

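Now we'll examine how many samples were classified differently by the two methods, i.e. their gain or loss event call under the binary method does not match their call under the scaled method.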

In [35]:
def find_switched(cancer_type):
    """Count samples whose event calls differ between the binary and scaled tables.

    Parameters
    ----------
    cancer_type : str
        Key into `bin_event_tables` and `scl_event_tables`.

    Returns
    -------
    tuple
        (number of rows whose "gain_event" values differ,
         number of rows whose "loss_event" values differ).

    Notes
    -----
    The two tables are combined with `DataFrame.join`, whose default is a left
    join on the index, so the rows compared are those of the binary table. A
    row with no match in the scaled table gets missing values there and is
    therefore counted as different.
    """
    merged = bin_event_tables[cancer_type].join(
        scl_event_tables[cancer_type],
        lsuffix="_bin",
        rsuffix="_scl",
    )

    gain_differs = merged["gain_event_bin"].ne(merged["gain_event_scl"])
    loss_differs = merged["loss_event_bin"].ne(merged["loss_event_scl"])

    return gain_differs.sum(), loss_differs.sum()

# One (gain, loss) pair of differing-call counts per cancer type, in `cancer_types` order.
switch_counts = [find_switched(name) for name in cancer_types]

gain_changed = [n_gain for n_gain, _n_loss in switch_counts]
loss_changed = [n_loss for _n_gain, n_loss in switch_counts]

changed_sum = pd.DataFrame({
    "cancer_type": cancer_types,
    "gain_changed": gain_changed,
    "loss_changed": loss_changed,
})

changed_sum

Unnamed: 0,cancer_type,gain_changed,loss_changed
0,brca,4,2
1,colon,0,3
2,hnscc,4,1
3,lscc,2,4
4,luad,1,2
5,ovarian,0,6
