# SCNA analysis step 3: Plot summaries

- From the summary table, create one big histogram
    - X axis is proportion of arm that's covered by an event (take absolute value of event magnitudes so amplifications and deletions are treated the same)
    - Y axis is the number of chromosome arms whose covered proportion falls in that bin
    - Sanity check: Do separate plots for amplified/deleted
- From this we can decide what proportion of an arm must be covered by an event for us to call it a whole-arm amplification or deletion
- If a patient has both arms of a chromosome amplified, or both arms deleted, we can call it a whole-chromosome event

## Setup

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
from toolz.curried import pipe

In [2]:
# Altair options
# Lift Altair's default row limit on chart datasets so the full tables below
# can be plotted without raising MaxRowsError.
alt.data_transformers.disable_max_rows()

# Disabled alternative (kept for reference): a custom "json_dir" data
# transformer that would write chart data to JSON files under `data_dir`
# (here "plot_data") instead of using the default transformer.
# def json_dir(data, data_dir):
#     os.makedirs(data_dir, exist_ok=True)
#     return pipe(data, alt.to_json(filename=os.path.join(data_dir, "{prefix}-{hash}.{extension}")) )

# alt.data_transformers.register("json_dir", json_dir)
# alt.data_transformers.enable("json_dir", data_dir="plot_data")

DataTransformerRegistry.enable('default')

## Load data

In [3]:
# Cancer types to load; each name is the prefix of a
# "<cancer_type>_cna_summary.tsv.gz" file in the summary tables directory.
cancer_types = [
    "brca", "ccrcc", "colon",
    "endometrial", "gbm", "hnscc",
    "lscc", "luad", "ovarian",
]

In [4]:
# Directory holding one gzipped TSV summary table per cancer type.
summaries_dir = "summary_tables"

# Read each cancer type's table, tag its rows with the cancer type, and collect
# the pieces so they are concatenated once at the end. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending inside a loop
# re-copies the accumulated table on every iteration.
summary_frames = []

for cancer_type in cancer_types:
    file_path = os.path.join(summaries_dir, f"{cancer_type}_cna_summary.tsv.gz")
    df = pd.read_csv(file_path, sep="\t").assign(cancer_type=cancer_type)
    summary_frames.append(df)

# Row labels are kept as read from each file (no ignore_index), matching the
# previous append-based result. pd.concat raises on an empty list, so fall back
# to an empty frame when there is nothing to load.
arm_summary = pd.concat(summary_frames) if summary_frames else pd.DataFrame()

In [5]:
# Preview the first rows of the combined per-arm summary table.
arm_summary.head()

Unnamed: 0,Patient_ID,chromosome,arm,prop_arm_amplified,prop_arm_deleted,cancer_type
0,CPT000814,1,p,0.021756,0.028943,brca
1,CPT000814,1,q,0.782801,0.01951,brca
2,CPT000814,10,p,0.338837,0.0,brca
3,CPT000814,11,p,0.083562,0.005349,brca
4,CPT000814,11,q,2e-06,0.982518,brca


In [6]:
# Collapse the per-arm table to one row per (cancer type, patient, chromosome)
# by adding up the arm-level proportions. NOTE: this is a plain sum of the
# per-arm proportions, not a length-weighted proportion of the chromosome, so
# values can exceed 1.
chr_key_cols = ["cancer_type", "Patient_ID", "chromosome"]
arm_to_chr_names = {
    "prop_arm_amplified": "prop_chr_amplified",
    "prop_arm_deleted": "prop_chr_deleted",
}

chr_summary = (
    arm_summary
    .groupby(chr_key_cols)[list(arm_to_chr_names)]
    .sum()
    .reset_index(drop=False)
    .rename(columns=arm_to_chr_names)
)

# Preview the first rows of the per-chromosome table.
chr_summary.head()

Unnamed: 0,cancer_type,Patient_ID,chromosome,prop_chr_amplified,prop_chr_deleted
0,brca,CPT000814,1,0.804557,0.048453
1,brca,CPT000814,10,0.338837,0.0
2,brca,CPT000814,11,0.083563,0.987867
3,brca,CPT000814,12,1.005555,0.310725
4,brca,CPT000814,13,0.226643,0.695223


In [7]:
def plot_histogram(table, bin_step, title, xrange):
    """Build a bar-chart histogram of the first column of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Data to plot; only the first column is used.
    bin_step : float
        Width of each histogram bin, in the units of the plotted column.
    title : str
        Chart title.
    xrange : int
        Upper end of the labelled x-axis ticks. Ticks are placed every 0.05
        from 0 up to and including ``xrange``.

    Returns
    -------
    alt.Chart
        The configured chart; nothing is displayed or saved here.
    """
    # 20 ticks per unit gives one tick every 0.05. The "+ 1" includes the last
    # tick at ``xrange`` itself; previously the ticks stopped at xrange - 0.05,
    # so the top of the axis (e.g. 1.0) was never labelled.
    tick_values = (np.arange(20 * xrange + 1) / 20).tolist()

    chart = alt.Chart(table).mark_bar().encode(
        x=alt.X(
            table.columns[0],
            bin=alt.Bin(step=bin_step),
            axis=alt.Axis(
                values=tick_values,
                labelAngle=70,
            ),
        ),
        y=alt.Y(
            "count()"
        ),
    ).properties(
        width=800,
        title=title,
    )

    return chart

## Plot 1: Overall distribution of proportions of arms amplified or deleted

In [8]:
# Pool the amplified and deleted proportions into one column so both event
# types feed a single histogram. pd.concat replaces Series.append, which was
# removed in pandas 2.0; row labels are preserved as before.
prop_arm_events = pd.concat(
    [arm_summary["prop_arm_amplified"], arm_summary["prop_arm_deleted"]]
).to_frame(name="prop_covered")

In [9]:
# Histogram of the pooled per-arm proportions (amplified and deleted together).
plot_histogram(
    table=prop_arm_events,
    title="Distribution of amplification and deletion sizes across arms",
    bin_step=0.005,
    xrange=1,
)

## Plot 2: Proportion of chromosome amplified or deleted

In [10]:
# Pool the per-chromosome amplified and deleted values into one column so both
# event types feed a single histogram. pd.concat replaces Series.append, which
# was removed in pandas 2.0; row labels are preserved as before.
prop_chr_events = pd.concat(
    [chr_summary["prop_chr_amplified"], chr_summary["prop_chr_deleted"]]
).to_frame(name="prop_covered")

In [11]:
# Histogram of the pooled per-chromosome values; the axis runs to 2 because
# each value is the sum of the chromosome's arm proportions.
plot_histogram(
    table=prop_chr_events,
    title="Distribution of amplification and deletion sizes across chromosomes",
    bin_step=0.005,
    xrange=2,
)

## Plots 3a and 3b: Same as above, but just amplifications

In [12]:
# Per-arm histogram restricted to amplifications.
plot_histogram(
    table=arm_summary[["prop_arm_amplified"]],
    title="Distribution of amplification sizes across arms",
    bin_step=0.005,
    xrange=1,
)

In [13]:
# Per-chromosome histogram restricted to amplifications.
plot_histogram(
    table=chr_summary[["prop_chr_amplified"]],
    title="Distribution of amplification sizes across chromosomes",
    bin_step=0.005,
    xrange=2,
)

## Plots 4a and 4b: Same as above, but just deletions

In [14]:
# Per-arm histogram restricted to deletions.
plot_histogram(
    table=arm_summary[["prop_arm_deleted"]],
    title="Distribution of deletion sizes across arms",
    bin_step=0.005,
    xrange=1,
)

In [15]:
# Per-chromosome histogram restricted to deletions.
plot_histogram(
    table=chr_summary[["prop_chr_deleted"]],
    title="Distribution of deletion sizes across chromosomes",
    bin_step=0.005,
    xrange=2,
)