In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
from toolz.curried import pipe

In [2]:
# Altair options
# Turn off Altair's max-rows check on the currently active data transformer.
alt.data_transformers.disable_max_rows()

def json_dir(data, data_dir):
    """Altair data transformer that routes chart data through ``alt.to_json``.

    Ensures ``data_dir`` exists, then applies ``alt.to_json`` to ``data`` with
    a filename template located inside ``data_dir``.

    Parameters
    ----------
    data
        The chart data handed over by Altair.
    data_dir
        Directory used in the filename template; created (including parents)
        when it is missing.

    Returns
    -------
    Whatever ``alt.to_json`` returns for ``data``.
    """
    os.makedirs(data_dir, exist_ok=True)
    # The placeholders are left unformatted here; they are passed on verbatim.
    filename_template = os.path.join(data_dir, "{prefix}-{hash}.{extension}")
    write_json = alt.to_json(filename=filename_template)
    return pipe(data, write_json)

# Register json_dir under the name "json_dir", then make it the active data
# transformer; data_dir="plot_data" is the option supplied to it when enabled.
alt.data_transformers.register("json_dir", json_dir)
alt.data_transformers.enable("json_dir", data_dir="plot_data")

DataTransformerRegistry.enable('json_dir')

In [4]:
# Cancer-type identifiers; each one is used as the prefix of a summary-table
# file name and as the value of the cancer_type column.
cancer_types = [
    "brca", "ccrcc", "colon",
    "endometrial", "gbm", "hnscc",
    "lscc", "luad", "ovarian",
]

In [11]:
summaries_dir = "summary_tables"

# Read one "<cancer_type>_cna_summary.tsv.gz" table per cancer type and tag
# every row with the cancer type it came from. The frames are collected in a
# list and concatenated once: DataFrame.append no longer exists in pandas 2.x,
# and growing a frame inside the loop re-copies all earlier rows each time.
cancer_type_frames = []
for cancer_type in cancer_types:
    file_path = os.path.join(summaries_dir, f"{cancer_type}_cna_summary.tsv.gz")
    # chromosome is read with object dtype so its labels are not parsed as numbers.
    df = pd.read_csv(file_path, sep="\t", dtype={"chromosome": "O"}).assign(
        cancer_type=cancer_type
    )
    cancer_type_frames.append(df)

# Row labels are kept as read (they restart for every file), matching what
# repeated append produced. pd.concat rejects an empty list, so fall back to
# an empty frame when there is nothing to combine.
arm_summary = pd.concat(cancer_type_frames) if cancer_type_frames else pd.DataFrame()

In [15]:
# A row is flagged when its prop_arm_amplified / prop_arm_deleted value
# reaches this cut-off (inclusive).
min_arm_fraction = 0.9

amplified_flags = arm_summary["prop_arm_amplified"].ge(min_arm_fraction)
deleted_flags = arm_summary["prop_arm_deleted"].ge(min_arm_fraction)

arm_summary = arm_summary.assign(arm_amp=amplified_flags, arm_del=deleted_flags)

In [16]:
# Preview the first rows only rather than rendering the whole combined table.
arm_summary.head(10)

Unnamed: 0,Patient_ID,chromosome,arm,prop_arm_amplified,prop_arm_deleted,cancer_type,arm_amp,arm_del
0,CPT000814,1,p,0.021756,0.028943,brca,False,False
1,CPT000814,1,q,0.782801,0.019510,brca,False,False
2,CPT000814,10,p,0.338837,0.000000,brca,False,False
3,CPT000814,11,p,0.083562,0.005349,brca,False,False
4,CPT000814,11,q,0.000002,0.982518,brca,False,True
5,CPT000814,12,p,1.000000,0.000000,brca,True,False
6,CPT000814,12,q,0.005555,0.310725,brca,False,False
7,CPT000814,13,q,0.226643,0.695223,brca,False,False
8,CPT000814,14,q,0.023545,0.976437,brca,False,True
9,CPT000814,15,q,0.000000,1.000000,brca,False,True


In [43]:
# For every cancer type / chromosome / arm, add up the boolean arm_amp and
# arm_del flags (True counts as 1) to get the number of flagged rows.
arm_group_keys = ["cancer_type", "chromosome", "arm"]
cancer_summary = (
    arm_summary
    .groupby(arm_group_keys)[["arm_amp", "arm_del"]]
    .sum()
    .reset_index()
    .rename(columns={"arm_amp": "count_arm_amp", "arm_del": "count_arm_del"})
)

# Deletion counts are stored with a negative sign. `0 - x` is used rather than
# unary minus so that a floating-point zero stays 0.0 instead of becoming -0.0.
negated_del_counts = 0 - cancer_summary["count_arm_del"]

# Combined label: the chromosome value immediately followed by the arm value.
chr_arm_labels = cancer_summary["chromosome"] + cancer_summary["arm"]

cancer_summary = cancer_summary.assign(
    count_arm_del=negated_del_counts,
    chr_arm=chr_arm_labels,
)

# Number of distinct (non-null) Patient_ID values seen for each cancer type.
samples_per_cancer = (
    arm_summary
    .groupby("cancer_type")["Patient_ID"]
    .nunique()
    .reset_index(name="cancer_type_samples")
)

# Attach the per-cancer-type sample count to every arm row. validate makes
# pandas raise if a cancer type appears more than once in samples_per_cancer.
cancer_summary = pd.merge(
    left=cancer_summary,
    right=samples_per_cancer,
    how="outer",
    on="cancer_type",
    validate="many_to_one",
)

# Turn the counts into fractions of the samples in the same cancer type.
# prop_arm_del keeps whatever sign count_arm_del carries.
samples_in_cancer_type = cancer_summary["cancer_type_samples"]

cancer_summary = cancer_summary.assign(
    prop_arm_amp=cancer_summary["count_arm_amp"].div(samples_in_cancer_type),
    prop_arm_del=cancer_summary["count_arm_del"].div(samples_in_cancer_type),
)

In [48]:
# Preview the first rows only rather than rendering the whole summary table.
cancer_summary.head(10)

Unnamed: 0,cancer_type,chromosome,arm,count_arm_amp,count_arm_del,chr_arm,cancer_type_samples,prop_arm_amp,prop_arm_del
0,brca,1,p,2.0,-11.0,1p,122,0.016393,-0.090164
1,brca,1,q,67.0,0.0,1q,122,0.549180,0.000000
2,brca,10,p,24.0,-8.0,10p,122,0.196721,-0.065574
3,brca,10,q,2.0,-11.0,10q,122,0.016393,-0.090164
4,brca,11,p,7.0,-8.0,11p,122,0.057377,-0.065574
5,brca,11,q,1.0,-5.0,11q,122,0.008197,-0.040984
6,brca,12,p,10.0,-5.0,12p,122,0.081967,-0.040984
7,brca,12,q,4.0,-3.0,12q,122,0.032787,-0.024590
8,brca,13,q,4.0,-24.0,13q,122,0.032787,-0.196721
9,brca,14,q,3.0,-18.0,14q,122,0.024590,-0.147541


In [50]:
# Reshape to long form: one row per cancer type / chr_arm / measure, with the
# measure name in amp_or_del and its value in prop. Only the two proportion
# columns are carried over.
# NOTE(review): this rebinds cancer_summary to the long table, so running this
# cell a second time without re-running the aggregation first raises a
# KeyError (the prop_arm_* columns no longer exist).
cancer_summary = cancer_summary.melt(
    id_vars=["cancer_type", "chr_arm"],
    value_vars=["prop_arm_amp", "prop_arm_del"],
    var_name="amp_or_del",
    value_name="prop",
)

In [51]:
# Preview the first rows only rather than rendering the whole long-form table.
cancer_summary.head(10)

Unnamed: 0,cancer_type,chr_arm,amp_or_del,prop
0,brca,1p,prop_arm_amp,0.016393
1,brca,1q,prop_arm_amp,0.549180
2,brca,10p,prop_arm_amp,0.196721
3,brca,10q,prop_arm_amp,0.016393
4,brca,11p,prop_arm_amp,0.057377
5,brca,11q,prop_arm_amp,0.008197
6,brca,12p,prop_arm_amp,0.081967
7,brca,12q,prop_arm_amp,0.032787
8,brca,13q,prop_arm_amp,0.032787
9,brca,14q,prop_arm_amp,0.024590


In [53]:
def _chr_arm_sort_key(chr_arm):
    """Sort key placing labels in chromosome order ("2p" before "10p").

    Assumes the arm is the single trailing character of the label and the
    rest is the chromosome -- TODO confirm no multi-character arm values exist.
    Chromosomes that are not purely numeric are ordered after the numeric ones.
    """
    chromosome, arm = chr_arm[:-1], chr_arm[-1:]
    if chromosome.isdigit():
        return (0, int(chromosome), "", arm)
    return (1, 0, chromosome, arm)


# Without an explicit order the nominal x axis is sorted as plain text, which
# puts "10p" ahead of "1p" and "2p". List the labels in chromosome order instead.
chr_arm_order = sorted(
    cancer_summary["chr_arm"].dropna().unique(),
    key=_chr_arm_sort_key,
)

# One bar per chr_arm and measure, one facet row per cancer type.
alt.Chart(cancer_summary).mark_bar().encode(
    x=alt.X("chr_arm:N", sort=chr_arm_order),
    y="prop",
    color=alt.condition(
        alt.datum.prop > 0,
        alt.value("steelblue"),  # bars with a positive prop value
        alt.value("red")  # bars whose prop value is zero or negative
    )
).facet(
    row="cancer_type"
)