In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os

In [2]:
# Cancer type identifiers to process. Each one is interpolated into a
# "<cancer_type>_cna_summary.tsv.gz" file name when the summary tables are
# loaded, and is stored in the "cancer_type" column of the combined table.
cancer_types = [
    "brca", "ccrcc", "colon",
    "endometrial", "gbm", "hnscc",
    "lscc", "luad", "ovarian",
]

In [3]:
# Load the per-cancer-type CNA summary tables and stack them into one frame.
#
# Each table is read from "<summaries_dir>/<cancer_type>_cna_summary.tsv.gz"
# and tagged with a "cancer_type" column so rows stay attributable after
# stacking.
summaries_dir = "summary_tables"

# Collect the frames and concatenate once: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop re-copies the growing frame on
# every iteration.
summary_frames = []

for cancer_type in cancer_types:
    file_path = os.path.join(summaries_dir, f"{cancer_type}_cna_summary.tsv.gz")
    df = (
        # Read "chromosome" as object (strings) rather than letting pandas
        # infer a numeric dtype; later cells compare it to "7" and call
        # str.isdigit() on its values.
        pd.read_csv(file_path, sep="\t", dtype={"chromosome": "O"})
        .assign(cancer_type=cancer_type)
    )
    summary_frames.append(df)

# ignore_index is left at its default (False), so each table keeps its own
# row labels, exactly as the previous append-based loop did.
arm_summary = pd.concat(summary_frames) if summary_frames else pd.DataFrame()

In [4]:
# Add boolean flags marking rows whose "prop_arm_amplified" /
# "prop_arm_deleted" value is at least 0.9.
# NOTE(review): presumably these columns hold the fraction of the arm that is
# amplified / deleted, making 0.9 an "arm-level event" cutoff -- confirm.
arm_summary = arm_summary.assign(
    arm_amp=lambda rows: rows["prop_arm_amplified"].ge(0.9),
    arm_del=lambda rows: rows["prop_arm_deleted"].ge(0.9),
)

In [5]:
# Spot check: count the rows flagged "arm_amp" for chromosome 7, arm q,
# in the "gbm" cancer type (summing booleans counts the True values).
a = arm_summary
is_gbm_7q = (
    a["chromosome"].eq("7")
    & a["arm"].eq("q")
    & a["cancer_type"].eq("gbm")
)
a.loc[is_gbm_7q, "arm_amp"].sum()

72

In [6]:
# Number of distinct Patient_ID values per cancer type; used below as the
# denominator when turning event counts into proportions.
samples_per_cancer = (
    arm_summary[["cancer_type", "Patient_ID"]]
    .drop_duplicates()
    .groupby("cancer_type")
    .count()
    .reset_index()
    .rename(columns={"Patient_ID": "cancer_type_samples"})
)

# Per (cancer type, chromosome, arm):
#   1. sum the boolean flags to get amplification / deletion counts,
#   2. negate the deletion counts so they become non-positive,
#   3. attach the per-cancer-type sample total,
#   4. divide the counts by that total to get proportions.
cancer_summary = (
    arm_summary
    .groupby(["cancer_type", "chromosome", "arm"])[["arm_amp", "arm_del"]]
    .sum()
    .reset_index()
    .rename(columns={"arm_amp": "count_arm_amp", "arm_del": "count_arm_del"})
    .assign(count_arm_del=lambda counts: 0 - counts["count_arm_del"])
    .merge(
        samples_per_cancer,
        on="cancer_type",
        how="outer",
        # Fail loudly if a cancer type ever appears twice in the totals.
        validate="many_to_one",
    )
    .assign(
        prop_arm_amp=lambda merged: merged["count_arm_amp"] / merged["cancer_type_samples"],
        prop_arm_del=lambda merged: merged["count_arm_del"] / merged["cancer_type_samples"],
    )
)

In [7]:
# Reshape to long format: one row per (cancer type, chromosome, arm, event
# kind), with the event kind ("prop_arm_amp" / "prop_arm_del") in
# "amp_or_del" and its value in "prop".
# Note: this overwrites cancer_summary, so the cell cannot be re-run on its
# own -- the wide "prop_arm_*" columns no longer exist after the melt.
id_columns = ["cancer_type", "chromosome", "arm"]
prop_columns = ["prop_arm_amp", "prop_arm_del"]

cancer_summary = cancer_summary[id_columns + prop_columns].melt(
    id_vars=id_columns,
    var_name="amp_or_del",
    value_name="prop",
)

In [8]:
# Base chart: one bar per arm, faceted into one column per cancer type.
# The y axis is fixed to [-1, 1]; bars with prop > 0 are drawn in steelblue
# and all other bars in red.
arm_axis = alt.X("arm", axis=alt.Axis(labelAngle=0, title=None))
prop_axis = alt.Y(
    "prop",
    scale=alt.Scale(domain=(-1, 1)),
    axis=alt.Axis(title="Proportion of samples with event"),
)
bar_color = alt.condition(
    alt.datum.prop > 0,
    alt.value("steelblue"),  # The positive color
    alt.value("red"),  # The negative color
)
cancer_type_facet = alt.Column(
    "cancer_type",
    title=None,
    header=alt.Header(labelOrient="bottom"),
)

chart = (
    alt.Chart(cancer_summary)
    .mark_bar()
    .encode(x=arm_axis, y=prop_axis, color=bar_color)
    .facet(column=cancer_type_facet)
)

# Chromosome display order: all-digit labels first in numeric order, then the
# remaining labels in their order of first appearance in the data.
chromosome_labels = cancer_summary["chromosome"].unique()
numeric_chromosomes = sorted(
    (label for label in chromosome_labels if label.isdigit()),
    key=int,
)
other_chromosomes = [label for label in chromosome_labels if not label.isdigit()]

# One filtered, titled copy of the base chart per chromosome.
chromosome_charts = [
    chart.transform_filter(
        alt.datum.chromosome == label
    ).properties(
        title=f"Chromosome {label}"
    )
    for label in numeric_chromosomes + other_chromosomes
]

# Stack the per-chromosome charts vertically; as the last expression this is
# what the cell displays.
alt.vconcat(*chromosome_charts).configure_title(anchor="middle")