# Arm level counts

## Setup

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
import cptac

In [2]:
# Cancer type identifiers to analyze. Each entry is used below to build the name of
# that cancer type's summary table ("<cancer_type>_cna_summary.tsv.gz") and to label
# the rows loaded from it.
cancer_types = [
    "brca",
    "ccrcc",
    "colon",
    "endometrial",
    "gbm",
    "hnscc",
    "lscc",
    "luad",
    "ovarian"
]

## Plots

### Get summaries for all arms in all samples in all cancer types

In [3]:
summaries_dir = "summary_tables"

# Read each cancer type's CNA summary table and tag its rows with the cancer type.
# The per-cancer frames are collected in a list and concatenated once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame inside a loop re-copies every accumulated row on each iteration.
per_cancer_frames = []

for cancer_type in cancer_types:
    file_path = os.path.join(summaries_dir, f"{cancer_type}_cna_summary.tsv.gz")
    # Chromosome names are read as strings (object dtype); the plotting cells call
    # str.isdigit() on them to order the chromosomes.
    df = pd.read_csv(file_path, sep="\t", dtype={"chromosome": "O"}).\
        assign(cancer_type=cancer_type)

    per_cancer_frames.append(df)

# Like the original append-based loop, the row index of each table is kept as-is
# (not reset). pd.concat raises on an empty list, so fall back to an empty frame.
arm_summary = pd.concat(per_cancer_frames) if per_cancer_frames else pd.DataFrame()

In [4]:
# Flag an arm as amplified / deleted in a sample when the corresponding
# prop_arm_amplified / prop_arm_deleted value is at least 0.9.
arm_summary = arm_summary.assign(
    arm_amp=lambda arms: arms["prop_arm_amplified"].ge(0.9),
    arm_del=lambda arms: arms["prop_arm_deleted"].ge(0.9),
)

### Summarize for each arm in each cancer type

In [5]:
# Number of distinct patients per cancer type; used as the denominator below.
samples_per_cancer = (
    arm_summary[["cancer_type", "Patient_ID"]]
    .drop_duplicates(keep="first")
    .groupby("cancer_type")
    .count()
    .reset_index(drop=False)
    .rename(columns={"Patient_ID": "cancer_type_samples"})
)

# For every (cancer type, chromosome, arm): count the flagged samples, negate the
# deletion counts so deletions are drawn below zero, convert the counts to
# proportions of that cancer type's samples, and reshape to long format with one
# row per arm and event type.
cancer_summary = (
    arm_summary
    .groupby(["cancer_type", "chromosome", "arm"])[["arm_amp", "arm_del"]]
    .sum()
    .reset_index(drop=False)
    .rename(columns={"arm_amp": "count_arm_amp", "arm_del": "count_arm_del"})
    .assign(count_arm_del=lambda counts: 0 - counts["count_arm_del"])
    .merge(
        right=samples_per_cancer,
        on="cancer_type",
        how="outer",
        validate="many_to_one",
    )
    .assign(
        prop_arm_amp=lambda counts: counts["count_arm_amp"] / counts["cancer_type_samples"],
        prop_arm_del=lambda counts: counts["count_arm_del"] / counts["cancer_type_samples"],
    )
    .loc[:, ["cancer_type", "chromosome", "arm", "prop_arm_amp", "prop_arm_del"]]
    .melt(
        id_vars=["cancer_type", "chromosome", "arm"],
        var_name="amp_or_del",
        value_name="prop",
    )
)

In [6]:
# Display the per-cancer-type patient counts that serve as the denominators for the
# per-cancer proportions.
samples_per_cancer

Unnamed: 0,cancer_type,cancer_type_samples
0,brca,122
1,ccrcc,106
2,colon,105
3,endometrial,86
4,gbm,97
5,hnscc,109
6,lscc,108
7,luad,109
8,ovarian,100


In [7]:
# Bars are steelblue for positive proportions and red otherwise (deletions are
# stored as negative proportions).
bar_color = alt.condition(
    alt.datum.prop > 0,
    alt.value("steelblue"),
    alt.value("red"),
)

# Base chart: one bar per arm and event type, faceted into one column per cancer type.
chart = (
    alt.Chart(cancer_summary)
    .mark_bar()
    .encode(
        x=alt.X("arm", axis=alt.Axis(labelAngle=0, title=None)),
        y=alt.Y(
            "prop",
            scale=alt.Scale(domain=(-1, 1)),
            axis=alt.Axis(title="Proportion of samples with event"),
        ),
        color=bar_color,
    )
    .facet(
        column=alt.Column(
            "cancer_type",
            title=None,
            header=alt.Header(labelOrient="bottom"),
        )
    )
)

# Numeric chromosome names first, sorted by integer value, followed by the
# non-numeric names in their order of appearance.
chromosome_names = cancer_summary["chromosome"].unique()
chromosome_order = (
    sorted((name for name in chromosome_names if name.isdigit()), key=int)
    + [name for name in chromosome_names if not name.isdigit()]
)

# One faceted row per chromosome, stacked vertically.
per_chromosome_charts = [
    chart
    .transform_filter(alt.datum.chromosome == name)
    .properties(title=f"Chromosome {name}")
    for name in chromosome_order
]

alt.vconcat(*per_chromosome_charts).configure_title(anchor="middle")

### Summarize for each arm across all cancer types

In [8]:
# Each distinct (cancer type, patient) pair counts as one sample.
total_samples = len(
    arm_summary[["cancer_type", "Patient_ID"]].drop_duplicates(keep="first")
)

# For every (chromosome, arm) across all cancer types: count the flagged samples,
# negate the deletion counts so deletions are drawn below zero, convert the counts
# to proportions of all samples, and reshape to long format with one row per arm
# and event type.
overall_summary = (
    arm_summary
    .groupby(["chromosome", "arm"])[["arm_amp", "arm_del"]]
    .sum()
    .reset_index(drop=False)
    .rename(columns={"arm_amp": "count_arm_amp", "arm_del": "count_arm_del"})
    .assign(count_arm_del=lambda counts: 0 - counts["count_arm_del"])
    .assign(
        prop_arm_amp=lambda counts: counts["count_arm_amp"] / total_samples,
        prop_arm_del=lambda counts: counts["count_arm_del"] / total_samples,
    )
    .loc[:, ["chromosome", "arm", "prop_arm_amp", "prop_arm_del"]]
    .melt(
        id_vars=["chromosome", "arm"],
        var_name="amp_or_del",
        value_name="prop",
    )
)

In [9]:
# Bars are steelblue for positive proportions and red otherwise (deletions are
# stored as negative proportions).
bar_color = alt.condition(
    alt.datum.prop > 0,
    alt.value("steelblue"),
    alt.value("red"),
)

# Base chart: one bar per arm and event type, pooled over all cancer types.
chart = (
    alt.Chart(overall_summary)
    .mark_bar()
    .encode(
        x=alt.X("arm", axis=alt.Axis(labelAngle=0, title=None)),
        y=alt.Y(
            "prop",
            scale=alt.Scale(domain=(-1, 1)),
            axis=alt.Axis(title="Proportion of samples with event"),
        ),
        color=bar_color,
    )
)

# Numeric chromosome names first, sorted by integer value, followed by the
# non-numeric names in their order of appearance.
chromosome_names = overall_summary["chromosome"].unique()
chromosome_order = (
    sorted((name for name in chromosome_names if name.isdigit()), key=int)
    + [name for name in chromosome_names if not name.isdigit()]
)

# One panel per chromosome, laid out side by side.
per_chromosome_charts = [
    chart
    .transform_filter(alt.datum.chromosome == name)
    .properties(title=f"Chromosome {name}")
    for name in chromosome_order
]

alt.hconcat(*per_chromosome_charts).configure_title(anchor="middle")

### Sort the summary to show which arms are most frequently amplified or deleted

Our ordering is similar, though not identical, to the ordering in [Genomic and Functional Approaches to Understanding Cancer Aneuploidy](https://www.cell.com/cancer-cell/fulltext/S1535-6108(18)30111-9#secsectitle0020), and the proportions for the top arms are only slightly smaller.

In [10]:
# Shorten the event labels produced by the melt ("prop_arm_amp" -> "amp",
# "prop_arm_del" -> "del"); any other value is left unchanged.
overall_summary = overall_summary.assign(
    amp_or_del=lambda summary: summary["amp_or_del"].replace(
        {"prop_arm_amp": "amp", "prop_arm_del": "del"}
    )
)

# Show the rows ordered by absolute proportion, largest first, so amplifications
# and deletions are ranked together.
overall_summary.loc[
    overall_summary["prop"].abs().sort_values(ascending=False).index
]

Unnamed: 0,chromosome,arm,amp_or_del,prop
89,Y,q,del,-0.322718
88,Y,p,del,-0.319533
35,7,p,amp,0.297240
58,17,p,del,-0.279193
38,8,q,amp,0.257962
22,20,q,amp,0.252654
72,3,p,del,-0.248408
31,5,p,amp,0.247346
1,1,q,amp,0.231423
36,7,q,amp,0.219745


### Make a histogram to show the frequency distribution of different alteration proportions

In [11]:
# Histogram of the per-arm proportions in bins of width 0.05; each bar counts the
# rows of overall_summary (one per arm and event type) falling in that bin.
(
    alt.Chart(overall_summary)
    .mark_bar()
    .encode(
        x=alt.X(
            "prop",
            bin=alt.Bin(step=0.05),
            axis=alt.Axis(title="Proportion of samples with an event"),
        ),
        y=alt.Y("count()", axis=alt.Axis(title="Number of chromosome arms")),
    )
)