# SCNA analysis step 2: Summarize amplifications and deletions on each arm

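We will create a table for each cancer type that has, for each chromosome arm in each patient, the proportion of the arm that was amplified and the proportion that was deleted. Proportions are measured relative to the total length of the genes on that arm that we have CNA data for, rather than the physical length of the arm.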

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def summarize_arms(
    cancer_type,
    cutoff,
    input_dir,
    output_dir
):
    """Summarize how much of each chromosome arm is amplified or deleted.

    Reads ``<input_dir>/<cancer_type>_cna_long.tsv.gz``, a tab-separated
    long-format table that must provide the columns ``Patient_ID``,
    ``chromosome``, ``arm``, ``start``, ``end`` and ``cna_val``. For every
    (patient, chromosome, arm) group, the lengths of the genes whose
    ``abs(cna_val) >= cutoff`` are summed separately for positive values
    (amplifications) and negative values (deletions), and each sum is divided
    by the total length of all genes listed for that group.

    The result is written to
    ``<output_dir>/<cancer_type>_cna_summary.tsv.gz`` (gzip-compressed,
    tab-separated, no index column). ``output_dir`` is created, including
    any missing parent directories, if it does not exist yet.

    Parameters
    ----------
    cancer_type : str
        Prefix used to build the input and output file names.
    cutoff : float
        Minimum absolute ``cna_val`` for a gene to count as amplified or
        deleted.
    input_dir : str
        Directory containing the long-format CNA table.
    output_dir : str
        Directory the summary table is written to.

    Returns
    -------
    pandas.DataFrame
        One row per (``Patient_ID``, ``chromosome``, ``arm``) that has at
        least one gene passing the cutoff, with the columns
        ``prop_arm_amplified`` and ``prop_arm_deleted``, sorted by the three
        key columns. Groups in which no gene passes the cutoff do not appear
        in the table.
    """
    arm_keys = ["Patient_ID", "chromosome", "arm"]

    input_file = os.path.join(input_dir, f"{cancer_type}_cna_long.tsv.gz")

    # Read the chromosome column as objects so that numeric chromosome names
    # stay strings instead of being parsed as ints.
    cna = pd.read_csv(input_file, sep="\t", dtype={"chromosome": "O"})

    # Use the summed length of the genes we have data for as the arm length,
    # instead of the actual arm length, so that the proportions aren't biased
    # against arms with fewer genes.
    cna = cna.assign(gene_length=(cna["end"] - cna["start"]).abs())

    arm_data_lens = (
        cna.groupby(arm_keys)[["gene_length"]]
        .sum()
        .rename(columns={"gene_length": "arm_data_length"})
        .reset_index(drop=False)
    )

    # Attach each gene's arm total to its row. The outer join is kept as in
    # the original pipeline so row handling stays exactly the same.
    cna = cna.merge(
        right=arm_data_lens,
        how="outer",
        on=arm_keys,
        validate="many_to_one",
    )

    # Keep only the genes that pass the cutoff, then express each one as a
    # fraction of the data-covered length of its arm.
    cna = cna[cna["cna_val"].abs() >= cutoff]
    cna = cna.assign(prop_arm=cna["gene_length"] / cna["arm_data_length"])
    cna = cna[["Patient_ID", "cna_val", "chromosome", "arm", "prop_arm"]]

    def _sum_by_arm(events, out_col):
        # Total the per-gene proportions for each patient/chromosome/arm.
        return (
            events.groupby(arm_keys)[["prop_arm"]]
            .sum()
            .rename(columns={"prop_arm": out_col})
            .reset_index(drop=False)
        )

    amp_summary = _sum_by_arm(cna[cna["cna_val"] > 0], "prop_arm_amplified")
    del_summary = _sum_by_arm(cna[cna["cna_val"] < 0], "prop_arm_deleted")

    # An arm that has only amplifications (or only deletions) is missing from
    # the other table; after the outer merge that gap means "none", i.e. 0.
    summary = (
        amp_summary.merge(
            right=del_summary,
            how="outer",
            on=arm_keys,
            validate="one_to_one",
        )
        .fillna({"prop_arm_amplified": 0, "prop_arm_deleted": 0})
        .sort_values(by=arm_keys)
        .reset_index(drop=True)
    )

    # makedirs with exist_ok=True also creates missing parent directories and
    # does not fail if the directory appears between a check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{cancer_type}_cna_summary.tsv.gz")
    summary.to_csv(output_file, index=False, compression="gzip", sep="\t")

    return summary

In [3]:
# Cancer types to summarize. Each entry is used as the file-name prefix when
# the per-cancer input and output tables are located.
cancer_types = [
    "brca", "ccrcc", "colon",
    "endometrial", "gbm", "hnscc",
    "lscc", "luad", "ovarian",
]

In [4]:
# Build and save the arm-level summary table for every cancer type, reading
# from "long_cna_tables" and writing to "summary_tables". A gene counts as
# amplified or deleted when the absolute value of its cna_val is at least 0.2.
for cancer_type in cancer_types:
    summarize_arms(cancer_type, 0.2, "long_cna_tables", "summary_tables")