# SCNA analysis step 2: Summarize amplifications and deletions on each arm

We will create a table for each cancer type that has, for each chromosome arm in each patient, the proportion of the arm that was amplified and the proportion that was deleted.

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

### Get chromosome arm lengths from cytoband table
Table downloaded from https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=856734045_VsAQvZHdBDGH2PuUBBiicYCEoYFm&clade=mammal&org=Human&db=hg38&hgta_group=allTables&hgta_track=hg38&hgta_table=cytoBand&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=ucsc_cytoband.tsv

In [2]:
# Read in the cytoband table
cytobands = pd.read_csv("ucsc_cytoband.tsv", sep="\t").\
    rename(columns={"#chrom": "chromosome"})


## Find the length of each chromosome by taking the maximum of all locations given for cytobands

# Drop rows whose chromosome name contains "_" or ends in "M"
# (presumably alternate/unplaced contigs and the mitochondrial
# chromosome -- confirm against the downloaded table)
chr_lengths = cytobands[
        ~cytobands["chromosome"].str.contains("_") & 
        ~cytobands["chromosome"].str.endswith("M")
    ].\
    rename(columns={"chromEnd": "chromosome_len"}).\
    groupby("chromosome")[["chromosome_len"]].\
    max().\
    reset_index()

# Trim the chromosome names to match other tables
chr_lengths = chr_lengths.assign(chromosome=chr_lengths["chromosome"].str[3:])

## Find the lengths of the p arms by getting the locations of the centromeres

# Select just the "acen" stained bands, which are the centromeres.
# Melting stacks every chromStart and chromEnd value into a single
# "centromere_location" column, one row per band boundary.
centromeres = cytobands.\
    loc[cytobands.gieStain == "acen", ["chromosome", "chromStart", "chromEnd"]].\
    melt(id_vars="chromosome", value_name="centromere_location").\
    drop(columns="variable").\
    sort_values(by=["chromosome", "centromere_location"])

# Get the central location for each centromere
# Also trim the chromosome names to not start with "chr"
# NOTE: this assumes each centromere is listed as two adjacent "acen"
# bands, so the boundary they share is the only (chromosome, location)
# row that appears twice. Keeping only duplicated rows, then one copy
# of each, leaves that shared boundary.
centromeres = centromeres[centromeres.duplicated(keep=False)].\
    drop_duplicates(keep="first").\
    assign(chromosome=lambda df: df["chromosome"].str[3:]).\
    sort_values(by=["chromosome", "centromere_location"]).\
    reset_index(drop=True)

# Positions start at 0, so the centromere position is the p arm length
p_arms = centromeres.\
    assign(arm="p").\
    rename(columns={"centromere_location": "arm_length"})[["chromosome", "arm", "arm_length"]]

## Find the lengths of the q arms by subtracting the position of the centromere
## from the overall length of the chromosome

q_arms = chr_lengths.merge(
    right=centromeres,
    how="outer",
    on="chromosome",
    validate="one_to_one"
)

q_arms = q_arms.assign(
    arm="q",
    arm_length=q_arms["chromosome_len"] - q_arms["centromere_location"]
)[["chromosome", "arm", "arm_length"]]

## Combine it all into one table
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
chr_arm_lengths = pd.concat([p_arms, q_arms]).\
    sort_values(by=["chromosome", "arm"]).\
    reset_index(drop=True)

chr_arm_lengths

Unnamed: 0,chromosome,arm,arm_length
0,1,p,123400000
1,1,q,125556422
2,10,p,39800000
3,10,q,93997422
4,11,p,53400000
5,11,q,81686622
6,12,p,35500000
7,12,q,97775309
8,13,p,17700000
9,13,q,96664328


## Generate summaries for each arm in each sample

In [27]:
def summarize_arms(
    cancer_type, 
    arm_lengths, 
    cutoff, 
    input_dir, 
    output_dir
):
    """Summarize how much of each chromosome arm is amplified/deleted per patient.

    Reads ``{input_dir}/{cancer_type}_cna_long.tsv.gz``, keeps the rows whose
    absolute ``cna_val`` is at least ``cutoff``, and for every
    (Patient_ID, chromosome, arm) sums the fraction of the arm covered by
    rows with a positive ``cna_val`` (amplified) and by rows with a negative
    ``cna_val`` (deleted). The summary is written to
    ``{output_dir}/{cancer_type}_cna_long.tsv.gz`` (gzipped, tab-separated)
    and also returned.

    Each row's fraction is ``|end - start| / arm_length`` and the fractions
    are simply added up, so overlapping intervals are counted more than once
    and a proportion can exceed 1.

    Parameters
    ----------
    cancer_type : str
        Prefix of the input and output file names.
    arm_lengths : pandas.DataFrame
        One row per chromosome arm, with columns "chromosome", "arm" and
        "arm_length". "chromosome" must hold strings so that it merges with
        the input table.
    cutoff : float
        Minimum absolute ``cna_val`` for a row to be counted.
    input_dir : str
        Directory containing the long-format CNA table.
    output_dir : str
        Directory to write the summary to; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        Columns "Patient_ID", "chromosome", "arm", "prop_arm_amplified",
        "prop_arm_deleted". Arms with only amplifications or only deletions
        have 0 in the other column. Arms with no row passing the cutoff are
        absent.

    Raises
    ------
    pandas.errors.MergeError
        If ``arm_lengths`` has more than one row for a (chromosome, arm) pair.
    """
    
    input_file = os.path.join(input_dir, f"{cancer_type}_cna_long.tsv.gz")
    
    # It's important that we specify to read the chromosome column as 
    # objects, so that the numbers will be strings, not ints, and thus
    # merge correctly with the arm lengths table
    cna = pd.read_csv(input_file, sep='\t', dtype={"chromosome": "O"})
    
    # Join in the chromosome arm lengths. A left join keeps exactly the
    # rows of the CNA table; arms with no CNA rows would carry a missing
    # cna_val and be dropped by the cutoff filter below anyway.
    cna = cna.merge(
        right=arm_lengths,
        how="left",
        on=["chromosome", "arm"],
        validate="many_to_one"
    )
    
    # Slice out only the genes that pass the cutoff
    cna = cna[cna["cna_val"].abs() >= cutoff]
    
    # For each gene, calculate the proportion of its chromosome arm that it covers.
    cna = cna.assign(prop_arm=(cna["end"] - cna["start"]).abs() / cna["arm_length"])
    
    # Slice out just the columns we need now
    cna = cna[["Patient_ID", "gene", "Database_ID", "cna_val", "chromosome", "arm", "prop_arm"]]
    
    # Separate the amplifications and the deletions
    amps = cna[cna["cna_val"] > 0]
    dels = cna[cna["cna_val"] < 0]
    
    # Calculate the proportion of each arm of each chromosome that is
    # amplified and deleted in each sample
    group_cols = ["Patient_ID", "chromosome", "arm"]

    amp_summary = amps.\
        groupby(group_cols)[["prop_arm"]].\
        sum().\
        rename(columns={"prop_arm": "prop_arm_amplified"}).\
        reset_index(drop=False)
    
    del_summary = dels.\
        groupby(group_cols)[["prop_arm"]].\
        sum().\
        rename(columns={"prop_arm": "prop_arm_deleted"}).\
        reset_index(drop=False)
    
    # Merge them into one summary table. The outer join keeps arms that
    # have only amplifications or only deletions; the missing side
    # becomes 0.
    summary = amp_summary.\
        merge(
            right=del_summary,
            how="outer",
            on=group_cols,
            validate="one_to_one"
        ).\
        fillna(0)
    
    # Save the output. makedirs also creates missing parent directories
    # and does not fail if the directory already exists.
    os.makedirs(output_dir, exist_ok=True)

    # Write the arm-level summary (not the per-gene table) so that the
    # result is persisted even when the caller ignores the return value.
    # The file name is kept unchanged so existing paths keep working.
    file_path = os.path.join(output_dir, f"{cancer_type}_cna_long.tsv.gz")
    summary.to_csv(file_path, index=False, compression="gzip", sep="\t")

    return summary

In [None]:
# Cancer types to summarize; each name is the prefix of that
# cancer type's input and output table file names.
cancer_types = [
    "brca", "ccrcc", "colon",
    "endometrial", "gbm", "hnscc",
    "lscc", "luad", "ovarian",
]

In [None]:
# Settings shared by every cancer type
arm_summary_settings = {
    "arm_lengths": chr_arm_lengths,
    "cutoff": 0.2,
    "input_dir": "long_cna_tables",
    "output_dir": "summary_tables",
}

# summarize_arms writes its output file under output_dir; the returned
# table is not used here.
for cancer_type in cancer_types:
    summarize_arms(cancer_type=cancer_type, **arm_summary_settings)