# Make CNA event tables: Chromosome 7

- For each cancer type, convert CNA table to long format (i.e., columns are [Patient_ID, gene, cna_val], so one row for each gene for each sample)
- Then, get the location and chromosome arm for each gene and record it.
- Then, we will create a table for each cancer type that has, for each chromosome arm in each patient, the proportion of the arm that was amplified and the proportion that was deleted.

## Setup

In [1]:
import cptac
import pandas as pd
import numpy as np
import pyensembl
import os

In [2]:
# Map each cancer-type label used in this notebook to the cptac class
# that loads the corresponding dataset. Calling a value constructs the
# dataset object.
dss = dict(
    brca=cptac.Brca,
    ccrcc=cptac.Ccrcc,
    colon=cptac.Colon,
    endometrial=cptac.Endometrial,
    gbm=cptac.Gbm,
    hnscc=cptac.Hnscc,
    lscc=cptac.Lscc,
    luad=cptac.Luad,
    ovarian=cptac.Ovarian,
)

### Prepare Ensembl API

In [3]:
# Use Ensembl release 99 (Jan 2020). Release 100 existed when this was
# written, but PyEnsembl only supported releases up to 99 at the time.
ensembl = pyensembl.EnsemblRelease(99)

# Probe the local cache by listing genes. A ValueError whose message begins
# with "Missing genome data file from " means the data has not been fetched
# yet, so download and index it. Any other ValueError is re-raised.
try:
    ensembl.genes()
except ValueError as err:
    if not str(err).startswith("Missing genome data file from "):
        raise err from None
    ensembl.download()
    ensembl.index()

### Get centromere locations from cytoband table
Table downloaded from https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=856734045_VsAQvZHdBDGH2PuUBBiicYCEoYFm&clade=mammal&org=Human&db=hg38&hgta_group=allTables&hgta_track=hg38&hgta_table=cytoBand&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=ucsc_cytoband.tsv

Note that the p arm is always before the q arm. So if a gene's location is before the centromere location, it's on the p arm; if it's after, it's on the q arm.

In [4]:
# Load the UCSC cytoband table and give the chromosome column a plain name
cytobands = pd.read_csv("ucsc_cytoband.tsv", sep="\t")
cytobands = cytobands.rename(columns={"#chrom": "chromosome"})

# Keep only the "acen" stained bands (the centromeres) and stack their
# start and end coordinates into a single column of band boundaries
centromeres = (
    cytobands
    .loc[cytobands["gieStain"] == "acen", ["chromosome", "chromStart", "chromEnd"]]
    .melt(id_vars="chromosome", value_name="centromere_location")
    .drop(columns="variable")
    .sort_values(by=["chromosome", "centromere_location"])
)

# A boundary that appears more than once for a chromosome is the point
# shared by its "acen" bands; keep one copy of it as the centromere
# location. Also strip the leading "chr" from the chromosome names.
centromeres = (
    centromeres
    .loc[lambda df: df.duplicated(keep=False)]
    .drop_duplicates(keep="first")
    .assign(chromosome=lambda df: df["chromosome"].str[3:])
    .sort_values(by=["chromosome", "centromere_location"])
    .reset_index(drop=True)
)

centromeres.head()

Unnamed: 0,chromosome,centromere_location
0,1,123400000
1,10,39800000
2,11,53400000
3,12,35500000
4,13,17700000


## Format tables and get location data

In [5]:
def _lookup_gene_location(pyensembl_obj, db_id, gene_name):
    """Return (contig, start, end) for one gene, or three NaNs if not found.

    The gene is looked up by Ensembl ID when `db_id` is not null. If that
    lookup reports "Gene not found", or if `db_id` is null, the gene is
    looked up by name instead and the first match is used. Any ValueError
    with an unexpected message is re-raised.
    """
    if pd.notnull(db_id):
        try:
            info = pyensembl_obj.gene_by_id(db_id)
        except ValueError as e:
            if not str(e).startswith("Gene not found: "):
                raise e from None
            # Not found by ID: fall through and try the gene name instead.
        else:
            return info.contig, info.start, info.end

    # It appears that some genes have old names that are out of date,
    # such as LSMD1. If we want to get even better coverage, we could
    # try querying HGNC with old gene names, if the below returns nothing.
    # But we're getting fairly good coverage right now, and we don't
    # want to waste time going down an unnecessary rabbit hole.
    try:
        matches = pyensembl_obj.genes_by_name(gene_name)
    except ValueError as e:
        if not str(e).startswith("No results found for query"):
            raise e from None
        return np.nan, np.nan, np.nan

    return matches[0].contig, matches[0].start, matches[0].end


def make_long_table_with_locs(
    cancer_type,
    datasets_dict,
    pyensembl_obj,
    centromere_locs
):
    """Build a long-format CNA table annotated with gene location and arm.

    Parameters
    ----------
    cancer_type
        Key into `datasets_dict` selecting which dataset to load.
    datasets_dict
        Mapping from cancer type to a zero-argument callable returning an
        object with a `get_CNV()` method.
    pyensembl_obj
        Object providing `gene_by_id` and `genes_by_name` lookups.
    centromere_locs
        DataFrame with one row per chromosome and the columns
        "chromosome" and "centromere_location".

    Returns
    -------
    tuple
        `(not_found_prop, cna)`. `not_found_prop` is the fraction of rows
        in the long table whose gene could not be located. `cna` has one
        row per located gene per sample, with the added columns
        "chromosome", "start", "end" and "arm" ("p" or "q").
    """

    # Reformat the tables
    cna = datasets_dict[cancer_type]().\
        get_CNV().\
        reset_index().\
        melt(id_vars="Patient_ID", value_name="cna_val").\
        rename(columns={"Name": "gene"})

    # If there's a Database_ID column, format it; if there isn't one,
    # add an empty one for consistency
    if "Database_ID" in cna.columns:
        # Parse the database IDs to remove version numbers from Ensembl IDs, as they interfere with lookup.
        cna = cna.assign(
            Database_ID=cna["Database_ID"].str.rsplit(".", n=1, expand=True)[0]
        )
    else:
        cna.insert(2, "Database_ID", np.nan)

    # Now we'll get the location data for the genes. The long table repeats
    # every gene once per sample, so cache each lookup and query Ensembl only
    # once per distinct (Database_ID, gene) pair.
    location_cache = {}
    chrs = []
    starts = []
    ends = []

    for db_id, gene in zip(cna["Database_ID"], cna["gene"]):
        # NaN is not reliably equal to itself as a dict key, so use None
        # to represent a missing Database_ID in the cache key.
        key = (db_id if pd.notnull(db_id) else None, gene)
        if key not in location_cache:
            location_cache[key] = _lookup_gene_location(pyensembl_obj, db_id, gene)

        contig, start, end = location_cache[key]
        chrs.append(contig)
        starts.append(start)
        ends.append(end)

    # Add the columns we created
    cna = cna.assign(
        chromosome=chrs,
        start=starts,
        end=ends
    )

    # Check what proportion of genes we didn't find info for
    not_found_prop = pd.isnull(cna["chromosome"]).sum() / cna.shape[0]

    # Select only the genes we found info for
    cna = cna[pd.notnull(cna["start"])]

    # Join in the centromere locations. This is a left join so that a
    # chromosome present in centromere_locs but absent from the data does
    # not add an all-NaN row to the table.
    cna = cna.merge(
        right=centromere_locs,
        how="left",
        on="chromosome",
        validate="many_to_one"
    )

    # Drop any rows where the centromere location is NaN--their chromosomes are listed
    # as things like KI270733.1, GL000194.1, and GL000220.1
    cna = cna[pd.notnull(cna["centromere_location"])]

    # Add a column identifying p or q arm. Then drop the
    # (now unneeded) centromere location column
    cna = cna.\
        assign(arm=np.where(cna["start"] < cna["centromere_location"], "p", "q")).\
        drop(columns="centromere_location")

    return not_found_prop, cna

In [6]:
# Build the annotated long CNA table for every cancer type, keeping each
# table in cnas_dfs and a one-line coverage report in not_found.
cnas_dfs = {}
not_found = []

for cancer_type in dss:
    prop_not_found, cna = make_long_table_with_locs(
        cancer_type,
        dss,
        ensembl,
        centromeres,
    )

    cnas_dfs[cancer_type] = cna
    not_found.append(f"{cancer_type}: {prop_not_found * 100:.2f} % not found")

# One report line per cancer type
print(*not_found, sep="\n")

Checking that brca index is up-to-date...

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


                                                



                                          



                                         



brca: 3.72 % not found                      
ccrcc: 0.14 % not found
colon: 12.51 % not found
endometrial: 9.35 % not found
gbm: 1.65 % not found
hnscc: 1.95 % not found
lscc: 13.23 % not found
luad: 3.76 % not found
ovarian: 5.88 % not found


## Summarize amplifications and deletions on each arm

In [7]:
def _sum_arm_proportions(events, value_name):
    """Sum prop_arm per (Patient_ID, chromosome, arm) into a column named value_name."""
    return events.\
        groupby(["Patient_ID", "chromosome", "arm"])[["prop_arm"]].\
        sum().\
        rename(columns={"prop_arm": value_name}).\
        reset_index(drop=False)


def summarize_arms(
    cna,
    cutoff,
    input_dir,
    output_dir,
    cancer_type=None
):
    """Summarize, per patient and chromosome arm, how much is amplified or deleted.

    Parameters
    ----------
    cna
        Long-format CNA table with the columns "Patient_ID", "cna_val",
        "chromosome", "arm", "start" and "end".
    cutoff
        A gene counts as an event when the absolute value of its `cna_val`
        is at least this large.
    input_dir
        Unused. Kept so that existing calls keep working.
    output_dir
        Directory the summary is written to, as
        "<cancer_type>_cna_summary.tsv.gz". Created if it does not exist.
    cancer_type
        Label used in the output file name. If omitted, the module-level
        variable named `cancer_type` is used, which is how this function
        originally obtained the label.

    Returns
    -------
    DataFrame
        One row per (Patient_ID, chromosome, arm) with the columns
        "prop_arm_amplified" and "prop_arm_deleted". An arm with no gene
        passing the cutoff in a given patient has no row in this table.

    Raises
    ------
    NameError
        If `cancer_type` is omitted and no module-level `cancer_type` exists.
    """
    # Resolve the label up front, so a missing label fails before any work
    # is done rather than at the point of saving.
    if cancer_type is None:
        try:
            cancer_type = globals()["cancer_type"]
        except KeyError:
            raise NameError(
                "name 'cancer_type' is not defined; pass cancer_type=... to summarize_arms"
            ) from None

    arm_keys = ["Patient_ID", "chromosome", "arm"]

    # For each arm, calculate the length of genes on that arm that we
    # have coverage for. We'll use this as the arm length when calculating
    # the proportion of the arm covered, instead of the actual arm length,
    # so that the proportions covered aren't biased against arms with
    # fewer genes.
    cna = cna.assign(gene_length=(cna["end"] - cna["start"]).abs())

    arm_data_lens = cna.\
        groupby(arm_keys)[["gene_length"]].\
        sum().\
        rename(columns={"gene_length": "arm_data_length"}).\
        reset_index(drop=False)

    # Every key in arm_data_lens comes from cna itself, so a left join
    # attaches the arm totals without being able to add rows.
    cna = cna.merge(
        right=arm_data_lens,
        how="left",
        on=arm_keys,
        validate="many_to_one"
    )

    # Slice out only the genes that pass the cutoff
    cna = cna[cna["cna_val"].abs() >= cutoff]

    # For each gene, calculate the proportion of its chromosome arm that it covers.
    cna = cna.assign(prop_arm=cna["gene_length"] / cna["arm_data_length"])

    # Slice out just the columns we need now
    cna = cna[["Patient_ID", "cna_val", "chromosome", "arm", "prop_arm"]]

    # Calculate the proportion of each arm of each chromosome that is
    # amplified and deleted in each sample
    amp_summary = _sum_arm_proportions(cna[cna["cna_val"] > 0], "prop_arm_amplified")
    del_summary = _sum_arm_proportions(cna[cna["cna_val"] < 0], "prop_arm_deleted")

    # Merge them into one summary table. An arm that has only amplifications
    # or only deletions gets 0 for the other column.
    summary = amp_summary.\
        merge(
            right=del_summary,
            how="outer",
            on=arm_keys,
            validate="one_to_one"
        ).\
        fillna({"prop_arm_amplified": 0, "prop_arm_deleted": 0}).\
        sort_values(by=arm_keys).\
        reset_index(drop=True)

    # Save the output. makedirs also creates missing parent directories and
    # does not fail if the directory already exists.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{cancer_type}_cna_summary.tsv.gz")
    summary.to_csv(output_file, index=False, compression="gzip", sep="\t")

    return summary

In [8]:
# Write one arm-level summary table per cancer type.
# NOTE: keep the loop variable named `cancer_type`. summarize_arms is not
# passed the label here and picks it up from this module-level name when
# it builds the output file name.
for cancer_type in dss:
    summarize_arms(
        cnas_dfs[cancer_type],
        0.2,
        input_dir="long_cna_tables",
        output_dir="01_event_tables",
    )