# SCNA analysis step 2: Get locations of genes

- For each cancer type:
    1. Assign locations to each gene
    2. Based on location, assign which chromosome arm it's on

## Setup

In [1]:
import cptac
import pandas as pd
import numpy as np
import pyensembl
import os

In [5]:
# Load Ensembl release 99 and make sure its data is available locally.
# Release 100 is the newest, but 99 is still recent (Jan 2020) and is the
# latest release PyEnsembl supports right now.
ensembl = pyensembl.EnsemblRelease(99)

try:
    # Probe the local data; a ValueError here may mean it was never downloaded.
    ensembl.genes()
except ValueError as err:
    # Only the "missing data file" error is recoverable; re-raise anything else.
    if not str(err).startswith("Missing genome data file from "):
        raise err from None

    # Fetch the release data, then build the local index for it.
    ensembl.download()
    ensembl.index()

## Get gene locations

In [6]:
def add_genes_locations(cancer_type, data_dir):
    """Annotate one cancer type's CNA table with gene coordinates, in place on disk.

    Reads "<cancer_type>_cna_long.tsv.gz" from data_dir, looks up every row's
    gene in the module-level `ensembl` release, adds "chromosome", "start",
    and "end" columns, drops the rows whose gene could not be located, and
    writes the result back to the same file path.

    Parameters:
    cancer_type (str): The name of the cancer type. Must match its name in the file name to read from.
    data_dir (str): The path to the directory with the CNA files from step 1.

    Returns:
    The proportion of rows in the table for which no gene location was found.
    """

    def _lookup_location(row):
        """Return (contig, start, end) for a row's gene, or NaNs if it isn't found."""

        # First choice: query by Database_ID, when the row has one.
        gene_id = row.Database_ID
        if pd.notnull(gene_id):
            try:
                gene = ensembl.gene_by_id(gene_id)
            except ValueError as err:
                if not str(err).startswith("Gene not found: "):
                    raise err from None
                # Unknown ID: fall through and retry with the gene name below.
            else:
                return gene.contig, gene.start, gene.end

        # Second choice: query by gene name. We get here if Database_ID is
        # null, or if we don't find anything with the Database_ID.
        #
        # Some genes carry out-of-date names (e.g. LSMD1). Querying HGNC with
        # old gene names could improve coverage further, but coverage is
        # already fairly good, so that hasn't been pursued.
        try:
            matches = ensembl.genes_by_name(row.gene)
        except ValueError as err:
            if not str(err).startswith("No results found for query"):
                raise err from None
            return np.nan, np.nan, np.nan

        # Several genes may share a name; use the first one returned.
        first_match = matches[0]
        return first_match.contig, first_match.start, first_match.end

    file_path = os.path.join(data_dir, f"{cancer_type}_cna_long.tsv.gz")
    cna = pd.read_csv(file_path, sep='\t')

    # One (chromosome, start, end) tuple per row, in table order
    locations = [_lookup_location(row) for row in cna.itertuples(index=False)]

    # Add the location columns
    cna = cna.assign(
        chromosome=[loc[0] for loc in locations],
        start=[loc[1] for loc in locations],
        end=[loc[2] for loc in locations]
    )

    # Rows with a null start are the ones we found no info for
    is_missing = cna["start"].isnull()
    not_found_prop = is_missing.sum() / cna.shape[0]

    # Keep only the rows we found info for
    cna = cna[~is_missing]

    # Overwrite the input file with the annotated table
    cna.to_csv(file_path, index=False, compression="gzip", sep="\t")

    return not_found_prop

In [19]:
# Names of the cancer types to process. Presumably each is passed to
# add_genes_locations, in which case it must match the prefix of that
# type's "<name>_cna_long.tsv.gz" file -- TODO confirm against the loop below.
cancer_types = [
    "brca",
    "ccrcc",
    "colon",
    "endometrial",
    "gbm",
    "hnscc",
    "lscc",
    "luad",
    "ovarian"
]

In [None]:
for cancer_type in cancer_types: