# Create Gene Location DataFrame

Creates a dataframe using a list of all of the genes referenced in cptac and maps them to their locations in the geneome. The result will be a dataframe with the gene names as the index and the following columns:

* chromosome
* start_bp
* end_bp

The goal will be to create this dataframe so that it can simply be joined with any other dataframe you may need to attach gene location data to.

## Setup

In [1]:
import cptac
import pandas as pd
import numpy as np
import pyensembl
import time
import os

In [2]:
dss = {
    "brca": cptac.Brca(),
    "ccrcc": cptac.Ccrcc(),
    "colon": cptac.Colon(),
    "endometrial": cptac.Endometrial(),
    "gbm": cptac.Gbm(),
    "hnscc": cptac.Hnscc(),
    "lscc": cptac.Lscc(),
    "luad": cptac.Luad(),
    "ovarian": cptac.Ovarian()
}

Checking that brca index is up-to-date...

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Checking that hnscc index is up-to-date...      



Checking that lscc index is up-to-date... 



version 3scc v3.2.......                 
Checking that luad index is up-to-date...



                                            

## Prepare Ensembl API

In [3]:
# Make sure we have the Ensembl data downloaded
# The most recent release is 100, but 99 is still
# recent (Jan 2020), and PyEnsembl only supports
# up to 99 right now.
ensembl = pyensembl.EnsemblRelease(99)

try:
    ensembl.genes() # If this fails, we need to download the data again.
except ValueError as e:
    if str(e).startswith("Missing genome data file from "):
        ensembl.download()
        ensembl.index()
    else:
        raise e from None

## Compile list of genes

In [4]:
all_genes = set()

for df in dss.values():
    cnv = df.get_CNV()
    if isinstance(cnv.columns, pd.MultiIndex):
        all_genes = all_genes.union(set(cnv.columns))
    else:
        for gene in cnv.columns:
            all_genes.add((gene, None))


## Create Dataframe

In [5]:
genes = list()
chromosome = list()
start_bp = list()
end_bp = list()
not_found = list()
db_id = list()
s = time.time()
for gene in list(all_genes):
    genes.append(gene[0])
    db_id.append(gene[1])
    try:
        if gene[1]:
            try:
                info = ensembl.gene_by_id(gene[1])
            except:
                info = ensembl.genes_by_name(gene[0])
        else:
            info = ensembl.genes_by_name(gene[0])
        chromosome.append(info[0].contig)
        start_bp.append(info[0].start)
        end_bp.append(info[0].end)
    except:
        chromosome.append(None)
        start_bp.append(None)
        end_bp.append(None)
        not_found.append(gene)
gene_locations = pd.DataFrame({'Name': genes, 'Database_ID': db_id, 'chromosome': chromosome, 'start_bp': start_bp, 'end_bp': end_bp})
print(time.time() - s)

14.504034042358398


## Add Arm information

In [6]:
def get_arm(row, df):
    df = df[df['#chromosome'] == row.chromosome]
    df = df[df.bp_start < row.start_bp]
    df = df[df.bp_stop > row.end_bp]
    if len(df) > 0:
        return list(df.arm)[0]
    else:
        return None

## Save to file

In [7]:
cytoband = pd.read_csv('data/NCBI_ideogram.csv')

In [8]:
gene_locations['arm'] = gene_locations.apply(lambda x: get_arm(x, cytoband), axis=1)

In [9]:
gene_locations.to_csv('data/gene_locations.tsv', sep='\t')

In [10]:
len(not_found) / len(genes)

0.09137258252869634