# Crossover Location

## Install necessary packages

In [1]:
!pip install cptac
!pip install --upgrade cptac

Requirement already up-to-date: cptac in c:\users\chels\anaconda3\lib\site-packages (0.8.1)


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import cptac
import pyensembl
from scipy import stats

## Get the Data

In [3]:
# Instantiate one cptac dataset object per cancer type. The CNV tables used in
# the rest of the notebook are read from these objects.
luad = cptac.Luad()
hnscc = cptac.Hnscc()
ovarian = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
lscc = cptac.Lscc()

Checking that hnscc index is up-to-date...



Checking that ovarian index is up-to-date...



Loading ovarian v0.0.1........              

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


version 3scc v3.2.......                  
                            



In [4]:
# Fetch the CNV table of every dataset via get_CNV().
# Downstream code iterates over the columns of these tables and averages over
# axis 0 -- presumably rows are samples and columns are genes (TODO confirm).
luad_cnv = luad.get_CNV()
hnscc_cnv = hnscc.get_CNV()
ovarian_cnv =  ovarian.get_CNV()
colon_cnv = colon.get_CNV()
brca_cnv = brca.get_CNV()
lscc_cnv = lscc.get_CNV()

In [5]:
# Gene-annotation handle used by add_chromo_and_loc to look genes up by name.
# NOTE(review): no release is passed, so whatever pyensembl uses by default is
# what determines the coordinates -- consider pinning it for reproducibility.
ensembl = pyensembl.EnsemblRelease()

## Create Counts Table

In [6]:
def separate(hiCut, lowCut, data):
  """
  Tally, for every column of `data`, how many values are high, neutral or low.

  A value counts as high when it is strictly greater than `hiCut`, as low when
  it is strictly less than `lowCut`, and as neutral otherwise. Values that
  compare False against both cutoffs (e.g. NaN) therefore land in the neutral
  bucket.

  Returns a dict mapping each column key to `[numHi, numNeu, numLow]`.
  """
  # Bucket positions inside each per-column tally list.
  HI, NEUTRAL, LOW = 0, 1, 2
  tallies = {}
  for name in data:
    buckets = [0, 0, 0]
    for value in data[name]:
      # The high test comes first, exactly as in an if/elif chain.
      slot = HI if value > hiCut else (LOW if value < lowCut else NEUTRAL)
      buckets[slot] += 1
    tallies[name] = buckets
  return tallies

In [7]:
def get_counts_and_average(data, hi_cut=.3, low_cut=-.2):
  """
  Summarise a CNV dataframe per column (gene).

  Parameters
  ----------
  data : pandas.DataFrame
      CNV values; every column is summarised independently.
  hi_cut : float, optional
      Values strictly above this cutoff count as amplifications (default .3).
  low_cut : float, optional
      Values strictly below this cutoff count as deletions (default -.2).

  Returns
  -------
  pandas.DataFrame
      One row per column of `data`, with the columns "amplifications",
      "neutral", "deletions" (counts produced by `separate`) and "average"
      (the column mean).
  """
  counts = pd.DataFrame(
      data=separate(hi_cut, low_cut, data),
      index=["amplifications", "neutral", "deletions"],
  )
  average = data.mean(0).to_frame(name="average").transpose()
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
  # equivalent and accepts the same `sort` flag.
  df = pd.concat([counts, average], sort=True)
  return df.transpose()

In [40]:
def add_chromo_and_loc(my_dict):
    """
    Annotate a gene-indexed DataFrame with chromosome and coordinates.

    Every gene name (level 0 of the index) is looked up through the
    module-level `ensembl` object; the first entry returned by
    `genes_by_name` supplies the values.

    Parameters
    ----------
    my_dict : pandas.DataFrame
        Despite the name, a DataFrame whose index (level 0) holds gene names.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with four added columns:
        'chromo' (contig), 'location' (midpoint of start and end), 'start'
        and 'end'. Genes whose lookup fails get None in all four columns.
    """
    records = []
    for gene in my_dict.index.get_level_values(0):
        try:
            match = ensembl.genes_by_name(gene)[0]
            # Read every attribute before recording anything, so a failure can
            # never leave the four columns with different lengths.
            record = (match.contig, (match.start + match.end) / 2, match.start, match.end)
        except Exception:
            # Best-effort lookup: unknown genes are kept with empty annotation.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # stop this potentially long loop.
            record = (None, None, None, None)
        records.append(record)
    my_dict['chromo'] = [record[0] for record in records]
    my_dict['location'] = [record[1] for record in records]
    my_dict['start'] = [record[2] for record in records]
    my_dict['end'] = [record[3] for record in records]
    return my_dict

In [38]:
# Per cancer type: count amplified / neutral / deleted values for every CNV
# column and attach the column average (see get_counts_and_average).
luad_counts = get_counts_and_average(luad_cnv)
hnscc_counts = get_counts_and_average(hnscc_cnv)
ovarian_counts = get_counts_and_average(ovarian_cnv)
colon_counts = get_counts_and_average(colon_cnv)
brca_counts = get_counts_and_average(brca_cnv)
lscc_counts = get_counts_and_average(lscc_cnv)

In [41]:
# Annotate every counts table with chromosome and start/end/midpoint
# coordinates. add_chromo_and_loc modifies the frame it receives and returns
# that same frame.
luad_counts = add_chromo_and_loc(luad_counts)
hnscc_counts = add_chromo_and_loc(hnscc_counts)
ovarian_counts = add_chromo_and_loc(ovarian_counts)
colon_counts = add_chromo_and_loc(colon_counts)
brca_counts = add_chromo_and_loc(brca_counts)
lscc_counts = add_chromo_and_loc(lscc_counts)

In [42]:
# Tag every row of each counts table with its cancer-type label.
luad_counts['cancer'] = 'LUAD'
hnscc_counts['cancer'] = 'HNSCC'
ovarian_counts['cancer'] = 'OVARIAN'
colon_counts['cancer'] = 'COLON'
brca_counts['cancer'] = 'BRCA'
lscc_counts['cancer'] = 'LSCC'

## Subset Chromosome 8

In [43]:
# Chromosome to analyse. Kept as a string because it is compared against the
# `chromo` column, which stores the contig value returned by the Ensembl lookup
# (presumably a string -- TODO confirm).
chromosome_num = '8'

In [44]:
# Drop any genes that we don't have a location for
luad_counts_cleaned = luad_counts.dropna(subset=['location'])
hnscc_counts_cleaned = hnscc_counts.dropna(subset=['location'])
ovarian_counts_cleaned = ovarian_counts.dropna(subset=['location'])
colon_counts_cleaned = colon_counts.dropna(subset=['location'])
brca_counts_cleaned = brca_counts.dropna(subset=['location'])
lscc_counts_cleaned = lscc_counts.dropna(subset=['location'])

In [45]:
# Keep only the genes whose chromosome equals `chromosome_num`.
luad_counts_8 = luad_counts_cleaned[luad_counts_cleaned.chromo==chromosome_num]
hnscc_counts_8 = hnscc_counts_cleaned[hnscc_counts_cleaned.chromo==chromosome_num]
ovarian_counts_8 = ovarian_counts_cleaned[ovarian_counts_cleaned.chromo==chromosome_num]
colon_counts_8 = colon_counts_cleaned[colon_counts_cleaned.chromo==chromosome_num]
brca_counts_8 = brca_counts_cleaned[brca_counts_cleaned.chromo==chromosome_num]
lscc_counts_8 = lscc_counts_cleaned[lscc_counts_cleaned.chromo==chromosome_num]

In [46]:
# Display the LUAD genes of the selected chromosome ordered by midpoint
# position. sort_values returns a new frame; luad_counts_8 itself is unchanged.
luad_counts_8.sort_values("location")

Unnamed: 0_level_0,amplifications,neutral,deletions,average,chromo,location,start,end,cancer
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OR4F21,1.0,97.0,11.0,-0.016183,8,166546.0,166049.0,167043.0,LUAD
ZNF596,0.0,73.0,36.0,-0.127733,8,248420.0,232137.0,264703.0,LUAD
FBXO25,0.0,70.0,39.0,-0.137943,8,442197.5,406428.0,477967.0,LUAD
TDRP,0.0,70.0,39.0,-0.141256,8,517786.5,489792.0,545781.0,LUAD
ERICH1,0.0,70.0,39.0,-0.141256,8,676426.0,614746.0,738106.0,LUAD
...,...,...,...,...,...,...,...,...,...
ZNF7,25.0,83.0,1.0,0.183194,8,144837486.5,144827464.0,144847509.0,LUAD
COMMD5,25.0,83.0,1.0,0.186339,8,144847389.0,144841042.0,144853736.0,LUAD
ZNF250,26.0,82.0,1.0,0.188671,8,144889332.5,144876497.0,144902168.0,LUAD
ZNF16,25.0,83.0,1.0,0.189417,8,144940623.0,144930358.0,144950888.0,LUAD


In [47]:
def get_crossovers(df):
    """
    Find where the dominant CNV event flips between amplification and deletion.

    Rows are visited in their existing order, so sort `df` by position first.
    Columns are read by position: the first column is taken as the
    amplification count and the third as the deletion count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by gene; first column = amplifications, third column =
        deletions.

    Returns
    -------
    list of tuple
        One `(previous_gene, gene)` pair of index labels for every row at which
        the dominant event differs from the dominant event seen so far. Rows
        with equal counts never trigger a crossover and keep the current state.
    """
    crossover_genes = []
    # Unknown until a row with unequal counts is seen. Seeding the state from
    # the data (rather than assuming 'del') avoids reporting a bogus crossover
    # such as (None, first_gene) when the frame starts amplification-dominant.
    higher = None
    prev = None
    for row in df.itertuples():
        gene, amplified, deleted = row[0], row[1], row[3]
        if amplified > deleted:
            current = 'amp'
        elif amplified < deleted:
            current = 'del'
        else:
            current = higher  # tie: stay in whatever state we were in
        if higher is not None and current != higher:
            crossover_genes.append((prev, gene))
        higher = current
        prev = gene
    return crossover_genes

In [87]:
# get_crossovers walks the rows in order, so sort by genomic position first.
get_crossovers(hnscc_counts_8.sort_values('location'))

[('ADGRA2', 'BRF2'),
 ('ADAM5', 'ADAM3A'),
 ('AC123767.1', 'ADAM18'),
 ('AC022616.2', 'AC022616.5'),
 ('AC022616.5', 'RNU6-104P'),
 ('POTEA', 'AC022616.7'),
 ('AC022616.7', 'AC022616.1')]

In [101]:
# Look up the start coordinate of one gene from the HNSCC crossover list.
hnscc_counts_8.loc['AC022616.1'].start

43378297.0