# Make counts table

This counts how many patients have a CNV (amplification or deletion) at each gene in the selected chromosome.

In [1]:
import cnvutils
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import os
import pandas as pd
import seaborn as sns

## Import parameters

These were set in the set_parameters notebook.

In [2]:
# General parameters: this file sits one directory above the working directory.
gen_params_path = os.path.join("..", "data", "gen_params.json")
gen_params = cnvutils.load_params(gen_params_path)
PANCAN = gen_params["PANCAN"]
CUTOFF = gen_params["GENE_CNV_MAGNITUDE_CUTOFF"]

# Chromosome-level parameters: this file sits in the local data directory.
chr_params_path = os.path.join("data", "chr_params.json")
chr_params = cnvutils.load_params(chr_params_path)
CHROMOSOME = chr_params["CHROMOSOME"]
CANCER_TYPES = chr_params["CHR_CANCER_TYPES"]

## Load tables

In [3]:
# Only the CNV tables are requested in this notebook.
data_types = ["CNV"]
# NOTE(review): load_tables is a project helper; its result is indexed by data
# type here and by cancer type further down (cnv[cancer_type]) — confirm the
# exact return structure in cnvutils.
tables = cnvutils.load_tables(CANCER_TYPES, data_types, pancan=PANCAN)
# Keep a direct handle on the CNV tables for the counting loop below.
cnv = tables["CNV"]

Loading broadbrca v1.0...                     



  result = parse_gtf(


  result = parse_gtf(


                                                 

In [4]:
# Re-binds `cnv` to the same tables["CNV"] entry that the load cell above
# already assigned; it has no additional effect when the notebook is run
# top to bottom.
cnv = tables["CNV"]

## Get counts for genes on our chromosome

In [5]:
# Fetch the gene location table, then keep only the rows whose "chromosome"
# value equals the configured CHROMOSOME.
gene_locations = cnvutils.get_gene_locations()
on_target_chromosome = gene_locations["chromosome"] == CHROMOSOME
chr_gene_locations = gene_locations.loc[on_target_chromosome]

In [6]:
def get_gain_counts(row):
    """Return how many entries of `row` are strictly greater than CUTOFF.

    Entries for which the comparison is false (including missing values)
    are not counted. The result is a plain Python int.
    """
    is_gain = row > CUTOFF
    return int(is_gain.sum())

In [7]:
def get_loss_counts(row):
    """Return how many entries of `row` are strictly less than -CUTOFF.

    Entries for which the comparison is false (including missing values)
    are not counted. The result is a plain Python int.
    """
    is_loss = row < -CUTOFF
    return int(is_loss.sum())

In [8]:
# Build one long-format counts table per cancer type and concatenate them once
# at the end. DataFrame.append was removed in pandas 2.0, and growing a frame
# inside the loop re-copies every accumulated row on each iteration.
counts_by_cancer = []
for cancer_type in CANCER_TYPES:

    # Transpose so genes become rows; the number of columns is then recorded
    # as the patient total for this cancer type.
    gene_by_patient = cnv[cancer_type].transpose()
    num_patients = gene_by_patient.shape[1]

    # Get just our chromosome (matched on the first index level)
    on_chromosome = gene_by_patient.index.get_level_values(0).isin(
        chr_gene_locations.index.get_level_values(0)
    )
    gene_by_patient = gene_by_patient[on_chromosome]

    # Calculate counts from the patient-only matrix and store them in a fresh
    # frame. Writing them back onto the filtered slice would trigger
    # SettingWithCopyWarning and would make the loss pass scan the newly added
    # 'gain' column as if it were a patient.
    gain = gene_by_patient.apply(get_gain_counts, axis=1)
    loss = gene_by_patient.apply(get_loss_counts, axis=1)
    counts = gain.to_frame('gain').assign(loss=loss)

    # Join in locations
    counts = counts.join(chr_gene_locations)

    # One row per (gene, gain/loss) pair; the gene index is kept.
    counts = counts.melt(
        id_vars=['start_bp', 'end_bp'],
        value_vars=['gain', 'loss'],
        ignore_index=False
    )

    counts = counts.assign(
        cancer_type_total_patients=num_patients,
        cancer=cancer_type
    )

    counts_by_cancer.append(counts)

# pd.concat raises on an empty list, so keep the original empty-frame result
# when there are no cancer types to process.
cnv_long = pd.concat(counts_by_cancer) if counts_by_cancer else pd.DataFrame()

In [9]:
# Show a bounded preview rather than the whole combined table.
cnv_long.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,start_bp,end_bp,variable,value,cancer_type_total_patients,cancer
Name,Database_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AARD,ENSG00000205002.3,116938207.0,116944487.0,gain,56,122,brca
ABRA,ENSG00000174429.3,106759483.0,106770244.0,gain,55,122,brca
AC037459.4,ENSG00000248235.5,22545560.0,22548837.0,gain,16,122,brca
AC100821.2,ENSG00000280473.1,54042989.0,54045629.0,gain,26,122,brca
AC108925.1,ENSG00000279524.1,127663280.0,127670990.0,gain,56,122,brca
ADAM18,ENSG00000168619.14,39584489.0,39730065.0,gain,12,122,brca
ADAM2,ENSG00000104755.13,39743735.0,39838289.0,gain,12,122,brca
ADAM28,ENSG00000042980.11,24294069.0,24359014.0,gain,16,122,brca
ADAM32,ENSG00000197140.13,39106990.0,39284917.0,gain,10,122,brca
ADAM7,ENSG00000069206.14,24440930.0,24526970.0,gain,16,122,brca


## Save combined table

In [10]:
# Order rows by cancer type and then by gene start position, and turn the
# index levels into ordinary columns (the table is saved with index=False).
cnv_long = cnv_long.sort_values(['cancer', 'start_bp']).reset_index()

In [11]:
# Write the combined counts as a tab-separated file without the row index.
counts_path = os.path.join("data", "cnv_counts.tsv")
cnv_long.to_csv(counts_path, sep='\t', index=False)