# Make counts table

This notebook counts, for each cancer type, how many patients have a CNV event (a gain/amplification or a loss/deletion) at each gene on the selected chromosome.

In [1]:
import cnvutils
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import os
import pandas as pd
import seaborn as sns

## Import parameters

These were set in the set_parameters notebook.

In [2]:
# Read the shared parameters file (set in the set_parameters notebook)
params_path = os.path.join("data", "params.json")
params = cnvutils.load_params(params_path)

# Pull out the settings this notebook uses
CHROMOSOME, CUTOFF, CANCER_TYPES = (
    params[key]
    for key in ("CHROMOSOME", "GENE_CNV_MAGNITUDE_CUTOFF", "CANCER_TYPES")
)

## Load tables

In [3]:
# Only the CNV tables are needed to build the counts
data_types = ["CNV"]

tables = cnvutils.load_tables(
    CANCER_TYPES,
    data_types,
    pancan=True,
)

# Indexed by cancer type further down, when the counts are calculated
cnv = tables["CNV"]

                                                 

In [4]:
# NOTE(review): this repeats the `cnv = tables["CNV"]` assignment already made in the
# previous cell; it rebinds the same object and has no additional effect.
cnv = tables["CNV"]

## Get counts for genes on our chromosome

In [5]:
# Load the gene location table and keep only the rows for our chromosome
gene_locations = cnvutils.get_gene_locations()
on_selected_chromosome = gene_locations["chromosome"] == CHROMOSOME
chr_gene_locations = gene_locations.loc[on_selected_chromosome]

In [6]:
def get_gain_counts(row, cutoff=None):
    """Count the values in a row that are above the gain cutoff.

    Parameters
    ----------
    row : pandas.Series
        Numeric values to test. In this notebook it is one row of the
        transposed CNV table, passed in by `DataFrame.apply(..., axis=1)`.
    cutoff : float, optional
        Threshold a value must strictly exceed to be counted. When omitted,
        the notebook-level `CUTOFF` parameter is used.

    Returns
    -------
    int
        Number of values strictly greater than `cutoff`. Missing values
        compare as False and are therefore not counted.
    """
    if cutoff is None:
        cutoff = CUTOFF

    # Summing the boolean mask counts the True entries without building
    # a filtered copy of the row.
    return int((row > cutoff).sum())

In [7]:
def get_loss_counts(row, cutoff=None):
    """Count the values in a row that are below the negated loss cutoff.

    Parameters
    ----------
    row : pandas.Series
        Numeric values to test. In this notebook it is one row of the
        transposed CNV table, passed in by `DataFrame.apply(..., axis=1)`.
    cutoff : float, optional
        Magnitude threshold; a value is counted when it is strictly less
        than `-cutoff`. When omitted, the notebook-level `CUTOFF` parameter
        is used.

    Returns
    -------
    int
        Number of values strictly less than `-cutoff`. Missing values
        compare as False and are therefore not counted.
    """
    if cutoff is None:
        cutoff = CUTOFF

    # Summing the boolean mask counts the True entries without building
    # a filtered copy of the row.
    return int((row < -cutoff).sum())

In [8]:
# Build one long-format counts table per cancer type and concatenate them once
# at the end. DataFrame.append was removed in pandas 2.0, and appending inside
# the loop re-copies the accumulated table on every iteration.
chr_gene_names = chr_gene_locations.index.get_level_values(0)

per_cancer_counts = []
for cancer_type in CANCER_TYPES:

    # Transpose so that each column is one patient
    gene_by_patient = cnv[cancer_type].transpose()
    num_patients = gene_by_patient.shape[1]

    # Get just our chromosome
    on_chromosome = gene_by_patient.index.get_level_values(0).isin(chr_gene_names)
    gene_by_patient = gene_by_patient[on_chromosome]

    # Calculate counts. Both counts are computed from the patient-only table and
    # stored in a separate frame, so the 'gain' column is never fed into the
    # loss count and nothing is written into a filtered slice.
    counts = pd.DataFrame({
        'gain': gene_by_patient.apply(get_gain_counts, axis=1),
        'loss': gene_by_patient.apply(get_loss_counts, axis=1),
    })

    # Join in locations
    counts = counts.join(chr_gene_locations)

    # One row per (gene, gain/loss), keeping the gene index
    long_counts = counts.melt(
        id_vars=['start_bp', 'end_bp'],
        value_vars=['gain', 'loss'],
        ignore_index=False
    )

    long_counts = long_counts.assign(
        cancer_type_total_patients=num_patients,
        cancer=cancer_type
    )

    per_cancer_counts.append(long_counts)

# pd.concat raises on an empty list, so fall back to an empty table in that case
cnv_long = pd.concat(per_cancer_counts) if per_cancer_counts else pd.DataFrame()

In [9]:
# Preview the first rows rather than rendering the whole combined table
cnv_long.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,start_bp,end_bp,variable,value,cancer_type_total_patients,cancer
Name,Database_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AASS,ENSG00000008311.13,122073549.0,122144255.0,gain,10,122,brca
ABCA13,ENSG00000179869.13,48171458.0,48647497.0,gain,11,122,brca
ABCB1,ENSG00000085563.13,87503017.0,87713323.0,gain,12,122,brca
ABCB4,ENSG00000005471.14,87401697.0,87480435.0,gain,12,122,brca
ABCB5,ENSG00000004846.15,20615207.0,20777038.0,gain,12,122,brca
ABCB8,ENSG00000197150.11,151028422.0,151047782.0,gain,7,122,brca
ABCF2,ENSG00000033050.6,151207837.0,151227166.0,gain,7,122,brca
ABHD11,ENSG00000106077.17,73736094.0,73738867.0,gain,8,122,brca
AC010655.1,ENSG00000279916.1,128433422.0,128433713.0,gain,11,122,brca
AC073072.1,ENSG00000281889.1,22773646.0,22773993.0,gain,13,122,brca


## Save combined table

In [10]:
# Order the rows by cancer type and then by start position, and turn the
# index levels into ordinary columns so they are kept when the table is
# written out without its index.
cnv_long = (
    cnv_long
    .sort_values(by=['cancer', 'start_bp'])
    .reset_index()
)

In [11]:
# Write the combined counts as a tab-separated file in the data directory
counts_path = os.path.join("data", "cnv_counts_pancan.tsv")
cnv_long.to_csv(counts_path, sep="\t", index=False)