# Create Event Tables

The first thing we need to do is create tables that indicate whether a patient has the given event or not. A table should be created for each cancer type.

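We call events based on the summed _length_ of all genes with amplification or deletion (whichever we're looking for), not just on the number of such genes: a patient has the event when that summed length is at least a cutoff proportion of the total length of all genes within the event's boundaries.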

## Setup

In [1]:
import cnvutils
import json
import os
import pandas as pd

In [2]:
# Load parameters
# General parameters: data source flag and the two cutoffs used to call events
gen_params_path = os.path.join("..", "..", "..", "data", "gen_params.json")
gen_params = cnvutils.load_params(gen_params_path)
PANCAN, INDIVIDUAL_GENE_CUTOFF, PROPORTION_WITH_EVENT_CUTOFF = (
    gen_params[key]
    for key in ("PANCAN", "GENE_CNV_MAGNITUDE_CUTOFF", "PROPORTION_WITH_EVENT_CUTOFF")
)

# Chromosome-level parameters
chr_params_path = os.path.join("..", "..", "data", "chr_params.json")
chr_params = cnvutils.load_params(chr_params_path)
CHROMOSOME = chr_params["CHROMOSOME"]

# Arm-level parameters: which arm, which cancer types, and the event's bounds and type
arm_params_path = os.path.join("..", "data", "arm_params.json")
arm_params = cnvutils.load_params(arm_params_path)
ARM, CANCER_TYPES, EVENT_START, EVENT_END, EVENT_TYPE = (
    arm_params[key]
    for key in ("ARM", "ARM_CANCER_TYPES", "START", "END", "TYPE")
)

# Only the CNV tables are needed to call events
data_types = ["CNV"]
tables = cnvutils.load_tables(CANCER_TYPES, data_types, pancan=PANCAN)
cnv_tables = tables["CNV"]

Loading broadbrca v1.0...       



  result = parse_gtf(


  result = parse_gtf(


                                         

## Append Gene Location Data

In [3]:
# Gene location table from cnvutils; it is joined onto each transposed CNV table below.
# NOTE(review): presumably indexed by (Name, Database_ID), since the join code drops
# index level 1 for tables without a MultiIndex -- confirm against cnvutils.
locations = cnvutils.get_gene_locations()

In [4]:
# Transpose each CNV table so genes are rows, then attach the location columns to every gene.
for cancer_type in cnv_tables:
    genes_by_row = cnv_tables[cancer_type].transpose()

    if isinstance(genes_by_row.index, pd.MultiIndex):
        # The table's index already has multiple levels, so join on the full locations index.
        gene_locations = locations
    else:
        # Single-level index: drop the second level of the locations index before joining.
        # NOTE(review): drop_duplicates() compares the location column values, not the
        # index, so repeated gene names with differing locations are all kept -- confirm
        # this is the intended de-duplication.
        gene_locations = locations.droplevel(1).drop_duplicates(keep="first")

    # Rows containing any missing value (including a missing location) are discarded.
    cnv_tables[cancer_type] = genes_by_row.join(gene_locations).dropna()

In [5]:
# Preview the "brca" table after the gene-location join
cnv_tables["brca"]

Unnamed: 0_level_0,Unnamed: 1_level_0,01BR001,01BR008,01BR009,01BR010,01BR015,01BR017,01BR018,01BR020,01BR023,01BR025,...,21BR002,21BR010,22BR005,22BR006,CPT000814,CPT001846,chromosome,start_bp,end_bp,arm
Name,Database_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A1BG,ENSG00000121410.10,-0.07265,-0.06644,0.28468,0.00370,0.09610,-0.11901,0.07420,0.89563,-0.08158,0.00226,...,0.00071,0.09361,-0.08688,-0.12214,-0.28279,-0.33676,19,58345178.0,58353492.0,q
A1CF,ENSG00000148584.13,-0.00966,0.11550,-0.14743,-0.14666,0.13746,-0.05210,-0.08343,0.11000,0.13422,0.01699,...,0.02218,0.01611,0.14103,-0.01817,0.19699,0.00001,10,50799409.0,50885675.0,q
A2M,ENSG00000175899.13,-0.11622,0.31413,-0.26372,-0.00332,0.03751,-0.10123,0.23769,0.28245,0.00930,-0.00168,...,-0.01421,-0.28076,0.07224,-0.08264,0.78773,-0.19285,12,9067664.0,9116229.0,p
A2ML1,ENSG00000166535.18,-0.11622,0.31413,-0.26372,-0.00332,0.03751,-0.10123,0.23769,0.28245,0.00930,-0.00168,...,-0.01421,-0.28076,0.07224,-0.08264,0.78773,-0.19285,12,8822621.0,8887001.0,p
A3GALT2,ENSG00000184389.9,0.81754,0.08417,-0.13078,0.03008,-0.17893,-0.08954,0.29744,-0.07197,-0.40016,-0.00043,...,-0.28810,0.20480,0.19114,0.00284,0.28838,0.22849,1,33306766.0,33321098.0,p
A4GNT,ENSG00000118017.3,-0.06602,-0.09075,0.26289,-0.00744,0.09962,0.20366,-0.10268,-0.08447,-0.04538,0.00382,...,0.24626,0.14542,0.04622,0.16954,0.49232,-0.23250,3,138123713.0,138132390.0,q
AAAS,ENSG00000094914.11,-0.00563,-0.02567,-0.22827,-0.00332,0.12874,-0.10123,-0.54648,-0.33084,0.00930,-0.00168,...,-0.01421,-0.07210,0.07224,-0.08264,-0.21862,-0.19285,12,53307456.0,53324864.0,q
AACS,ENSG00000081760.15,-0.16064,-0.02567,-0.17576,-0.00332,-0.21528,-0.10123,-0.08692,-0.04714,0.00930,-0.00168,...,-0.01421,-0.27232,0.07224,-0.08264,0.26911,-0.19285,12,125065434.0,125143333.0,q
AADAC,ENSG00000114771.12,-0.06602,-0.09075,0.26289,-0.00744,0.09962,0.37290,0.20489,-0.08447,-0.04538,0.00382,...,0.24626,0.14542,0.04622,0.16954,0.49232,-0.23250,3,151814073.0,151828488.0,q
AADACL2,ENSG00000197953.5,-0.06602,-0.09075,0.26289,-0.00744,0.09962,0.37290,0.20489,-0.08447,-0.04538,0.00382,...,0.24626,0.14542,0.04622,0.16954,0.49232,-0.23250,3,151733916.0,151761339.0,q


In [6]:
# Preview the "coad" table after the gene-location join
cnv_tables["coad"]

Unnamed: 0_level_0,Unnamed: 1_level_0,01CO001,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,05CO002,...,21CO006,21CO007,22CO004,22CO006,24CO005,27CO004,chromosome,start_bp,end_bp,arm
Name,Database_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A1BG,ENSG00000121410.10,-0.28369,-0.10928,0.01686,0.14151,0.04654,-0.05172,-0.10882,-0.16253,-0.04239,0.05655,...,-0.09694,0.13708,-0.09947,0.09818,0.28880,0.00404,19,58345178.0,58353492.0,q
A1CF,ENSG00000148584.13,-0.00158,-0.27924,0.00518,-0.11407,-0.12382,-0.02054,-0.45875,-0.06762,0.00225,-0.09252,...,-0.05675,-0.13989,0.01844,0.00323,-0.05403,0.04800,10,50799409.0,50885675.0,q
A2M,ENSG00000175899.13,-0.02329,-0.67630,-0.00247,0.14100,-0.11993,0.01595,-0.00839,-0.03173,0.05491,-0.03927,...,0.12843,0.20379,0.01082,0.00318,-0.00720,0.04438,12,9067664.0,9116229.0,p
A2ML1,ENSG00000166535.18,-0.02329,-0.67630,-0.00247,0.14100,-0.11993,0.01595,-0.00839,-0.03173,0.05491,-0.03927,...,0.12843,0.20379,0.01082,0.00318,-0.00720,0.04438,12,8822621.0,8887001.0,p
A3GALT2,ENSG00000184389.9,-0.02078,-0.06621,0.00305,-0.11297,-0.37323,-0.02026,-0.08876,-0.07351,0.01716,-0.35915,...,-0.09657,-0.18605,0.00958,-0.00636,-0.08518,-0.37611,1,33306766.0,33321098.0,p
A4GALT,ENSG00000128274.14,-0.22379,-0.36262,0.00606,-0.15050,0.00807,-0.04322,-0.16316,-0.14542,-0.00398,-0.34570,...,-0.09322,0.07245,-0.04670,-0.01327,-0.32180,-0.38854,22,42692121.0,42721298.0,q
A4GNT,ENSG00000118017.3,0.00151,-0.00765,0.00211,0.16483,-0.02428,-0.00026,0.05699,0.33893,0.03392,0.03081,...,-0.01751,-0.12331,0.02966,0.01109,-0.01906,0.05550,3,138123713.0,138132390.0,q
AAAS,ENSG00000094914.11,-0.02329,-0.29858,-0.00247,0.14100,-0.11993,0.01595,-0.00839,-0.02421,0.05491,-0.03927,...,0.12843,-0.13139,0.01082,0.00318,-0.00720,0.04438,12,53307456.0,53324864.0,q
AACS,ENSG00000081760.15,-0.02329,-0.36438,-0.00247,0.14100,-0.11993,0.01595,-0.00839,-0.15549,-0.02610,-0.03927,...,0.00757,0.13431,0.01082,0.00318,-0.09636,0.04438,12,125065434.0,125143333.0,q
AADAC,ENSG00000114771.12,0.00151,-0.00765,0.00211,0.16483,-0.02428,-0.00026,0.05699,0.33893,0.03392,0.03081,...,-0.01751,-0.12331,0.02966,0.01109,-0.01906,0.05550,3,151814073.0,151828488.0,q


## Get proportion with event

In [7]:
def find_events(chr_df, event_start, event_end, loss_or_gain, *, gene_cutoff=None, proportion_cutoff=None):
    """Call, for every sample, whether it carries the given gain or loss event.

    A gene counts as altered in a sample when its value is >= the per-gene
    cutoff (for "gain") or <= the negated cutoff (for "loss"). Altered genes are
    weighted by their length (end_bp - start_bp), and a sample has the event
    when its altered length is at least ``proportion_cutoff`` of the summed
    length of all genes inside the event bounds.

    Parameters
    ----------
    chr_df : pandas.DataFrame
        One row per gene. Must contain the columns "chromosome", "start_bp",
        "end_bp" and "arm"; every other column is treated as one sample's
        values.
    event_start, event_end : number
        Event bounds. Only genes with start_bp > event_start and
        end_bp < event_end (both comparisons strict) are considered.
    loss_or_gain : str
        Either "gain" or "loss".
    gene_cutoff : number, optional
        Per-gene magnitude cutoff. Defaults to the notebook-level
        INDIVIDUAL_GENE_CUTOFF.
    proportion_cutoff : number, optional
        Minimum altered proportion for a sample to have the event. Defaults to
        the notebook-level PROPORTION_WITH_EVENT_CUTOFF.

    Returns
    -------
    pandas.DataFrame
        Indexed by the sample columns of ``chr_df``, with an "event" column
        (whether the proportion reaches the cutoff) and a "proportion" column.
        If no gene lies inside the bounds the total length is zero, so the
        proportions are undefined (NaN) and no sample is called.

    Raises
    ------
    ValueError
        If ``loss_or_gain`` is neither "gain" nor "loss".
    """
    # Reject a bad event type up front, before doing any work on the table
    if loss_or_gain not in ("gain", "loss"):
        raise ValueError(f"Invalid input '{loss_or_gain}' for loss_or_gain parameter")

    # Fall back to the notebook-wide cutoffs only when no override was given
    if gene_cutoff is None:
        gene_cutoff = INDIVIDUAL_GENE_CUTOFF
    if proportion_cutoff is None:
        proportion_cutoff = PROPORTION_WITH_EVENT_CUTOFF

    # Slice out just the genes lying strictly within the event bounds
    in_event = (chr_df.start_bp > event_start) & (chr_df.end_bp < event_end)
    event_df = chr_df[in_event]

    # Record gene lengths, then keep only the per-sample columns
    gene_lengths = event_df["end_bp"] - event_df["start_bp"]
    event_df = event_df.drop(columns=["chromosome", "start_bp", "end_bp", "arm"])

    # Binarize: 1 where the gene passes the cutoff in the requested direction
    if loss_or_gain == "gain":
        bin_df = event_df.ge(gene_cutoff).astype(int)
    else:
        bin_df = event_df.le(-gene_cutoff).astype(int)

    # Weight each altered gene by its length (row-wise alignment on the gene index)
    scaled_df = bin_df.multiply(gene_lengths, axis="index")

    # Altered length per sample as a fraction of the total gene length in the event
    event_coding_length = gene_lengths.sum()
    proportions = scaled_df.sum(axis="index") / event_coding_length

    return pd.DataFrame({
        "event": proportions >= proportion_cutoff,
        "proportion": proportions,
    })

In [8]:
# For every cancer type: call events on the chromosome of interest and save the result.
for cancer_type, cancer_cnv in cnv_tables.items():

    # Keep only the genes on the chromosome of interest
    on_chromosome = cancer_cnv[cancer_cnv.chromosome == CHROMOSOME]

    # Per-sample event calls and altered proportions
    event_table = find_events(on_chromosome, EVENT_START, EVENT_END, EVENT_TYPE)

    # Save as a tab-separated file named after the chromosome, arm, cancer type and data source
    out_name = f"chr{CHROMOSOME:0>2}{ARM}_{cancer_type}_has_event_{'harmonized' if PANCAN else 'AWG'}.tsv"
    out_path = os.path.join("..", "data", out_name)
    event_table.to_csv(out_path, sep='\t')