# Create Event Tables

The first thing we need to do is create tables that indicate whether a patient has the given event or not. A table should be created for each cancer type.

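We decide this based on the summed _length_ of all genes in the event region that show amplification or deletion (whichever we're looking for), taken as a proportion of the total length of all genes in that region, rather than simply counting the number of affected genes.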

## Setup

In [1]:
import cnvutils
import json
import os
import pandas as pd

In [2]:
# Load parameters
# The three parameter files sit at different directory depths: general settings
# three levels up, chromosome settings two levels up, arm settings one level up.
general_params = cnvutils.load_params(os.path.join("..", "..", "..", "data", "gen_params.json"))
chr_params = cnvutils.load_params(os.path.join("..", "..", "data", "chr_params.json"))
arm_params = cnvutils.load_params(os.path.join("..", "data", "arm_params.json"))

# General settings shared by the whole analysis
PANCAN = general_params["PANCAN"]
INDIVIDUAL_GENE_CUTOFF = general_params["GENE_CNV_MAGNITUDE_CUTOFF"]
PROPORTION_WITH_EVENT_CUTOFF = general_params["PROPORTION_WITH_EVENT_CUTOFF"]

# Chromosome-level and arm-level settings
CHROMOSOME = chr_params["CHROMOSOME"]
CANCER_TYPES = arm_params["ARM_CANCER_TYPES"]

# Load only the CNV tables for the cancer types listed for this arm.
# NOTE(review): PANCAN is read above but pancan=False is passed here -- confirm this is intentional.
data_types = ["CNV"]
tables = cnvutils.load_tables(CANCER_TYPES, data_types, pancan=False)
cnv_tables = tables["CNV"]

                                            

## Append Gene Location Data

In [4]:
# Gene location table from cnvutils; it is joined onto each CNV table by index below.
locations = cnvutils.get_gene_locations()

In [5]:
# Attach the gene location columns to every CNV table, replacing each table in place.
for cancer_type in cnv_tables:
    # Transpose so that genes become the rows; the join below matches on the row index.
    by_gene = cnv_tables[cancer_type].transpose()

    if isinstance(by_gene.index, pd.MultiIndex):
        # The table's index is already multi-level, so join the locations table as-is.
        gene_locations = locations
    else:
        # Single-level index: drop the second index level of the locations table so the
        # indexes line up, then remove repeated rows.
        # NOTE(review): drop_duplicates compares row values, not index labels, so a gene
        # name with several distinct locations stays duplicated -- confirm this is intended.
        gene_locations = locations.droplevel(1).drop_duplicates(keep="first")

    # Rows left with any missing value after the join (e.g. no location match) are dropped.
    cnv_tables[cancer_type] = by_gene.join(gene_locations).dropna()

In [6]:
# Preview the BRCA table (genes as rows, with the location columns appended)
cnv_tables["brca"]

Unnamed: 0_level_0,Unnamed: 1_level_0,CPT000814,CPT001846,X01BR001,X01BR008,X01BR009,X01BR010,X01BR015,X01BR017,X01BR018,X01BR020,...,X20BR008,X21BR001,X21BR002,X21BR010,X22BR005,X22BR006,chromosome,start_bp,end_bp,arm
Name,Database_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A1BG,ENSG00000121410.7,-0.278,-0.179,-0.108,-0.143,0.556,-0.064,0.006,-0.159,0.359,1.769,...,0.003,0.028,-0.026,0.064,-0.199,-0.173,19,58345178.0,58353492.0,q
A1CF,ENSG00000148584.9,0.018,0.000,-0.097,0.167,-0.180,-0.260,0.014,-0.151,-0.177,0.038,...,-0.239,0.113,-0.008,-0.047,-0.003,-0.021,10,50799409.0,50885675.0,q
A2M,ENSG00000175899.10,1.143,-0.135,0.017,0.498,-0.146,0.000,-0.051,-0.172,0.579,0.355,...,0.032,0.051,-0.211,-0.334,0.127,-0.091,12,9067664.0,9116229.0,p
A2ML1,ENSG00000166535.15,1.143,-0.135,0.017,0.498,-0.146,0.000,-0.051,-0.172,0.579,0.355,...,-0.049,0.051,-0.211,-0.334,0.127,-0.091,12,8822621.0,8887001.0,p
A2MP1,ENSG00000256069.3,1.143,-0.135,0.017,0.498,-0.146,0.000,-0.051,-0.172,0.579,0.355,...,0.032,0.051,-0.211,-0.334,0.127,-0.091,12,9228533.0,9275817.0,p
A3GALT2,ENSG00000184389.8,0.157,0.302,1.288,0.151,-0.085,0.118,-0.337,-0.131,0.516,-0.259,...,-0.206,-0.311,-0.346,0.252,0.048,-0.044,1,33306766.0,33321098.0,p
A4GALT,ENSG00000128274.11,-0.404,0.725,0.098,-0.010,0.287,-0.033,-0.310,-0.119,0.331,-0.396,...,-0.023,-0.121,-0.378,-0.035,0.201,-0.055,22,42692121.0,42721298.0,q
A4GNT,ENSG00000118017.3,0.819,-0.111,-0.015,-0.048,0.683,-0.015,0.071,0.304,0.006,0.032,...,0.258,0.213,0.452,0.303,0.136,0.379,3,138123713.0,138132390.0,q
AAAS,ENSG00000094914.8,-0.411,-0.101,0.017,0.000,-0.088,0.000,0.104,-0.109,-0.500,-0.386,...,0.032,0.151,0.032,0.068,0.127,0.011,12,53307456.0,53324864.0,q
AACS,ENSG00000081760.12,0.216,-0.109,-0.253,0.035,-0.089,-0.019,-0.361,-0.109,0.076,-0.114,...,0.032,0.151,0.032,-0.282,0.127,-0.019,12,125065434.0,125143333.0,q


In [7]:
# Preview the colon table (genes as rows, with the location columns appended)
cnv_tables["colon"]

Unnamed: 0_level_0,01CO001,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,05CO002,...,21CO006,21CO007,22CO004,22CO006,24CO005,27CO004,chromosome,start_bp,end_bp,arm
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.293,-0.139,-0.104,0.365,0.063,-0.189,-0.189,0.223,0.077,-0.032,...,-0.212,0.312,-0.235,0.516,0.304,-0.076,19,58345178.0,58353492.0,q
A1BG-AS1,-0.293,-0.139,-0.104,0.365,0.063,-0.189,-0.189,0.223,0.077,-0.032,...,-0.212,0.312,-0.235,0.516,0.304,-0.076,19,58347718.0,58355455.0,q
A1CF,0.297,-0.338,-0.006,-0.033,-0.172,-0.017,-0.561,0.052,0.045,-0.035,...,-0.023,-0.011,0.020,0.083,0.014,0.074,10,50799409.0,50885675.0,q
A2M,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.045,...,0.403,0.503,0.049,0.014,-0.069,0.007,12,9067664.0,9116229.0,p
A2M-AS1,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.045,...,0.118,0.503,0.049,0.014,-0.069,0.007,12,9065163.0,9068689.0,p
A2ML1,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.320,...,0.118,0.503,-0.324,0.014,-0.069,0.007,12,8822621.0,8887001.0,p
A2MP1,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.045,...,0.403,0.503,0.049,0.014,-0.069,0.007,12,9228533.0,9275817.0,p
A3GALT2,-0.008,-0.069,0.049,-0.030,-0.500,0.054,-0.150,0.159,0.076,-0.341,...,-0.015,-0.003,0.022,0.015,-0.315,-0.442,1,33306766.0,33321098.0,p
A4GALT,-0.286,-0.433,-0.038,-0.061,0.015,-0.144,-0.153,-0.191,0.051,-0.518,...,-0.164,0.256,-0.328,-0.068,-0.608,-0.549,22,42692121.0,42721298.0,q
A4GNT,0.002,0.003,0.021,0.415,-0.015,-0.154,0.090,0.247,-0.042,0.031,...,0.056,-0.023,-0.382,0.010,-0.198,-0.089,3,138123713.0,138132390.0,q


## Get percentage with event

In [8]:
def find_events(chr_df, event_start, event_end, loss_or_gain, gene_cutoff=None, proportion_cutoff=None):
    """Flag which samples carry a gain or loss event across a chromosomal region.

    A gene counts as altered in a sample when its CNV value is at least
    ``gene_cutoff`` (for "gain") or at most ``-gene_cutoff`` (for "loss"). For each
    sample, the lengths of its altered genes are summed and divided by the total
    length of all genes in the region; the sample has the event when that
    proportion is at least ``proportion_cutoff``.

    Parameters:
    chr_df (pandas.DataFrame): One row per gene. Must contain the columns
        "chromosome", "start_bp", "end_bp" and "arm"; every other column is
        treated as a sample holding CNV values.
    event_start: Start coordinate of the event region. Only genes whose start_bp
        is strictly greater than this are included.
    event_end: End coordinate of the event region. Only genes whose end_bp is
        strictly less than this are included.
    loss_or_gain (str): Either "gain" or "loss".
    gene_cutoff (optional): Magnitude a gene's CNV value must reach to count as
        altered. Defaults to the module-level INDIVIDUAL_GENE_CUTOFF.
    proportion_cutoff (optional): Minimum altered proportion of the region's total
        gene length. Defaults to the module-level PROPORTION_WITH_EVENT_CUTOFF.

    Returns:
    pandas.Series: Boolean values indexed by sample column, True where the sample
        has the event. If no genes fall inside the region the proportion is
        undefined (NaN), so every sample is reported as False.

    Raises:
    ValueError: If loss_or_gain is neither "gain" nor "loss".
    """
    # Fail fast on a bad event type, before doing any slicing work
    if loss_or_gain not in ("gain", "loss"):
        raise ValueError(f"Invalid input '{loss_or_gain}' for loss_or_gain parameter")

    # Fall back to the notebook-wide cutoffs unless the caller supplied their own
    if gene_cutoff is None:
        gene_cutoff = INDIVIDUAL_GENE_CUTOFF
    if proportion_cutoff is None:
        proportion_cutoff = PROPORTION_WITH_EVENT_CUTOFF

    # Slice out just the genes lying strictly inside the event region
    in_event = (chr_df["start_bp"] > event_start) & (chr_df["end_bp"] < event_end)
    event_df = chr_df[in_event]

    # Record gene lengths, then keep only the per-sample CNV columns
    gene_lengths = event_df["end_bp"] - event_df["start_bp"]
    cnv_values = event_df.drop(columns=["chromosome", "start_bp", "end_bp", "arm"])

    # Binarize: 1 where the gene passes the cutoff in the requested direction
    if loss_or_gain == "gain":
        altered = cnv_values.ge(gene_cutoff).astype(int)
    else:
        altered = cnv_values.le(-gene_cutoff).astype(int)

    # Weight each altered gene by its length and total the result per sample
    altered_length = altered.multiply(gene_lengths, axis="index").sum(axis="index")

    # Proportion of the region's total gene length that is altered in each sample
    event_coding_length = gene_lengths.sum()
    proportions = altered_length / event_coding_length

    return proportions >= proportion_cutoff

In [9]:
# Build and save a per-sample event table for each cancer type.
# NOTE(review): event_metadata is not defined in the cells visible in this export
# (cell In [3] is absent) -- confirm it is loaded before this cell runs.
for cancer_type, df in cnv_tables.items():

    # Subset to the chromosome named in chr_params.json rather than a hard-coded value.
    # The chromosome column is compared as a string, so CHROMOSOME is cast to str.
    chr_df = df[df["chromosome"] == str(CHROMOSOME)]

    # Find events: boolean per sample, True where the sample has the event
    has_event = find_events(chr_df, event_metadata["START"], event_metadata["END"], event_metadata["TYPE"])

    # Create event table
    event_table = pd.DataFrame({'event': has_event})

    # Write to a tab-separated file named after the cancer type
    event_table.to_csv(os.path.join("..", "data", f'{cancer_type}_has_event.tsv'), sep='\t')