# Create Event Tables

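This notebook creates one table per cancer type indicating, for each patient sample, whether that sample has the chromosome gain event and/or the chromosome loss event defined in the Setup section below. Each table is written to a tab-separated file named `<cancer_type>_has_event.tsv`.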

## Setup

In [1]:
import cptac
import pandas as pd
import cnvutils

In [2]:
# Chromosome on which the gain and loss events are located
CHROMOSOME = "8"

# Minimum magnitude of a gene's log ratio CNV value for that gene to count
# as gained (value >= cutoff) or lost (value <= -cutoff)
INDIVIDUAL_GENE_CUTOFF = 0.2

# Minimum fraction of an event region that must be gained (or lost) in a
# sample for the sample to be called as having the event. The fraction is
# weighted by gene length: altered gene length / total gene length in region.
PROPORTION_WITH_EVENT_CUTOFF = 0.8

# Base-pair boundaries of the gain event region
GAIN_EVENT_START = 52_110_839
GAIN_EVENT_END = 145_052_465

# Base-pair boundaries of the loss event region
LOSS_EVENT_START = 202_660
LOSS_EVENT_END = 37_421_341

In [3]:
# Cancer types to evaluate for this event. Each entry pairs the label used in
# the output file name with the cptac dataset class that supplies its CNV table.
cnv_tables = {
    cancer_label: dataset_class().get_CNV()
    for cancer_label, dataset_class in (
        ("brca", cptac.Brca),
        ("colon", cptac.Colon),
        ("hnscc", cptac.Hnscc),
        ("lscc", cptac.Lscc),
        ("luad", cptac.Luad),
        ("ovarian", cptac.Ovarian),
    )
}

Checking that luad index is up-to-date... 



                                            

## Append Gene Location Data

In [4]:
# Gene location table from the project's cnvutils helper. The cells below join
# it onto the CNV tables by index and later read its chromosome, start_bp,
# end_bp and arm columns.
# NOTE(review): presumably indexed by (Name, Database_ID), since the join code
# drops index level 1 to match tables keyed by gene name only -- confirm.
locations = cnvutils.get_gene_locations()

In [5]:
# Attach gene location columns to every CNV table, leaving each table with
# genes as rows and samples (plus the location columns) as columns.

# Some CNV tables are keyed by gene name only. For those, collapse the location
# table to one row per gene name, keeping the first. The index must be
# deduplicated explicitly: DataFrame.drop_duplicates compares row *values* and
# ignores the index, so it leaves repeated gene names in place (which multiplies
# rows in the join) and can discard distinct genes that share coordinates.
locations_by_name = locations.droplevel(1)
locations_by_name = locations_by_name[~locations_by_name.index.duplicated(keep="first")]

for cancer_type, df in cnv_tables.items():

    # This cell overwrites cnv_tables in place. Skip tables that already carry
    # the location columns so re-running the cell does not transpose and join
    # them a second time.
    if "chromosome" in df.columns:
        continue

    # The raw tables have genes as columns; flip them so genes index the rows.
    df = df.transpose()

    if isinstance(df.index, pd.MultiIndex):
        # Table is keyed the same way as the location table: join directly.
        new_df = df.join(locations)
    else:
        new_df = df.join(locations_by_name)

    # Drop rows that are empty across every column.
    cnv_tables[cancer_type] = new_df.dropna(how="all")

In [6]:
# Inspect the BRCA table after the join: genes are rows, samples are columns,
# and the chromosome/start_bp/end_bp/arm columns are appended on the right.
cnv_tables["brca"]

Unnamed: 0_level_0,Unnamed: 1_level_0,CPT000814,CPT001846,X01BR001,X01BR008,X01BR009,X01BR010,X01BR015,X01BR017,X01BR018,X01BR020,...,X20BR008,X21BR001,X21BR002,X21BR010,X22BR005,X22BR006,chromosome,start_bp,end_bp,arm
Name,Database_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
7SK,ENSG00000232512.2,-0.058,0.065,1.036,0.090,0.375,0.211,-0.086,0.192,-0.038,1.117,...,0.157,-0.069,-0.443,0.238,0.015,0.365,,,,
7SK,ENSG00000249352.3,-0.496,-0.126,-0.309,0.372,-0.135,-0.020,0.000,0.074,-0.519,-0.902,...,0.000,-0.109,0.408,0.282,0.072,-0.006,,,,
7SK,ENSG00000254144.2,0.466,0.148,1.265,0.602,3.657,0.284,1.316,-0.060,1.672,0.777,...,0.043,1.047,0.053,0.012,0.160,0.335,,,,
7SK,ENSG00000260682.2,0.058,-0.197,0.060,-0.060,-0.122,0.003,-0.370,0.118,-0.058,-0.025,...,0.004,-0.259,-0.458,-0.324,-0.290,-0.445,,,,
7SK,ENSG00000271765.1,0.453,-0.132,0.020,0.115,-0.138,-0.237,-0.391,-0.278,1.314,0.486,...,-0.219,0.136,-0.061,-0.003,0.083,0.027,,,,
7SK,ENSG00000271818.1,-1.019,0.273,-0.058,-0.091,0.104,0.190,0.357,0.034,-0.525,-0.055,...,-0.154,-0.289,-0.009,-0.016,-0.291,-0.001,,,,
A1BG,ENSG00000121410.7,-0.396,-0.187,-0.111,0.059,0.661,-0.007,0.071,-0.178,0.250,2.034,...,0.102,-0.183,0.098,0.203,0.029,-0.029,19,58345178.0,58353492.0,q
A1CF,ENSG00000148584.9,0.006,0.000,-0.051,0.238,-0.126,-0.229,0.046,0.004,0.001,-0.047,...,-0.206,0.060,-0.004,-0.008,0.079,0.052,10,50799409.0,50885675.0,q
A2M,ENSG00000175899.10,1.089,-0.146,-0.132,0.509,-0.126,-0.004,-0.390,-0.165,0.538,0.454,...,-0.033,0.104,0.008,-0.281,0.075,-0.012,12,9067664.0,9116229.0,p
A2ML1,ENSG00000166535.15,1.089,-0.146,-0.132,0.509,-0.126,-0.004,-0.007,-0.165,0.538,0.454,...,-0.033,0.104,0.008,-0.281,0.075,-0.012,12,8822621.0,8887001.0,p


In [7]:
# Inspect the colon table after the join. Unlike BRCA, its rows are keyed by
# gene name only.
cnv_tables["colon"]

Unnamed: 0_level_0,01CO001,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,05CO002,...,21CO006,21CO007,22CO004,22CO006,24CO005,27CO004,chromosome,start_bp,end_bp,arm
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.293,-0.139,-0.104,0.365,0.063,-0.189,-0.189,0.223,0.077,-0.032,...,-0.212,0.312,-0.235,0.516,0.304,-0.076,19,58345178.0,58353492.0,q
A1BG-AS1,-0.293,-0.139,-0.104,0.365,0.063,-0.189,-0.189,0.223,0.077,-0.032,...,-0.212,0.312,-0.235,0.516,0.304,-0.076,19,58347718.0,58355455.0,q
A1CF,0.297,-0.338,-0.006,-0.033,-0.172,-0.017,-0.561,0.052,0.045,-0.035,...,-0.023,-0.011,0.020,0.083,0.014,0.074,10,50799409.0,50885675.0,q
A2M,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.045,...,0.403,0.503,0.049,0.014,-0.069,0.007,12,9067664.0,9116229.0,p
A2M-AS1,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.045,...,0.118,0.503,0.049,0.014,-0.069,0.007,12,9065163.0,9068689.0,p
A2ML1,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.320,...,0.118,0.503,-0.324,0.014,-0.069,0.007,12,8822621.0,8887001.0,p
A2MP1,0.000,-0.819,0.070,0.399,-0.171,0.033,-0.152,0.004,0.025,-0.045,...,0.403,0.503,0.049,0.014,-0.069,0.007,12,9228533.0,9275817.0,p
A3GALT2,-0.008,-0.069,0.049,-0.030,-0.500,0.054,-0.150,0.159,0.076,-0.341,...,-0.015,-0.003,0.022,0.015,-0.315,-0.442,1,33306766.0,33321098.0,p
A4GALT,-0.286,-0.433,-0.038,-0.061,0.015,-0.144,-0.153,-0.191,0.051,-0.518,...,-0.164,0.256,-0.328,-0.068,-0.608,-0.549,22,42692121.0,42721298.0,q
A4GNT,0.002,0.003,0.021,0.415,-0.015,-0.154,0.090,0.247,-0.042,0.031,...,0.056,-0.023,-0.382,0.010,-0.198,-0.089,3,138123713.0,138132390.0,q


## Get percentage with event

In [8]:
def find_events(chr_df, event_start, event_end, loss_or_gain,
                gene_cutoff=None, proportion_cutoff=None):
    """Flag which samples carry a gain or loss event over a base-pair range.

    A gene counts as altered in a sample when its CNV value is >= gene_cutoff
    (for "gain") or <= -gene_cutoff (for "loss"). For each sample, the lengths
    of its altered genes are summed and divided by the total length of all
    genes in the range; the sample has the event when that proportion is at
    least proportion_cutoff.

    Parameters:
    chr_df (pandas.DataFrame): One row per gene. Must contain the columns
        'chromosome', 'start_bp', 'end_bp' and 'arm'; every other column is
        treated as a sample holding that gene's CNV value.
    event_start (int): Genes must start strictly after this position.
    event_end (int): Genes must end strictly before this position.
    loss_or_gain (str): Either "gain" or "loss".
    gene_cutoff (float, optional): Per-gene CNV magnitude cutoff. Defaults to
        the notebook-level INDIVIDUAL_GENE_CUTOFF.
    proportion_cutoff (float, optional): Minimum altered proportion of the
        range. Defaults to the notebook-level PROPORTION_WITH_EVENT_CUTOFF.

    Returns:
    pandas.Series: Boolean values indexed by sample column, True where the
        sample has the event. All False when no gene lies within the range.

    Raises:
    ValueError: If loss_or_gain is neither "gain" nor "loss".
    """
    # Reject a bad direction before doing any work
    if loss_or_gain not in ("gain", "loss"):
        raise ValueError(f"Invalid input '{loss_or_gain}' for loss_or_gain parameter")

    # Fall back to the notebook-wide settings only when no override was given
    if gene_cutoff is None:
        gene_cutoff = INDIVIDUAL_GENE_CUTOFF
    if proportion_cutoff is None:
        proportion_cutoff = PROPORTION_WITH_EVENT_CUTOFF

    # Keep only the genes lying entirely inside the event range.
    # NOTE(review): both comparisons are strict, so a gene starting exactly at
    # event_start or ending exactly at event_end is excluded -- confirm that
    # this matches how the event boundaries were chosen.
    in_event = (chr_df["start_bp"] > event_start) & (chr_df["end_bp"] < event_end)
    event_df = chr_df[in_event]

    # Record gene lengths, then keep only the per-sample CNV columns
    gene_lengths = event_df["end_bp"] - event_df["start_bp"]
    cnv_values = event_df.drop(columns=['chromosome', 'start_bp', 'end_bp', 'arm'])

    # With no genes (or zero total length) the proportion is undefined, so
    # report explicitly that no sample has the event
    event_coding_length = gene_lengths.sum()
    if event_coding_length == 0:
        return pd.Series(False, index=cnv_values.columns)

    # Mark each gene/sample value as altered or not. Missing CNV values compare
    # as False, so they count as unaltered but still add to the total length.
    if loss_or_gain == "gain":
        is_altered = cnv_values.ge(gene_cutoff)
    else:
        is_altered = cnv_values.le(-gene_cutoff)

    # Weight each altered gene by its length and total per sample
    altered_length = is_altered.astype(int).multiply(gene_lengths, axis="index").sum(axis="index")

    # Compare the altered share of the region's gene length to the cutoff
    proportions = altered_length / event_coding_length
    return proportions >= proportion_cutoff

In [9]:
for cancer_type, cnv_with_locations in cnv_tables.items():

    # Restrict the table to genes on the chromosome of interest
    on_chromosome = cnv_with_locations["chromosome"] == CHROMOSOME
    df = cnv_with_locations.loc[on_chromosome]

    # Per-sample calls for the gain region and for the loss region
    gain_event = find_events(df, GAIN_EVENT_START, GAIN_EVENT_END, "gain")
    loss_event = find_events(df, LOSS_EVENT_START, LOSS_EVENT_END, "loss")

    # One row per sample, one boolean column per event
    event_table = pd.DataFrame({
        'gain_event': gain_event,
        'loss_event': loss_event,
    })

    # Save as a tab-separated file named after the cancer type
    event_table.to_csv(f'{cancer_type}_has_event.tsv', sep='\t')