# Create Event Tables

The first thing we need to do is create tables that indicate whether each patient has a given event (gain or loss) or not. A table should be created for each cancer type.

## Setup

In [1]:
import pandas as pd
import cnvutils
import os

In [2]:
def read_cancer_type_tcga(cancer_type):
    """Load the GISTIC2 thresholded by-gene copy number table for one cancer type.

    Parameters:
    cancer_type (str): Name of the folder, one level above the working
        directory, that holds the cancer type's GISTIC2 file.

    Returns:
    pandas.DataFrame: The tab-separated file parsed with pandas' default
        settings (compression is inferred from the ".gz" extension).
    """
    gistic_path = os.path.join(
        "..",
        cancer_type,
        "Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes.gz",
    )
    return pd.read_csv(gistic_path, sep="\t")

In [3]:
# Cancer types to analyze for this event, keyed by cancer type name.
# To enable another type, uncomment its name below (this assumes its data
# folder is named the same as the key -- confirm before enabling).
cnv_tables = {
    cancer_type: read_cancer_type_tcga(cancer_type)
    for cancer_type in (
        "brca",
        # "colon",
        # "hnscc",
        # "lscc",
        # "luad",
        # "ovarian",
    )
}

## Append Gene location Data

In [4]:
# Gene location table from the project's cnvutils helper. It is joined onto each
# CNV table by index in the next cell; the event-calling cell later reads the
# `chromosome`, `start_bp`, `end_bp` and `arm` columns, which presumably come
# from this table -- TODO confirm against cnvutils.get_gene_locations().
locations = cnvutils.get_gene_locations()

In [6]:
# Re-index every CNV table by (Name, Database_ID) and attach the gene locations.
for cancer_type in cnv_tables.keys():

    df = cnv_tables[cancer_type].copy()

    # Split each "Gene Symbol" on the first "|" into a name part and an
    # optional database-ID part.
    idx_split = df["Gene Symbol"].str.split("|", n=1, expand=True)
    # With expand=True the result only has as many columns as the widest split,
    # so column 1 does not exist when no symbol in the table contains "|".
    # Force both columns to be present so the lookup of column 1 cannot fail.
    idx_split = idx_split.reindex(columns=[0, 1])
    # Symbols without a database ID get the placeholder "unknown".
    idx_split = idx_split.fillna("unknown")

    df = df.drop(columns="Gene Symbol")
    df = df.assign(
        Name=idx_split[0],
        Database_ID=idx_split[1]
    )
    df = df.set_index(["Name", "Database_ID"])

    # Index-on-index join; presumably `locations` is indexed by the same
    # (Name, Database_ID) pair -- verify against cnvutils.get_gene_locations().
    df = df.join(locations)

    cnv_tables[cancer_type] = df

## Get percentage with event

In [9]:
# Minimum fraction of genes inside an event region that must show the change
# for a sample to be counted as having the event.
CUTOFF = 0.8

# Previous event ranges, kept for reference in case they are needed again:
#   gain: 80794385 - 130794385
#   loss: 0 - 30794385

# Current event ranges (30 Sep 2020). They are compared against the start_bp /
# end_bp of chromosome 8 genes when the events are called.
GAIN_EVENT_START, GAIN_EVENT_END = 52_110_839, 145_052_465
LOSS_EVENT_START, LOSS_EVENT_END = 202_660, 37_421_341

In [10]:
def has_gain_event(row, cutoff):
    """Decide whether enough of the given values are gains (> 0).

    Parameters:
    row (iterable of numbers): Copy number values to inspect. Despite the
        name, the notebook passes one DataFrame column at a time through
        DataFrame.apply.
    cutoff (float): Minimum fraction of the values that must be > 0.

    Returns:
    bool: True if the fraction of values > 0 is at least `cutoff`. Values
        that are not > 0 (including NaN) still count toward the total.
        False for an empty input.
    """
    values = list(row)
    if not values:
        # Nothing to evaluate (e.g. the selected region contains no genes):
        # report "no event" instead of dividing by zero below.
        return False
    gained = sum(1 for x in values if x > 0)
    return gained / len(values) >= cutoff

In [11]:
def has_loss_event(row, cutoff):
    """Decide whether enough of the given values are losses (< 0).

    Parameters:
    row (iterable of numbers): Copy number values to inspect. Despite the
        name, the notebook passes one DataFrame column at a time through
        DataFrame.apply.
    cutoff (float): Minimum fraction of the values that must be < 0.

    Returns:
    bool: True if the fraction of values < 0 is at least `cutoff`. Values
        that are not < 0 (including NaN) still count toward the total.
        False for an empty input.
    """
    values = list(row)
    if not values:
        # Nothing to evaluate (e.g. the selected region contains no genes):
        # report "no event" instead of dividing by zero below.
        return False
    lost = sum(1 for x in values if x < 0)
    return lost / len(values) >= cutoff

In [13]:
# Columns added by the gene-location join; everything else is treated as
# per-sample copy number values.
location_cols = ['chromosome', 'start_bp', 'end_bp', 'arm']

for cancer_type, df in cnv_tables.items():
    # Restrict to genes on chromosome 8
    chr8 = df.loc[df.chromosome == '8']

    # Gain: genes lying strictly inside the gain region, then one call per column
    in_gain_region = (chr8.start_bp > GAIN_EVENT_START) & (chr8.end_bp < GAIN_EVENT_END)
    gain_values = chr8.loc[in_gain_region].drop(columns=location_cols)
    gain_event = gain_values.apply(has_gain_event, args=(CUTOFF,))

    # Loss: same procedure for the loss region
    in_loss_region = (chr8.start_bp > LOSS_EVENT_START) & (chr8.end_bp < LOSS_EVENT_END)
    loss_values = chr8.loc[in_loss_region].drop(columns=location_cols)
    loss_event = loss_values.apply(has_loss_event, args=(CUTOFF,))

    # Combine both calls into one table and save it next to the notebook
    event_table = pd.DataFrame({'gain_event': gain_event, 'loss_event': loss_event})
    event_table.to_csv(f'{cancer_type}_has_event.tsv', sep='\t')

In [16]:
# `event_table` is whatever the last iteration of the loop above produced,
# i.e. the table for the last cancer type in `cnv_tables`.
event_table.shape

(1080, 2)

In [15]:
# Number of rows flagged with a gain event (each True counts as 1).
event_table["gain_event"].sum()

543

In [17]:
# Number of rows flagged with a loss event (each True counts as 1).
event_table["loss_event"].sum()

533

In [14]:
# Display the gain/loss calls, one row per sample ID.
event_table

Unnamed: 0,gain_event,loss_event
TCGA-3C-AAAU-01,True,False
TCGA-3C-AALI-01,True,False
TCGA-3C-AALJ-01,True,True
TCGA-3C-AALK-01,False,False
TCGA-4H-AAAK-01,False,True
TCGA-5L-AAT0-01,False,False
TCGA-5L-AAT1-01,False,False
TCGA-5T-A9QA-01,True,True
TCGA-A1-A0SB-01,False,False
TCGA-A1-A0SD-01,False,False
