# Define Event

Before we can analyze the event, we need to identify its boundaries. We will accomplish this by:

1. Defining the values to be classified as gains and losses
2. Creating a counts table defining the various events
3. Defining the proportion of patients with gain or loss to be considered significant
4. Identifying regions of gain and loss
5. Identifying regions where all cancer types meet criteria for gain or loss event.

## Setup

In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import cptac
import numpy as np
import cnvutils



## Part 1: Define Parameters

These are the parameters that must be set for each analysis.

In [2]:
# The cancer types to analyze are chosen in the next cell, by selecting which
# CNV tables are loaded into the `cnv` dictionary.

# The proportion of patients with gain or loss of a given gene for that gene to
# be considered part of a gain or loss event (a fraction between 0 and 1).
PATIENT_CUTOFF = 0.2
# The chromosome to be analyzed (should be a string)
CHROMOSOME = '8'
# The arm to be analyzed. Options are: 'p', 'q' or 'both'.
ARM = 'both'

# Fail fast on mistyped parameters: an unexpected value here would otherwise
# only show up later as a silently empty (or wrongly filtered) counts table.
if not 0 <= PATIENT_CUTOFF <= 1:
    raise ValueError(f"PATIENT_CUTOFF must be a fraction between 0 and 1, got {PATIENT_CUTOFF!r}")
if not isinstance(CHROMOSOME, str):
    raise TypeError(f"CHROMOSOME must be a string such as '8', got {CHROMOSOME!r}")
if ARM not in ('p', 'q', 'both'):
    raise ValueError(f"ARM must be 'p', 'q' or 'both', got {ARM!r}")

In [3]:
# Cancer type label -> cptac dataset class. Every available cancer type is listed
# for convenience; comment out the ones that should not be considered.
dataset_classes = {
    'BRCA': cptac.Brca,
#     'CCRCC': cptac.Ccrcc,
    'COLON': cptac.Colon,
#     'ENDO': cptac.Endometrial,
#     'GBM': cptac.Gbm,
    'HNSCC': cptac.Hnscc,
    'LSCC': cptac.Lscc,
    'LUAD': cptac.Luad,
    'OVARIAN': cptac.Ovarian,
}
# Instantiate each selected dataset in turn and keep the result of get_CNV(),
# keyed by cancer type label.
cnv = {
    cancer_label: dataset_class().get_CNV()
    for cancer_label, dataset_class in dataset_classes.items()
}

Checking that lscc index is up-to-date... 



Checking that luad index is up-to-date...



                                            

## Part 2: Determine Cutoffs

In [4]:
# Load the per-gene gain/loss counts table (tab separated), keeping only the
# columns used in this notebook.
# `chromosome` is read as text on purpose: it is later compared with the string
# CHROMOSOME, and letting pandas infer the type can turn numeric chromosome
# labels into integers (or a mix of integers and strings) that never equal '8'.
counts = pd.read_csv(
    "cnv_counts.tsv",
    sep='\t',
    usecols=['Name', 'Database_ID', 'start_bp', 'end_bp', 'variable', 'value', 'cancer', 'chromosome', 'arm'],
    dtype={'chromosome': str},
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Per-cancer count threshold: the number of rows in that cancer type's CNV table
# scaled by PATIENT_CUTOFF (rows are presumably patients -- confirm against cptac).
cutoffs = {}
for cancer_type, df in cnv.items():
    cutoffs[cancer_type] = PATIENT_CUTOFF * len(df)

## Part 3: Subset Event

In [6]:
# counts.cancer.unique()

In [7]:
# Keep only the genes on the chromosome (and, optionally, the arm) under study.
# Both sides of the chromosome comparison are converted to text so that labels
# parsed as integers still match CHROMOSOME instead of being dropped silently.
counts = counts[counts.chromosome.astype(str) == str(CHROMOSOME)]
if ARM != 'both':
    counts = counts[counts.arm == ARM]

In [8]:
# counts

## Find Loss Regions

In [9]:
# Locate loss events: for each cancer type, walk its genes in order of start
# position and collect every run of consecutive genes whose loss count exceeds
# that cancer type's cutoff. Each event is stored as (start_bp, length_in_bp).
df = counts
loss_event_locations = dict()
for cancer in cnv.keys():
    df_loss = df[(df.variable == 'loss') & (df.cancer == cancer)].sort_values('start_bp')
    print(len(df_loss))
    values = list(df_loss.value)
    # (first, stop) positional index pairs: genes first..stop-1 exceed the cutoff.
    loss_events = list()
    start = None  # index of the first gene of the run currently open, if any
    for i, val in enumerate(values):
        if val > cutoffs[cancer]:
            if start is None:
                start = i
        elif start is not None:
            loss_events.append((start, i))
            start = None
    if start is not None:
        # The run is still open after the last gene: close it one past the end.
        loss_events.append((start, len(values)))
    event_locations = list()
    for first, stop in loss_events:
        start_bp = df_loss.iloc[first].start_bp
        if stop < len(values):
            # The event is bounded by the start of the first gene below the cutoff.
            end_bp = df_loss.iloc[stop].start_bp
        else:
            # No gene follows the run, so end at the furthest gene end inside it.
            # (Ending at the last gene's start would cut that gene off and give a
            # single-gene trailing event a length of zero.)
            end_bp = float(df_loss.iloc[first:stop].end_bp.max())
        event_locations.append((start_bp, end_bp - start_bp))
    loss_event_locations[cancer] = event_locations

784
765
2403
837
639
572


In [10]:
# Spot-check the loss events found for LUAD; each tuple is (start_bp, length in bp).
loss_event_locations['LUAD']

[(232137.0, 40298453.0)]

In [11]:
# Flatten every loss event into two boundary markers, (start_bp, 1) where the
# event opens and (end_bp, 0) where it closes, and order them by position.
# At equal positions a closing marker (0) sorts before an opening marker (1).
loss_event_patients = sorted(
    marker
    for regions in loss_event_locations.values()
    for region_start, region_length in regions
    for marker in ((region_start, 1), (region_start + region_length, 0))
)

In [12]:
# Inspect the counts table after it has been restricted to the selected chromosome/arm.
counts

Unnamed: 0,Name,Database_ID,start_bp,end_bp,cancer,chromosome,arm,variable,value
32,AARD,ENSG00000205002.3,116938207.0,116944487.0,BRCA,8,q,gain,79
33,AARD,ENSG00000205002.3,116938207.0,116944487.0,CCRCC,8,q,gain,7
129,ABRA,ENSG00000174429.3,106759483.0,106770244.0,BRCA,8,q,gain,79
130,ABRA,ENSG00000174429.3,106759483.0,106770244.0,CCRCC,8,q,gain,7
288,ADAM18,ENSG00000168619.11,39584489.0,39730065.0,BRCA,8,p,gain,49
...,...,...,...,...,...,...,...,...,...
476195,XRCC6P4,,62855068.0,62857134.0,HNSCC,8,q,loss,2
476514,ZNF317P1,,93646066.0,93647092.0,HNSCC,8,q,loss,2
476535,ZNF705CP,,12356136.0,12359391.0,HNSCC,8,p,loss,32
476575,ZNHIT1P1,,142858354.0,142858797.0,HNSCC,8,q,loss,2


In [13]:
# Sweep over the sorted boundary markers and cut the chromosome into consecutive
# intervals, recording for each one how many loss events cover it.
segments = {'start': [], 'end': [], 'counts': [], 'length': []}
open_events = 0   # number of events covering the interval currently being built
previous_bp = 0   # left edge of the interval currently being built
for position, marker_kind in loss_event_patients:
    if position != previous_bp:
        # A new position closes the running interval with the coverage seen so far.
        segments['start'].append(previous_bp)
        segments['end'].append(position)
        segments['length'].append(position - previous_bp)
        segments['counts'].append(open_events)
        previous_bp = position
    # Marker kind 1 opens an event; anything else closes one.
    open_events += 1 if marker_kind == 1 else -1
loss_event_data = pd.DataFrame(segments)

In [14]:
# Show the intervals with the number of loss events covering each one.
loss_event_data

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,202660.0,4,36611.0
2,202660.0,232137.0,5,29477.0
3,232137.0,35525176.0,6,35293039.0
4,35525176.0,37421341.0,5,1896165.0
5,37421341.0,38600661.0,4,1179320.0
6,38600661.0,38728186.0,3,127525.0
7,38728186.0,38901235.0,4,173049.0
8,38901235.0,38996869.0,3,95634.0
9,38996869.0,39314591.0,4,317722.0


## Find Gain Regions

In [15]:
# Locate gain events: for each cancer type, walk its genes in order of start
# position and collect every run of consecutive genes whose gain count exceeds
# that cancer type's cutoff. Each event is stored as (start_bp, length_in_bp).
df = counts
gain_event_locations = dict()
for cancer in cnv.keys():
    df_gain = df[(df.variable == 'gain') & (df.cancer == cancer)].sort_values('start_bp')
    values = list(df_gain.value)
    # (first, stop) positional index pairs: genes first..stop-1 exceed the cutoff.
    gain_events = list()
    start = None  # index of the first gene of the run currently open, if any
    for i, val in enumerate(values):
        if val > cutoffs[cancer]:
            if start is None:
                start = i
        elif start is not None:
            gain_events.append((start, i))
            start = None
    if start is not None:
        # The run is still open after the last gene: close it one past the end.
        gain_events.append((start, len(values)))
    event_locations = list()
    for first, stop in gain_events:
        start_bp = df_gain.iloc[first].start_bp
        if stop < len(values):
            # The event is bounded by the start of the first gene below the cutoff.
            end_bp = df_gain.iloc[stop].start_bp
        else:
            # No gene follows the run, so end at the furthest gene end inside it.
            # (Ending at the last gene's start would cut that gene off and give a
            # single-gene trailing event a length of zero.)
            end_bp = float(df_gain.iloc[first:stop].end_bp.max())
        event_locations.append((start_bp, end_bp - start_bp))
    gain_event_locations[cancer] = event_locations

In [16]:
# Flatten every gain event into two boundary markers, (start_bp, 1) where the
# event opens and (end_bp, 0) where it closes, and order them by position.
# At equal positions a closing marker (0) sorts before an opening marker (1).
gain_event_patients = sorted(
    marker
    for regions in gain_event_locations.values()
    for region_start, region_length in regions
    for marker in ((region_start, 1), (region_start + region_length, 0))
)

In [17]:
# Sweep over the sorted boundary markers and cut the chromosome into consecutive
# intervals, recording for each one how many gain events cover it.
segments = {'start': [], 'end': [], 'counts': [], 'length': []}
open_events = 0   # number of events covering the interval currently being built
previous_bp = 0   # left edge of the interval currently being built
for position, marker_kind in gain_event_patients:
    if position != previous_bp:
        # A new position closes the running interval with the coverage seen so far.
        segments['start'].append(previous_bp)
        segments['end'].append(position)
        segments['length'].append(position - previous_bp)
        segments['counts'].append(open_events)
        previous_bp = position
    # Marker kind 1 opens an event; anything else closes one.
    open_events += 1 if marker_kind == 1 else -1
gain_event_data = pd.DataFrame(segments)

In [18]:
# Show the intervals with the number of gain events covering each one.
gain_event_data

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,232137.0,2,66088.0
2,232137.0,2935353.0,1,2703216.0
3,2935353.0,31639222.0,0,28703869.0
4,31639222.0,32192028.0,1,552806.0
5,32192028.0,33370824.0,0,1178796.0
6,33370824.0,34784028.0,1,1413204.0
7,34784028.0,36784324.0,2,2000296.0
8,36784324.0,37695782.0,3,911458.0
9,37695782.0,38163335.0,4,467553.0
