# Define Event Boundaries

This notebook shows where the gain and loss regions are on a chromosome, so you can decide which specific boundaries to define for a particular event that you observed in the plots.

Use the table to determine what boundaries make the most sense, referencing the plots from the previous notebooks for guidance. Also keep in mind where the centromere is.

For example, if I were looking at 6 cancer types, I would ideally choose a boundary that included every region where all 6 cancer types had the event, while excluding regions where only a few did. However, the regions in the table won't always coincide nicely with the plot, so sometimes you need to include a region with just 5 cancer types in the boundary to more closely match what you see.

## Setup

In [1]:
import cnvutils
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [2]:
# Load the shared analysis parameters from data/params.json.
params = cnvutils.load_params(os.path.join("data", "params.json"))

# Pull out the settings this notebook uses. CUTOFF_PERCENT is multiplied by a
# cancer type's patient total when the cutoffs are computed, so it is
# presumably a fraction (0-1) rather than a percentage -- TODO confirm.
CANCER_TYPES, CHROMOSOME, CUTOFF_PERCENT = (
    params[key]
    for key in ("CANCER_TYPES", "CHROMOSOME", "GENE_CNV_PROPORTION_CUTOFF")
)

In [3]:
# Tab-separated CNV counts table. This notebook reads its `cancer`,
# `variable`, `value`, `start_bp` and `cancer_type_total_patients` columns.
counts_path = os.path.join("data", "cnv_counts.tsv")
counts = pd.read_csv(counts_path, sep="\t")

In [4]:
# Per-cancer-type threshold: a row counts as part of an event only when its
# value exceeds (total patients for that cancer type) * CUTOFF_PERCENT.
cutoffs = {}

for cancer_type in CANCER_TYPES:
    is_this_cancer = counts["cancer"] == cancer_type
    # The total is taken from the first matching row; the column is presumably
    # constant within a cancer type -- TODO confirm.
    total_patients = counts.loc[is_this_cancer, "cancer_type_total_patients"].iloc[0]
    cutoffs[cancer_type] = total_patients * CUTOFF_PERCENT

## Find Loss Regions

In [5]:
# For each cancer type, find runs of consecutive rows (ordered by start_bp)
# whose loss value exceeds that cancer type's cutoff, and record each run as a
# (start_bp, length_in_bp) tuple.
loss_event_locations = {}
for cancer in CANCER_TYPES:
    is_loss_row = (counts["variable"] == "loss") & (counts["cancer"] == cancer)
    loss_rows = counts[is_loss_row].sort_values("start_bp")
    loss_counts = loss_rows["value"].tolist()

    # Scan for maximal runs above the cutoff, stored as (first_row, end_row).
    # end_row is the first row *after* the run, except for a run that reaches
    # the end of the table, where it is the last row itself.
    # NOTE(review): because of that, a run ending on the last row and a run
    # ending one row earlier get the same end position -- confirm this is the
    # intended boundary convention.
    row_runs = []
    run_start = None
    for row, loss_count in enumerate(loss_counts):
        above_cutoff = loss_count > cutoffs[cancer]
        if above_cutoff and run_start is None:
            run_start = row
        elif not above_cutoff and run_start is not None:
            row_runs.append((run_start, row))
            run_start = None
    if run_start is not None:
        row_runs.append((run_start, len(loss_counts) - 1))

    # Translate row positions into base-pair coordinates.
    regions = []
    for first_row, end_row in row_runs:
        region_start = loss_rows.iloc[first_row]["start_bp"]
        region_end = loss_rows.iloc[end_row]["start_bp"]
        regions.append((region_start, region_end - region_start))
    loss_event_locations[cancer] = regions

In [6]:
# Spot-check one cancer type: each tuple is (region start_bp, region length in bp).
loss_event_locations["luad"]

[(232137.0, 40298453.0)]

In [7]:
# Flatten every loss region into two boundary points, (start_bp, 1) where it
# opens and (end_bp, 0) where it closes, then sort them by position so they
# can be swept left to right. Despite the name, the entries are region
# boundaries, not patients.
loss_event_patients = sorted(
    point
    for regions in loss_event_locations.values()
    for region_start, region_length in regions
    for point in ((region_start, 1), (region_start + region_length, 0))
)

In [8]:
# Sweep over the sorted boundary points and cut the chromosome into segments
# between consecutive distinct positions. Each segment records how many loss
# regions are open across it, which is effectively the number of cancer types
# showing the loss there.
open_regions = 0   # regions covering the current position
segment_start = 0  # left edge of the segment being measured
segments = []      # (start, end, open-region count, length) per segment
for position, marker in loss_event_patients:
    if position != segment_start:
        # Close the segment before applying this position's open/close
        # markers, so the count describes the stretch to the left of it.
        segments.append(
            (segment_start, position, open_regions, position - segment_start)
        )
        segment_start = position
    # A marker of 1 opens a region; any other marker closes one.
    open_regions += 1 if marker == 1 else -1

loss_event_data = pd.DataFrame({
    'start': [seg[0] for seg in segments],
    'end': [seg[1] for seg in segments],
    'counts': [seg[2] for seg in segments],
    'length': [seg[3] for seg in segments],
})
loss_event_data

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,202660.0,4,36611.0
2,202660.0,232137.0,5,29477.0
3,232137.0,2935353.0,6,2703216.0
4,2935353.0,6977649.0,7,4042296.0
5,6977649.0,7055304.0,6,77655.0
6,7055304.0,7414855.0,7,359551.0
7,7414855.0,8317736.0,6,902881.0
8,8317736.0,12104389.0,7,3786653.0
9,12104389.0,12425614.0,6,321225.0


## Find Gain Regions

In [9]:
# For each cancer type, find runs of consecutive rows (ordered by start_bp)
# whose gain value exceeds that cancer type's cutoff, and record each run as a
# (start_bp, length_in_bp) tuple.
gain_event_locations = {}
for cancer in CANCER_TYPES:
    is_gain_row = (counts["variable"] == "gain") & (counts["cancer"] == cancer)
    gain_rows = counts[is_gain_row].sort_values("start_bp")
    gain_counts = gain_rows["value"].tolist()

    # Scan for maximal runs above the cutoff, stored as (first_row, end_row).
    # end_row is the first row *after* the run, except for a run that reaches
    # the end of the table, where it is the last row itself.
    # NOTE(review): because of that, a run ending on the last row and a run
    # ending one row earlier get the same end position -- confirm this is the
    # intended boundary convention.
    row_runs = []
    run_start = None
    for row, gain_count in enumerate(gain_counts):
        above_cutoff = gain_count > cutoffs[cancer]
        if above_cutoff and run_start is None:
            run_start = row
        elif not above_cutoff and run_start is not None:
            row_runs.append((run_start, row))
            run_start = None
    if run_start is not None:
        row_runs.append((run_start, len(gain_counts) - 1))

    # Translate row positions into base-pair coordinates.
    regions = []
    for first_row, end_row in row_runs:
        region_start = gain_rows.iloc[first_row]["start_bp"]
        region_end = gain_rows.iloc[end_row]["start_bp"]
        regions.append((region_start, region_end - region_start))
    gain_event_locations[cancer] = regions

In [10]:
# Flatten every gain region into two boundary points, (start_bp, 1) where it
# opens and (end_bp, 0) where it closes, then sort them by position so they
# can be swept left to right. Despite the name, the entries are region
# boundaries, not patients.
gain_event_patients = sorted(
    point
    for regions in gain_event_locations.values()
    for region_start, region_length in regions
    for point in ((region_start, 1), (region_start + region_length, 0))
)

In [11]:
# Sweep over the sorted boundary points and cut the chromosome into segments
# between consecutive distinct positions. Each segment records how many gain
# regions are open across it, which is effectively the number of cancer types
# showing the gain there.
open_regions = 0   # regions covering the current position
segment_start = 0  # left edge of the segment being measured
segments = []      # (start, end, open-region count, length) per segment
for position, marker in gain_event_patients:
    if position != segment_start:
        # Close the segment before applying this position's open/close
        # markers, so the count describes the stretch to the left of it.
        segments.append(
            (segment_start, position, open_regions, position - segment_start)
        )
        segment_start = position
    # A marker of 1 opens a region; any other marker closes one.
    open_regions += 1 if marker == 1 else -1

gain_event_data = pd.DataFrame({
    'start': [seg[0] for seg in segments],
    'end': [seg[1] for seg in segments],
    'counts': [seg[2] for seg in segments],
    'length': [seg[3] for seg in segments],
})

In [12]:
# Display the gain segments ordered by start position. The columns are the
# segment's start and end in bp, the number of gain regions open across it
# (`counts`), and its length in bp.
gain_event_data.sort_values('start')

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,232137.0,2,66088.0
2,232137.0,2935353.0,1,2703216.0
3,2935353.0,35235475.0,0,32300122.0
4,35235475.0,36784324.0,2,1548849.0
5,36784324.0,37695782.0,3,911458.0
6,37695782.0,38163335.0,4,467553.0
7,38163335.0,38600661.0,5,437326.0
8,38600661.0,38728186.0,4,127525.0
9,38728186.0,39451045.0,5,722859.0
