# Define Events

## Setup

In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cnvutils
import os

In [2]:
CUTOFF_PERCENT = .2
CANCER_TYPES = ['BRCA']#, 'COLON', 'HNSCC', 'LSCC', 'LUAD', 'OVARIAN']

In [3]:
def read_cancer_type_tcga(cancer_type):
    
    df = pd.read_csv(
        os.path.join("..", cancer_type, "Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes.gz"), 
        sep="\t"
    ).transpose()
    
    return df

In [4]:
cnv = {
    'BRCA': read_cancer_type_tcga("brca"),
#     'CCRCC': ,
#     'COLON': ,
#     'ENDO': ,
#     'GBM': ,
#     'HNSCC': ,
#     'LSCC': ,
#     'LUAD': ,
#     'OVARIAN': 
}

In [5]:
counts = pd.read_csv("cnv_counts.tsv", sep='\t', usecols=['Name', 'Database_ID', 'start_bp', 'end_bp', 'variable', 'value', 'cancer'])

In [6]:
cutoffs = dict()
for cancer_type in cnv.keys():
    df = cnv[cancer_type]
    cutoffs[cancer_type] = len(df) * CUTOFF_PERCENT

## Find Loss Regions

In [7]:
df = counts
loss_event_locations = dict()
for cancer in CANCER_TYPES:
#     df = counts[cancer]
    df_loss = df[(df.variable == 'loss') & (df.cancer == cancer)].sort_values('start_bp')
    values = list(df_loss.value)
    loss_events = list()
    start = None
    for i in range(0, len(values)):
        val = values[i]
        if val > cutoffs[cancer]:
            if start is None:
                start = i
        else:
            if start is not None:
                loss_events.append((start, i))
                start = None
    if start is not None:
        loss_events.append((start, len(values)-1))
    event_locations = list()
    for event in loss_events:
        start_bp = df_loss.iloc[event[0]].start_bp
        end_bp = df_loss.iloc[event[1]].start_bp
        event_locations.append((start_bp, end_bp-start_bp))
    loss_event_locations[cancer] = event_locations

In [8]:
loss_event_locations["BRCA"]

[(166049.0, 41763430.0), (98042086.0, 0.0)]

In [9]:
loss_event_patients = list()
for cancer in loss_event_locations.keys():
    events = loss_event_locations[cancer]
    for event in events:
        start = event[0]
        end = event[0] + event[1]
        loss_event_patients.append((start, 1))
        loss_event_patients.append((end, 0))
#     patients += list(gain_event_locations[cancer])
loss_event_patients.sort()

In [10]:
loss_event_patients

[(166049.0, 1), (41929479.0, 0), (98042086.0, 0), (98042086.0, 1)]

In [11]:
count = 0
current_bp = 0
start = list()
end = list()
size = list()
total = list()
for patient in loss_event_patients:
    if patient[0] != current_bp:
        start.append(current_bp)
        end.append(patient[0])
        size.append(patient[0]-current_bp)
        total.append(count)
        current_bp = patient[0]
    if patient[1] == 1:
        count += 1
    else:
        count -= 1
loss_event_data = pd.DataFrame({'start': start, 'end': end, 'counts': total, 'length': size})
loss_event_data

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,41929479.0,1,41763430.0
2,41929479.0,98042086.0,0,56112607.0


## Find Gain Regions

In [12]:
df = counts
gain_event_locations = dict()
for cancer in CANCER_TYPES:
    df_gain = df[(df.variable == 'gain') & (df.cancer == cancer)].sort_values('start_bp')
    values = list(df_gain.value)
    gain_events = list()
    start = None
    for i in range(0, len(values)):
        val = values[i]
        if val > cutoffs[cancer]:
            if start is None:
                start = i
        else:
            if start is not None:
                gain_events.append((start, i))
                start = None
    if start is not None:
        gain_events.append((start, len(values)-1))
    event_locations = list()
    for event in gain_events:
        start_bp = df_gain.iloc[event[0]].start_bp
        end_bp = df_gain.iloc[event[1]].start_bp
        
        event_locations.append((start_bp, end_bp-start_bp))
    gain_event_locations[cancer] = event_locations
#     print(cancer)
#     print(event_locations)

In [13]:
gain_event_patients = list()
for cancer in gain_event_locations.keys():
    events = gain_event_locations[cancer]
    for event in events:
        start = event[0]
        end = event[0] + event[1]
        gain_event_patients.append((start, 1))
        gain_event_patients.append((end, 0))
#     patients += list(gain_event_locations[cancer])
gain_event_patients.sort()

In [14]:
count = 0
current_bp = 0
# results = list()
start = list()
end = list()
size = list()
total = list()
for patient in gain_event_patients:
    if patient[0] != current_bp:
        start.append(current_bp)
        end.append(patient[0])
        size.append(patient[0]-current_bp)
        total.append(count)
#         results.append((current_bp, patient[0], count))
        current_bp = patient[0]
    if patient[1] == 1:
        count += 1
    else:
        count -= 1
gain_event_data = pd.DataFrame({'start': start, 'end': end, 'counts': total, 'length': size})

In [15]:
gain_event_data.sort_values('start')

Unnamed: 0,start,end,counts,length
0,0.0,31639222.0,0,31639222.0
1,31639222.0,32192028.0,1,552806.0
2,32192028.0,33370824.0,0,1178796.0
3,33370824.0,98042086.0,1,64671262.0
4,98042086.0,145052465.0,1,47010379.0


In [16]:
## Next we need to find a way to determine which regions to include in the
## event and make new has_event files

## Make Plot

In [17]:
# fig, ax = plt.subplots(nrows=8, sharex=True, sharey=False, gridspec_kw={'height_ratios': [1,1,1,1,1,1,10,1]})
# count = 0
# for cancer in gain_event_locations.keys():
#     ax[count].broken_barh(gain_event_locations[cancer], (0,1), facecolors='#E72121')
#     ax[count].broken_barh(loss_event_locations[cancer], (0,1), facecolors='#1B3DD2')
#     count += 1
# ax[count].bar(x= loss_event_data.start, height = loss_event_data.counts, width=loss_event_data.length, color='blue', align='edge')

# ax[count].bar(x= gain_event_data.start, height = gain_event_data.counts, width=gain_event_data.length, color='red', align='edge')
# cnvutils.make_chromosome_plot('8', ax=ax[count+1])
# ax.broken_barh(gain_event_locations['OVARIAN'], (0, 5), facecolors='red')
# ax.broken_barh(loss_event_locations['OVARIAN'], (0, 5), facecolors='blue')

In [18]:
# plt.bar(x= loss_event_data.start, height = loss_event_data.counts, width=loss_event_data.length, color='blue', align='edge')

In [19]:
# plt.bar(x= gain_event_data.start, height = gain_event_data.counts, width=gain_event_data.length, color='blue', align='edge')

In [20]:
# patients = list()
# for cancer in gain_event_locations.keys():
#     events = gain_event_locations[cancer]
#     for event in events:
#         start = event[0]
#         end = event[0] + event[1]
#         patients.append((start, 1))
#         patients.append((end, 0))
# #     patients += list(gain_event_locations[cancer])
# patients.sort()

In [21]:
# count = 0
# current_bp = 0
# # results = list()
# start = list()
# end = list()
# size = list()
# total = list()
# for patient in patients:
#     if patient[0] != current_bp:
#         start.append(current_bp)
#         end.append(patient[0])
#         size.append(patient[0]-current_bp)
#         total.append(count)
# #         results.append((current_bp, patient[0], count))
#         current_bp = patient[0]
#     if patient[1] == 1:
#         count += 1
#     else:
#         count -= 1
# data = pd.DataFrame({'start': start, 'end': end, 'counts': total, 'length': size})
# data

In [22]:
# fig, ax = plt.subplots(nrows=7)
# count = 0
# for cancer in gain_event_locations.keys():
#     ax[count].broken_barh(gain_event_locations[cancer], (0,5), facecolors='red')
#     ax[count].broken_barh(loss_event_locations[cancer], (0,5), facecolors='blue')
#     count += 1
# ax[count].bar(x= data.start, height = data.counts, width=data.length)

In [23]:
# count = 0
# current_bp = 0
# results = list()
# for patient in patients:
#     if patient[0] != current_bp:
#         results.append((current_bp, patient[0], count))
#         current_bp = patient[0]
#     if patient[1] == 1:
#         count += 1
#     else:
#         count -= 1
# results