# Define Events

## Setup

In [2]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import cptac
import numpy as np
import cnvutils

In [3]:
# Fraction (not a percentage, despite the name) used to derive each cancer's
# event threshold: the cutoffs cell computes len(cnv[cancer]) * CUTOFF_PERCENT,
# and a `value` in the counts table must exceed that product to belong to an event.
CUTOFF_PERCENT = .2
# Cancer types iterated when locating loss and gain events. The CNV-loading
# cell also loads CCRCC, ENDO and GBM, which are not listed here.
CANCER_TYPES = ['BRCA', 'COLON', 'HNSCC', 'LSCC', 'LUAD', 'OVARIAN']

In [4]:
# CNV table for each CPTAC cohort, keyed by cancer-type label. The cohorts are
# constructed and queried one after another, in the order listed here.
# NOTE(review): CCRCC, ENDO and GBM are loaded but are not in CANCER_TYPES --
# confirm whether those three loads are still needed.
_cohort_classes = {
    'BRCA': cptac.Brca,
    'CCRCC': cptac.Ccrcc,
    'COLON': cptac.Colon,
    'ENDO': cptac.Endometrial,
    'GBM': cptac.Gbm,
    'HNSCC': cptac.Hnscc,
    'LSCC': cptac.Lscc,
    'LUAD': cptac.Luad,
    'OVARIAN': cptac.Ovarian,
}
cnv = {
    label: cohort_cls().get_CNV()
    for label, cohort_cls in _cohort_classes.items()
}

Checking that hnscc index is up-to-date...      



Checking that lscc index is up-to-date... 



Checking that luad index is up-to-date...



                                            

In [5]:
# Read the tab-separated counts table, keeping only the listed columns.
counts_columns = ['Name', 'Database_ID', 'start_bp', 'end_bp', 'variable', 'value', 'cancer']
counts = pd.read_csv(
    "cnv_counts.tsv",
    sep='\t',
    usecols=counts_columns,
)

In [6]:
# Per-cancer event threshold: the number of rows in that cancer's CNV table
# scaled by CUTOFF_PERCENT.
cutoffs = {}
for cancer_type, df in cnv.items():
    cutoffs[cancer_type] = len(df) * CUTOFF_PERCENT

## Find Loss Regions

In [7]:
# Locate loss events for each cancer type.
#
# For one cancer at a time, the 'loss' rows of `counts` are ordered by start_bp
# and scanned for maximal runs of consecutive rows whose `value` exceeds that
# cancer's cutoff. Each run is stored as a (start_bp, length_in_bp) tuple in
# loss_event_locations[cancer].
#
# NOTE(review): rows are ordered by start_bp alone, so this presumably expects
# `counts` to cover a single chromosome -- confirm against cnv_counts.tsv.
loss_event_locations = dict()
for cancer in CANCER_TYPES:
    is_loss_row = (counts.variable == 'loss') & (counts.cancer == cancer)
    df_loss = counts[is_loss_row].sort_values('start_bp')
    exceeds_cutoff = list(df_loss.value > cutoffs[cancer])
    n_rows = len(exceeds_cutoff)

    # Collect runs as half-open row ranges [first, stop): `stop` is always the
    # row just past the run, whether the run is interrupted by a row at or
    # below the cutoff or reaches the end of the table. (Previously an
    # interrupted run ended at the interrupting row but a trailing run ended
    # at its own last row, so the two cases measured different things.)
    loss_events = list()
    run_first = None
    for i, is_above in enumerate(exceeds_cutoff):
        if is_above:
            if run_first is None:
                run_first = i
        elif run_first is not None:
            loss_events.append((run_first, i))
            run_first = None
    if run_first is not None:
        loss_events.append((run_first, n_rows))

    event_locations = list()
    for first, stop in loss_events:
        start_bp = df_loss.iloc[first].start_bp
        if stop < n_rows:
            # Interrupted run: it extends up to the start of the interrupting row.
            end_bp = df_loss.iloc[stop].start_bp
        else:
            # Trailing run: there is no following row, so extend to the end of
            # the last row in the run instead of stopping at its start (which
            # truncated the event and gave a one-row trailing run length 0).
            last_row = df_loss.iloc[stop - 1]
            end_bp = last_row.end_bp
            if not end_bp > last_row.start_bp:
                # end_bp is missing (NaN) or not past start_bp: fall back to start_bp.
                end_bp = last_row.start_bp
        event_locations.append((start_bp, end_bp - start_bp))
    loss_event_locations[cancer] = event_locations

In [22]:
# Spot-check the LUAD loss events; each entry is a (start_bp, length) tuple.
# NOTE(review): this cell's execution count (In [22]) is out of sequence with
# its neighbours -- re-run the notebook top to bottom before trusting the output.
loss_event_locations["LUAD"]

[(232137.0, 40298453.0)]

In [8]:
# Flatten every cancer's loss events into boundary markers: (position, 1) where
# an event begins and (position, 0) where it ends, then order them by position.
# Tuple ordering places an end marker ahead of a start marker at the same position.
loss_event_patients = list()
for cancer_events in loss_event_locations.values():
    for event_start, event_length in cancer_events:
        loss_event_patients.extend([
            (event_start, 1),
            (event_start + event_length, 0),
        ])
loss_event_patients.sort()

In [23]:
# Inspect the sorted boundary markers pooled across all cancer types:
# (position, 1) marks an event start and (position, 0) marks an event end.
# NOTE(review): this cell's execution count (In [23]) is out of sequence with
# its neighbours -- re-run the notebook top to bottom before trusting the output.
loss_event_patients

[(166049.0, 1),
 (166049.0, 1),
 (166049.0, 1),
 (166049.0, 1),
 (202660.0, 1),
 (232137.0, 1),
 (35525176.0, 0),
 (37421341.0, 0),
 (38600661.0, 0),
 (38728186.0, 1),
 (38901235.0, 0),
 (38996869.0, 1),
 (39314591.0, 0),
 (39314591.0, 0),
 (39743735.0, 1),
 (39902275.0, 0),
 (40530590.0, 0),
 (40530590.0, 0),
 (40530590.0, 1),
 (41261962.0, 0),
 (41271048.0, 1),
 (41529218.0, 0),
 (41645177.0, 1),
 (41653220.0, 1),
 (41840059.0, 0),
 (41929479.0, 0),
 (43284626.0, 1),
 (43292483.0, 0),
 (43372559.0, 1),
 (43378297.0, 0),
 (66628487.0, 1),
 (66921684.0, 0),
 (119325171.0, 1),
 (119557086.0, 0),
 (123348034.0, 1),
 (123416726.0, 0)]

In [9]:
# Sweep the sorted loss boundary markers and, for every stretch between two
# distinct consecutive positions (starting from 0), record how many events are
# open across that stretch. Markers sharing a position are all applied before
# the next stretch is emitted.
loss_segments = {'start': [], 'end': [], 'counts': [], 'length': []}
open_events = 0
previous_bp = 0
for position, is_start in loss_event_patients:
    if position != previous_bp:
        loss_segments['start'].append(previous_bp)
        loss_segments['end'].append(position)
        loss_segments['counts'].append(open_events)
        loss_segments['length'].append(position - previous_bp)
        previous_bp = position
    # A flag of 1 opens an event; anything else closes one.
    open_events += 1 if is_start == 1 else -1
loss_event_data = pd.DataFrame(loss_segments)
loss_event_data

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,202660.0,4,36611.0
2,202660.0,232137.0,5,29477.0
3,232137.0,35525176.0,6,35293039.0
4,35525176.0,37421341.0,5,1896165.0
5,37421341.0,38600661.0,4,1179320.0
6,38600661.0,38728186.0,3,127525.0
7,38728186.0,38901235.0,4,173049.0
8,38901235.0,38996869.0,3,95634.0
9,38996869.0,39314591.0,4,317722.0


## Find Gain Regions

In [10]:
# Locate gain events for each cancer type.
#
# For one cancer at a time, the 'gain' rows of `counts` are ordered by start_bp
# and scanned for maximal runs of consecutive rows whose `value` exceeds that
# cancer's cutoff. Each run is stored as a (start_bp, length_in_bp) tuple in
# gain_event_locations[cancer].
#
# NOTE(review): rows are ordered by start_bp alone, so this presumably expects
# `counts` to cover a single chromosome -- confirm against cnv_counts.tsv.
gain_event_locations = dict()
for cancer in CANCER_TYPES:
    is_gain_row = (counts.variable == 'gain') & (counts.cancer == cancer)
    df_gain = counts[is_gain_row].sort_values('start_bp')
    exceeds_cutoff = list(df_gain.value > cutoffs[cancer])
    n_rows = len(exceeds_cutoff)

    # Collect runs as half-open row ranges [first, stop): `stop` is always the
    # row just past the run, whether the run is interrupted by a row at or
    # below the cutoff or reaches the end of the table. (Previously an
    # interrupted run ended at the interrupting row but a trailing run ended
    # at its own last row, so the two cases measured different things.)
    gain_events = list()
    run_first = None
    for i, is_above in enumerate(exceeds_cutoff):
        if is_above:
            if run_first is None:
                run_first = i
        elif run_first is not None:
            gain_events.append((run_first, i))
            run_first = None
    if run_first is not None:
        gain_events.append((run_first, n_rows))

    event_locations = list()
    for first, stop in gain_events:
        start_bp = df_gain.iloc[first].start_bp
        if stop < n_rows:
            # Interrupted run: it extends up to the start of the interrupting row.
            end_bp = df_gain.iloc[stop].start_bp
        else:
            # Trailing run: there is no following row, so extend to the end of
            # the last row in the run instead of stopping at its start (which
            # truncated the event and gave a one-row trailing run length 0).
            last_row = df_gain.iloc[stop - 1]
            end_bp = last_row.end_bp
            if not end_bp > last_row.start_bp:
                # end_bp is missing (NaN) or not past start_bp: fall back to start_bp.
                end_bp = last_row.start_bp
        event_locations.append((start_bp, end_bp - start_bp))
    gain_event_locations[cancer] = event_locations
#     print(cancer)
#     print(event_locations)

In [11]:
# Flatten every cancer's gain events into boundary markers: (position, 1) where
# an event begins and (position, 0) where it ends, then order them by position.
# Tuple ordering places an end marker ahead of a start marker at the same position.
gain_event_patients = list()
for cancer_events in gain_event_locations.values():
    for event_start, event_length in cancer_events:
        gain_event_patients.extend([
            (event_start, 1),
            (event_start + event_length, 0),
        ])
gain_event_patients.sort()

In [12]:
# Sweep the sorted gain boundary markers and, for every stretch between two
# distinct consecutive positions (starting from 0), record how many events are
# open across that stretch. Markers sharing a position are all applied before
# the next stretch is emitted.
gain_segments = {'start': [], 'end': [], 'counts': [], 'length': []}
open_events = 0
previous_bp = 0
for position, is_start in gain_event_patients:
    if position != previous_bp:
        gain_segments['start'].append(previous_bp)
        gain_segments['end'].append(position)
        gain_segments['counts'].append(open_events)
        gain_segments['length'].append(position - previous_bp)
        previous_bp = position
    # A flag of 1 opens an event; anything else closes one.
    open_events += 1 if is_start == 1 else -1
gain_event_data = pd.DataFrame(gain_segments)

In [13]:
# Display the gain segments ordered by their start position. The result of
# sort_values is not assigned back, so gain_event_data keeps its current order.
gain_event_data.sort_values('start')

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,232137.0,2,66088.0
2,232137.0,2935353.0,1,2703216.0
3,2935353.0,31639222.0,0,28703869.0
4,31639222.0,32192028.0,1,552806.0
5,32192028.0,33370824.0,0,1178796.0
6,33370824.0,34784028.0,1,1413204.0
7,34784028.0,36784324.0,2,2000296.0
8,36784324.0,37695782.0,3,911458.0
9,37695782.0,38163335.0,4,467553.0


In [14]:
## TODO: Determine which regions to include in each event, then generate new
## has_event files from those regions.

## Make Plot

In [15]:
# fig, ax = plt.subplots(nrows=8, sharex=True, sharey=False, gridspec_kw={'height_ratios': [1,1,1,1,1,1,10,1]})
# count = 0
# for cancer in gain_event_locations.keys():
#     ax[count].broken_barh(gain_event_locations[cancer], (0,1), facecolors='#E72121')
#     ax[count].broken_barh(loss_event_locations[cancer], (0,1), facecolors='#1B3DD2')
#     count += 1
# ax[count].bar(x= loss_event_data.start, height = loss_event_data.counts, width=loss_event_data.length, color='blue', align='edge')

# ax[count].bar(x= gain_event_data.start, height = gain_event_data.counts, width=gain_event_data.length, color='red', align='edge')
# cnvutils.make_chromosome_plot('8', ax=ax[count+1])
# ax.broken_barh(gain_event_locations['OVARIAN'], (0, 5), facecolors='red')
# ax.broken_barh(loss_event_locations['OVARIAN'], (0, 5), facecolors='blue')

In [16]:
# plt.bar(x= loss_event_data.start, height = loss_event_data.counts, width=loss_event_data.length, color='blue', align='edge')

In [17]:
# plt.bar(x= gain_event_data.start, height = gain_event_data.counts, width=gain_event_data.length, color='blue', align='edge')

In [18]:
# patients = list()
# for cancer in gain_event_locations.keys():
#     events = gain_event_locations[cancer]
#     for event in events:
#         start = event[0]
#         end = event[0] + event[1]
#         patients.append((start, 1))
#         patients.append((end, 0))
# #     patients += list(gain_event_locations[cancer])
# patients.sort()

In [19]:
# count = 0
# current_bp = 0
# # results = list()
# start = list()
# end = list()
# size = list()
# total = list()
# for patient in patients:
#     if patient[0] != current_bp:
#         start.append(current_bp)
#         end.append(patient[0])
#         size.append(patient[0]-current_bp)
#         total.append(count)
# #         results.append((current_bp, patient[0], count))
#         current_bp = patient[0]
#     if patient[1] == 1:
#         count += 1
#     else:
#         count -= 1
# data = pd.DataFrame({'start': start, 'end': end, 'counts': total, 'length': size})
# data

In [20]:
# fig, ax = plt.subplots(nrows=7)
# count = 0
# for cancer in gain_event_locations.keys():
#     ax[count].broken_barh(gain_event_locations[cancer], (0,5), facecolors='red')
#     ax[count].broken_barh(loss_event_locations[cancer], (0,5), facecolors='blue')
#     count += 1
# ax[count].bar(x= data.start, height = data.counts, width=data.length)

In [21]:
# count = 0
# current_bp = 0
# results = list()
# for patient in patients:
#     if patient[0] != current_bp:
#         results.append((current_bp, patient[0], count))
#         current_bp = patient[0]
#     if patient[1] == 1:
#         count += 1
#     else:
#         count -= 1
# results