# Define Event Boundaries

This notebook shows where the gain and loss regions are in a chromosome, so you can decide which specific boundaries to define for a particular event observed in the plots.

Use the table to determine what boundaries make the most sense, referencing the plots from the previous notebooks for guidance. Also keep in mind where the centromere is.

For example, if I were looking at 6 cancer types, I would ideally choose a boundary that includes every region where all 6 cancer types have the event, while excluding regions where only a few do. However, the regions in the table won't always coincide neatly with the plot, so sometimes you need to include a region with just 5 cancer types in the boundary to more closely match what you see.

## Setup

In [67]:
import cnvutils
import json
import numpy as np
import os
import pandas as pd
import seaborn as sns
import altair as alt

In [2]:
# Load the per-chromosome analysis parameters and pull out the three
# settings this notebook relies on.
chr_params = cnvutils.load_params(os.path.join("data", "chr_params.json"))
CANCER_TYPES, CHROMOSOME, CUTOFF_PERCENT = (
    chr_params[key]
    for key in ("CHR_CANCER_TYPES", "CHROMOSOME", "GENE_CNV_PROPORTION_CUTOFF")
)

In [3]:
# Tab-separated CNV counts for this chromosome; the chromosome name is
# left-padded with zeros to two characters in the file name.
counts_path = os.path.join("data", f"chr{CHROMOSOME:0>2}_cnv_counts.tsv")
counts = pd.read_csv(counts_path, sep='\t')

In [4]:
# Per-cancer-type threshold: the cancer type's total patient count (taken
# from the first matching row of the counts table) scaled by CUTOFF_PERCENT.
cutoffs = {}

for cancer_type in CANCER_TYPES:
    is_cancer = counts["cancer"] == cancer_type
    total_patients = counts.loc[is_cancer, "cancer_type_total_patients"].iloc[0]
    cutoffs[cancer_type] = total_patients * CUTOFF_PERCENT

## Find Loss Regions

In [5]:
# For every cancer type, walk the 'loss' rows in start_bp order and collect
# the contiguous runs whose value exceeds that cancer type's cutoff.
# Each run is stored as (start_bp, length in bp).
df = counts
loss_event_locations = {}
for cancer in CANCER_TYPES:
    is_loss = (df.variable == 'loss') & (df.cancer == cancer)
    df_loss = df[is_loss].sort_values('start_bp')
    values = list(df_loss.value)
    threshold = cutoffs[cancer]

    # Runs are recorded as (index of first row above the cutoff, closing index).
    loss_events = []
    start = None
    for i, val in enumerate(values):
        above = val > threshold
        if above and start is None:
            start = i
        elif not above and start is not None:
            # The closing index is the first row that is no longer above the cutoff.
            loss_events.append((start, i))
            start = None
    if start is not None:
        # A run still open at the end of the table closes on the last row.
        loss_events.append((start, len(values) - 1))

    # Convert row indices to base-pair coordinates using each row's start_bp.
    event_locations = []
    for first, last in loss_events:
        start_bp = df_loss.iloc[first].start_bp
        end_bp = df_loss.iloc[last].start_bp
        event_locations.append((start_bp, end_bp - start_bp))
    loss_event_locations[cancer] = event_locations

In [6]:
# Inspect the loss regions found for one cancer type; each entry is
# (start_bp, length in bp).
loss_event_locations["luad"]

[(406428.0, 40124162.0)]

In [7]:
# Flatten every loss region into two boundary markers: (bp, 1) where the
# region opens and (bp, 0) where it closes, then order them by position.
# At equal positions tuple ordering puts closing markers before opening ones.
loss_event_patients = []
for events in loss_event_locations.values():
    for start, length in events:
        end = start + length
        loss_event_patients.extend([(start, 1), (end, 0)])
loss_event_patients.sort()

In [8]:
# Sweep the sorted boundary markers from position 0, emitting one row per
# stretch between consecutive distinct positions together with the number
# of regions open across that stretch.
count = 0
current_bp = 0
region_starts, region_ends, region_lengths, region_counts = [], [], [], []
for bp, marker in loss_event_patients:
    if bp != current_bp:
        # Close the stretch that ends here before applying this marker.
        region_starts.append(current_bp)
        region_ends.append(bp)
        region_lengths.append(bp - current_bp)
        region_counts.append(count)
        current_bp = bp
    # Marker 1 opens a region; anything else closes one.
    count += 1 if marker == 1 else -1
loss_event_data = pd.DataFrame({
    'start': region_starts,
    'end': region_ends,
    'counts': region_counts,
    'length': region_lengths,
})
loss_event_data

Unnamed: 0,start,end,counts,length
0,0.0,166049.0,0,166049.0
1,166049.0,406428.0,1,240379.0
2,406428.0,2935353.0,6,2528925.0
3,2935353.0,7355517.0,7,4420164.0
4,7355517.0,8701937.0,6,1346420.0
5,8701937.0,12104389.0,7,3402452.0
6,12104389.0,12721906.0,6,617517.0
7,12721906.0,31639222.0,7,18917316.0
8,31639222.0,36784324.0,6,5145102.0
9,36784324.0,37695782.0,5,911458.0


In [69]:
def _cytoband_color(stain, density):
    """Return the color name used for one cytoband row.

    Parameters:
    stain (str): the band's stain value
    density: the band's stain density; only consulted when stain is 'gpos'
    """
    if stain == 'gneg':
        return 'white'
    if stain == 'acen':
        return 'red'
    if stain == 'gpos':
        if density == 25.0:
            return 'lightgray'
        if density == 50.0:
            return 'gray'
        if density == 75.0:
            return 'darkgray'
        # Any other density falls through to black.
        return 'black'
    # Any other stain value.
    return 'lightgray'


def make_chromosome_plot_altair(
    chrm,
    genes=None,
    show_labels=True,
    title=None,
    above=True,
    arm=None,
    start_bp=None,
    end_bp=None,
):
    """ Create an Altair cytoband bar chart for a chromosome

    Parameters:
    chrm (str): The chromosome to be plotted
    genes (list or dict): a list of genes to mark on the plot; if using a dict, the key should be the color with the value as a list of genes to be marked in the given color. Not implemented yet for the Altair chart: a warning is issued if genes are passed.
    show_labels (bool): whether to show the gene names (currently unused, see genes)
    title (str): the title to show on the plot (currently unused)
    above (bool): If true labels will be placed above the plot. If false labels will be placed below. (currently unused, see genes)
    arm (str): if given, only cytobands whose arm column equals this value are kept
    start_bp (int): if given, only cytobands ending after this position are kept
    end_bp (int): if given, only cytobands starting before this position are kept

    Returns:
    The Altair bar chart built from the selected cytobands.

    Raises:
    ValueError: if chrm is not present in the cytoband data.
    """

    # Get cytoband info for the requested chromosome
    cytoband_info = cnvutils.get_cytoband_info()
    if chrm not in cytoband_info['#chromosome'].values:
        raise ValueError(f"Chromosome '{chrm}' not found in cytoband data. Make sure it's a string, not an int.")
    data = cytoband_info[cytoband_info['#chromosome'] == chrm]

    # Restrict to the requested part of the chromosome. Compare against None
    # so that a boundary of 0 is still treated as an explicit boundary.
    if arm is not None:
        data = data[data.arm == arm]
    if start_bp is not None:
        data = data[data.bp_stop > start_bp]
    if end_bp is not None:
        data = data[data.bp_start < end_bp]

    # TODO: port gene marking to Altair. Until then, say so explicitly
    # instead of silently returning a chart without the requested genes.
    if genes:
        import warnings
        warnings.warn("Gene marking is not implemented in make_chromosome_plot_altair; 'genes' was ignored.")

    # Assemble the colors and end positions for the cytobands on the chart
    colors = [
        _cytoband_color(stain, density)
        for stain, density in zip(data['stain'], data['density'])
    ]
    bp_ends = list(data['bp_stop'])

    bar_data = pd.DataFrame({
        "chromosome": chrm,
        "color": colors,
        "bp_end": bp_ends,
    })

    # NOTE(review): the "color" column is carried in the data but is not
    # bound to an encoding channel here - confirm whether that is intended.
    bars = alt.Chart(bar_data).mark_bar().encode(
        x=alt.X(
            "bp_end",
        ),
        y=alt.Y(
            "chromosome",
        ),
    )

    return bars

# Draw the cytoband chart for the chromosome configured in chr_params.json.
# The function raises ValueError if this value is not found in the cytoband
# data (its message suggests passing the chromosome as a string).
make_chromosome_plot_altair(CHROMOSOME, above=True)

## Find Gain Regions

In [9]:
# For every cancer type, walk the 'gain' rows in start_bp order and collect
# the contiguous runs whose value exceeds that cancer type's cutoff.
# Each run is stored as (start_bp, length in bp).
df = counts
gain_event_locations = {}
for cancer in CANCER_TYPES:
    is_gain = (df.variable == 'gain') & (df.cancer == cancer)
    df_gain = df[is_gain].sort_values('start_bp')
    values = list(df_gain.value)
    threshold = cutoffs[cancer]

    # Runs are recorded as (index of first row above the cutoff, closing index).
    gain_events = []
    start = None
    for i, val in enumerate(values):
        above = val > threshold
        if above and start is None:
            start = i
        elif not above and start is not None:
            # The closing index is the first row that is no longer above the cutoff.
            gain_events.append((start, i))
            start = None
    if start is not None:
        # A run still open at the end of the table closes on the last row.
        gain_events.append((start, len(values) - 1))

    # Convert row indices to base-pair coordinates using each row's start_bp.
    event_locations = []
    for first, last in gain_events:
        start_bp = df_gain.iloc[first].start_bp
        end_bp = df_gain.iloc[last].start_bp
        event_locations.append((start_bp, end_bp - start_bp))
    gain_event_locations[cancer] = event_locations

In [10]:
# Flatten every gain region into two boundary markers: (bp, 1) where the
# region opens and (bp, 0) where it closes, then order them by position.
# At equal positions tuple ordering puts closing markers before opening ones.
gain_event_patients = []
for events in gain_event_locations.values():
    for start, length in events:
        end = start + length
        gain_event_patients.extend([(start, 1), (end, 0)])
gain_event_patients.sort()

In [11]:
# Sweep the sorted boundary markers from position 0, emitting one row per
# stretch between consecutive distinct positions together with the number
# of regions open across that stretch.
count = 0
current_bp = 0
region_starts, region_ends, region_lengths, region_counts = [], [], [], []
for bp, marker in gain_event_patients:
    if bp != current_bp:
        # Close the stretch that ends here before applying this marker.
        region_starts.append(current_bp)
        region_ends.append(bp)
        region_lengths.append(bp - current_bp)
        region_counts.append(count)
        current_bp = bp
    # Marker 1 opens a region; anything else closes one.
    count += 1 if marker == 1 else -1
gain_event_data = pd.DataFrame({
    'start': region_starts,
    'end': region_ends,
    'counts': region_counts,
    'length': region_lengths,
})

In [12]:
# Display the gain table ordered by region start position.
gain_event_data.sort_values('start')

Unnamed: 0,start,end,counts,length
0,0.0,8701937.0,0,8701937.0
1,8701937.0,12104389.0,1,3402452.0
2,12104389.0,14084845.0,0,1980456.0
3,14084845.0,18170477.0,1,4085632.0
4,18170477.0,18391282.0,0,220805.0
5,18391282.0,20246165.0,1,1854883.0
6,20246165.0,22089150.0,0,1842985.0
7,22089150.0,22165140.0,1,75990.0
8,22165140.0,30384511.0,0,8219371.0
9,30384511.0,31639222.0,1,1254711.0
