# Preload Labels for 3KB Windows

In this notebook we use existing peak calls from the **ENCODE e11.5 face and hindbrain** datasets to search for differentially accessible peaks.

In [1]:
%load_ext autoreload
%autoreload 2

import bbi
import json
import numpy as np
import os
import sqlite3
import sys
import warnings

**Configurations**

In [65]:
# Upper bound on the number of positive and (separately) negative labels stored per search
MAX_PRELOADED_LABELS = 100
# Number of searches (ids 1..N) that get preloaded labels
NUM_SEARCHES_TO_BE_PRELOADED = 10
# NOTE(review): nothing in the visible cells reads CLEAR_DB; the database reset
# is controlled by the `clear_db` checkbox widget further down. Confirm whether
# this constant is still needed.
CLEAR_DB = True

###########################################
# Only change if you know what you're doing
###########################################

base = "../"
settings_filepath = "config-user-study-encode-e11-5-face-hindbrain.json"
# Window length and stride (in base pairs) used to tile the chromosome
window_size = 3000
step_size = 1500
resolution = 25
# 1395142003 is the absolute offset of chr10
target_from = 1395142003 + 57039000
target_to = 1395142003 + 57042000

# The search target has to span exactly one window
assert target_to - target_from == window_size

# Minimum value to consider a peak annotation a peak for differential accessible peak annotations
min_peak_val_diff = 0.75
# Minimum value to consider a peak annotation a peak for equally accessible peak annotations
min_peak_val_same = 1

with open(os.path.join(base, settings_filepath), "r") as f:
    settings = json.load(f)

# The signal windows are extracted with a stride of
# `window_size // settings['step_freq']`, whereas the peak windows use
# `step_size`. Both tilings are combined element-wise later on, so they have to
# describe the same windows.
assert step_size == window_size // settings['step_freq'], (
    "step_size must equal window_size // settings['step_freq']"
)

signal_face = "data/ENCFF373NJX.bigWig"
signal_hindbrain = "data/ENCFF943PHW.bigWig"

narrow_peaks_face = "data/ENCFF545ITR.bigBed"
narrow_peaks_hindbrain = "data/ENCFF007GMX.bigBed"

broad_peaks_face = "data/ENCFF285BLZ.bigBed"
# NOTE(review): this is the same file as `narrow_peaks_hindbrain` -- looks like a
# copy-paste slip; confirm the broad-peak accession. Neither `broad_peaks_*`
# variable is used in the visible cells.
broad_peaks_hindbrain = "data/ENCFF007GMX.bigBed"

In [3]:
# Silence every warning so that the cell outputs stay readable
warnings.filterwarnings('ignore')

# Make the parent directory as well as its `experiments` and `server`
# sub-directories importable (each path is appended at most once)
for relative_dir in ('..', '../experiments', '../server'):
    module_path = os.path.abspath(relative_dir)
    if module_path not in sys.path:
        sys.path.append(module_path)

### Extract windows

In [4]:
from server.bigwig import chunk

# Run the same extraction for both tissues (face first, then hindbrain).
# NOTE(review): the fourth positional argument is presumably the step size in
# base pairs -- confirm against `server.bigwig.chunk`.
windows_face, windows_hindbrain = (
    chunk(
        signal_path,
        window_size,
        resolution,
        window_size // settings['step_freq'],
        settings['chroms'],
        verbose=True,
    )
    for signal_path in (signal_face, signal_hindbrain)
)

Extracted 87129 windows from chr10 with a max value of 1.0.
Extracted 87129 windows from chr10 with a max value of 1.0.


**Get the max signal per window**

In [5]:
# Reduce every window (row) to its highest signal value, per tissue
max_signal_face, max_signal_hindbrain = (
    np.max(tissue_windows, axis=1)
    for tissue_windows in (windows_face, windows_hindbrain)
)

### Find differentially accessible peaks much faster

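`chunk_beds()` summarizes each window into 11 integer bins derived from the peak annotations. The cells below treat a window as containing an annotation, i.e., a peak, if any of its central bins (`2:9`) is `1`, and as not containing one if all of them are `0`.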

In [58]:
def chunk_beds(bigbed, bins=11):
    """Summarize a bigBed file into fixed-size bins for every sliding window.

    Windows of ``window_size`` base pairs are tiled every ``step_size`` base
    pairs across the first chromosome listed in ``settings['chroms']``. Only
    that chromosome is processed.

    Args:
        bigbed: Path of the bigBed file, handed to ``bbi`` as is.
        bins: Number of bins each window is summarized into. Defaults to 11,
            which the ``[:, 2:9]`` slices in the following cells rely on.

    Returns:
        The result of ``bbi.stackup`` cast to ``int``: one row per window and
        ``bins`` columns. Missing and out-of-bounds values are filled with 0.
    """
    chrom = settings['chroms'][0]
    chrom_size = bbi.chromsizes(bigbed)[chrom]
    num_total_windows = int(np.ceil((chrom_size - window_size) / step_size)) + 1

    start_pos = np.arange(num_total_windows) * step_size
    end_pos = start_pos + window_size

    return bbi.stackup(
        bigbed,
        # Repeat only the processed chromosome so that the chromosome list
        # always has one entry per window, even if several chromosomes are
        # configured in the settings.
        [chrom] * num_total_windows,
        start_pos,
        end_pos,
        bins=bins,
        missing=0,
        oob=0,
    ).astype(int)

In [61]:
# Binned narrow-peak annotations per window, face first and hindbrain second
peaks_face, peaks_hindbrain = map(
    chunk_beds, (narrow_peaks_face, narrow_peaks_hindbrain)
)

In [62]:
# Only the central bins (columns 2..8) of every window are inspected
core_face = peaks_face[:, 2:9]
core_hindbrain = peaks_hindbrain[:, 2:9]

# Per-window maximum over the central bins
has_peak_face = np.max(core_face, axis=1)
has_peak_hindbrain = np.max(core_hindbrain, axis=1)

print('Face peaks: {}'.format(np.sum(has_peak_face)))
print('Hindbrain peaks: {}'.format(np.sum(has_peak_hindbrain)))

# Sum of both per-window maxima and absolute difference of the per-window bin sums
num_tissues_with_peak = has_peak_face + has_peak_hindbrain
bin_sum_gap = np.abs(np.sum(core_face, axis=1) - np.sum(core_hindbrain, axis=1))

# Differential: the maxima add up to 1 and the bin sums differ by more than 2
diff_peaks = (num_tissues_with_peak == 1) & (bin_sum_gap > 2)
print('Diff peaks: {}'.format(np.sum(diff_peaks)))

# Same: the maxima add up to 2
same_peaks = num_tissues_with_peak == 2
print('Same peaks: {}'.format(np.sum(same_peaks)))

diff_peaks_win_ids = np.flatnonzero(diff_peaks)
same_peaks_win_ids = np.flatnonzero(same_peaks)

Face peaks: 11008
Hindbrain peaks: 8347
Diff peaks: 1762
Same peaks: 5130


In [66]:
# Keep differential windows whose max signal reaches the threshold in at least one tissue
reaches_diff_threshold = (
    (max_signal_face >= min_peak_val_diff)
    | (max_signal_hindbrain >= min_peak_val_diff)
)
diff_peaks_with_max = diff_peaks & reaches_diff_threshold
diff_peaks_with_max_ids = np.flatnonzero(diff_peaks_with_max)

print('Diff peaks with max val >= {}: {}'.format(
    min_peak_val_diff, np.sum(diff_peaks_with_max)
))

# Same filter for the equally accessible windows, using their own threshold
reaches_same_threshold = (
    (max_signal_face >= min_peak_val_same)
    | (max_signal_hindbrain >= min_peak_val_same)
)
same_peaks_with_max = same_peaks & reaches_same_threshold
same_peaks_with_max_ids = np.flatnonzero(same_peaks_with_max)

print('Same peaks with max val >= {}: {}'.format(
    min_peak_val_same, np.sum(same_peaks_with_max)
))

Diff peaks with max val >= 0.75: 55
Same peaks with max val >= 1: 1201


### Preload Search DB with some Labels

Preload at most `MAX_PRELOADED_LABELS` positive labels (differentially accessible peaks) and at most `MAX_PRELOADED_LABELS` negative labels (equally accessible peaks) per search. We cap the number so that negative examples are not overrepresented, as there seem to be many more peaks that are equally accessible.

In [15]:
from ipywidgets.widgets import Checkbox

# Safety switch for the database preload cell below: that cell reads
# `clear_db.value` and leaves an existing database untouched while the box is
# unchecked (the default).
clear_db = Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')
# Bare last expression so that the notebook renders the widget
clear_db

Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')

In [74]:
from server.config import Config
from server.database import DB

db_path = os.path.join(base, settings["db_path"])


def _sample_window_ids(candidate_ids):
    """Draw up to MAX_PRELOADED_LABELS distinct window ids at random."""
    num_labels = min(candidate_ids.size, MAX_PRELOADED_LABELS)
    return np.random.choice(candidate_ids, num_labels, replace=False)


def _insert_labels(db, search_id, window_ids, is_positive):
    """Store one classification row per window id for the given search."""
    db.executemany(
        """
            INSERT INTO
                classification(search_id, window_id, is_positive)
            VALUES
                (?, ?, ?);
        """,
        [
            (int(search_id), int(window_id), is_positive)
            for window_id in window_ids
        ],
    )


if os.path.exists(db_path) and not clear_db.value:
    print('Database already exist. Check above to delete!')
else:
    # This branch is also reached when no database exists yet, in which case
    # there is nothing to delete and `os.remove` would raise.
    if os.path.exists(db_path):
        os.remove(db_path)
    # NOTE(review): presumably (re)creates the database schema -- confirm
    # against `server.database.DB`.
    DB(db_path=db_path, clear=True)

    # Using the connection as a context manager only wraps a transaction and
    # does not close it, hence the explicit close in `finally`.
    db = sqlite3.connect(db_path)
    try:
        for search_id in range(1, NUM_SEARCHES_TO_BE_PRELOADED + 1):
            db.execute(
                """
                    INSERT INTO
                        search(id, target_from, target_to, config)
                    VALUES
                        (?, ?, ?, ?);
                """,
                (int(search_id), int(target_from), int(target_to), json.dumps(settings)),
            )

            # Positive labels: differentially accessible windows
            _insert_labels(
                db, search_id, _sample_window_ids(diff_peaks_with_max_ids), 1
            )
            # Negative labels: equally accessible windows
            _insert_labels(
                db, search_id, _sample_window_ids(same_peaks_with_max_ids), -1
            )

            # Persist every search together with its labels before moving on
            db.commit()
    finally:
        db.close()

**Make sure to start the server first!**

In [75]:
import requests
import time

# Walk through the preloaded searches from the highest id down to 1.
# NOTE(review): the first request presumably triggers the classifier for the
# search and the second one its progress computation -- confirm against the
# server API.
for search_id in reversed(range(1, NUM_SEARCHES_TO_BE_PRELOADED + 1)):
    endpoint_urls = (
        f'http://localhost:5000/api/v1/classifier/?s={search_id}',
        f'http://localhost:5000/api/v1/progress/?s={search_id}&u=1',
    )
    for endpoint_url in endpoint_urls:
        r = requests.post(url=endpoint_url)
        # Wait five seconds after every request before sending the next one
        time.sleep(5)