# Preload Labels for 3KB Windows

In this notebook we use existing peak calls from the **ENCODE e11.5 face and hindbrain** datasets to search for differentially accessible peaks, and preload them as labels into the search database.

In [1]:
%load_ext autoreload
%autoreload 2

import bbi
import json
import numpy as np
import os
import sqlite3
import sys
import warnings

**Configurations**

In [2]:
# Upper bound on how many positive and how many negative labels are sampled
# for each preloaded search.
MAX_PRELOADED_LABELS = 100
# Number of searches to preload; search ids 0 .. N-1 are (re)populated.
NUM_SEARCHES_TO_BE_PRELOADED = 10
# When true, the database file is removed and re-created before preloading.
CLEAR_DB = True

###########################################
# Only change if you know what you're doing
###########################################

# Directory prefix that the settings file path and the settings' "db_path"
# are joined onto.
base = "../"
settings_filepath = "config-user-study-encode-e11-5-face-hindbrain.json"
# Window size and resolution handed to the window extraction helpers
# (presumably in base pairs -- TODO confirm against `server.bigwig.chunk`).
window_size = 3000
resolution = 25

# Minimum value to consider a peak annotation a peak for differential accessible peak annotations
# (compared against the per-window maximum signal of either track)
min_peak_val_diff = 0.5 
# Minimum value to consider a peak annotation a peak for equally accessible peak annotations
# (compared against the per-window maximum signal of either track)
min_peak_val_same = 1

# Load the user-study settings; the cells below read the keys "step_freq",
# "chroms", and "db_path" from it.
with open(os.path.join(base, settings_filepath), "r") as f:
    settings = json.load(f)

# Signal tracks from which the windows are extracted
signal_face = "data/ENCFF373NJX.bigWig"
signal_hindbrain = "data/ENCFF943PHW.bigWig"

# Narrow peak annotations (not referenced by the visible cells of this notebook)
narrow_peaks_face = "data/ENCFF545ITR.bigBed"
narrow_peaks_hindbrain = "data/ENCFF007GMX.bigBed"

# Broad peak annotations used to decide whether a window contains a peak
broad_peaks_face = "data/ENCFF285BLZ.bigBed"
# NOTE(review): this is the same file as `narrow_peaks_hindbrain` above, which
# looks like a copy-paste slip -- confirm the intended broad-peak accession.
broad_peaks_hindbrain = "data/ENCFF007GMX.bigBed"

In [3]:
# Silence all warnings so they do not clutter the cell outputs
warnings.filterwarnings('ignore')

# Make the parent directory and its `experiments` and `server` folders
# importable. Each directory is appended at most once, in this order.
for relative_dir in ('..', '../experiments', '../server'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

### Extract windows

In [4]:
from server.bigwig import chunk

# Positional arguments shared by both extractions: window size, resolution,
# the window size divided by the configured step frequency (presumably the
# step between consecutive windows -- confirm against `chunk`), and the
# chromosomes to process.
chunk_args = (
    window_size,
    resolution,
    window_size // settings['step_freq'],
    settings['chroms'],
)

# Extract the windows of the face track first, then those of the hindbrain track
windows_face = chunk(signal_face, *chunk_args, verbose=True)
windows_hindbrain = chunk(signal_hindbrain, *chunk_args, verbose=True)

Extracted 87129 windows from chr10 with a max value of 1.0.
Extracted 87129 windows from chr10 with a max value of 1.0.


**Get the max signal per window**

In [5]:
# Reduce every window (axis 1) to its maximum signal value, per tissue
max_signal_face, max_signal_hindbrain = (
    np.max(tissue_windows, axis=1)
    for tissue_windows in (windows_face, windows_hindbrain)
)

### Find differentially accessible peaks much faster

`chunk_beds_binary()` extracts only a binary value per window: `1` if a window contains an annotation, i.e., a peak, or `0` if not.

In [6]:
from ae.utils import chunk_beds_binary

# Positional arguments shared by both peak-annotation extractions
bed_chunk_args = (
    window_size,
    window_size // settings['step_freq'],
    settings['chroms'],
)

# One flag per window: 1 if the window contains a peak annotation, else 0
face_wins_has_peaks = chunk_beds_binary(
    broad_peaks_face, *bed_chunk_args, verbose=True
).flatten()
hindbrain_wins_has_peaks = chunk_beds_binary(
    broad_peaks_hindbrain, *bed_chunk_args, verbose=True
).flatten()

print('Face peaks: {}'.format(np.sum(face_wins_has_peaks)))
print('Hindbrain peaks: {}'.format(np.sum(hindbrain_wins_has_peaks)))

# Number of tissues (0, 1, or 2) that have a peak annotation in each window
num_tissues_with_peak = face_wins_has_peaks + hindbrain_wins_has_peaks

# Exactly one tissue has a peak -> differentially accessible window
wins_has_diff_peak = num_tissues_with_peak == 1
print('Diff peaks: {}'.format(np.sum(wins_has_diff_peak)))

# Both tissues have a peak -> equally accessible window
wins_has_same_peaks = num_tissues_with_peak == 2
print('Same peaks: {}'.format(np.sum(wins_has_same_peaks)))

diff_peaks_win_ids = np.flatnonzero(wins_has_diff_peak)
same_peaks_win_ids = np.flatnonzero(wins_has_same_peaks)

# Keep only differential windows where at least one track reaches the
# minimum signal for differential peaks
reaches_diff_threshold = (max_signal_face >= min_peak_val_diff) | (
    max_signal_hindbrain >= min_peak_val_diff
)
diff_peaks_with_max = wins_has_diff_peak & reaches_diff_threshold
diff_peaks_with_max_ids = np.flatnonzero(diff_peaks_with_max)

print('Diff peaks with max val >= {}: {}'.format(min_peak_val_diff, np.sum(diff_peaks_with_max)))

# Keep only shared windows where at least one track reaches the minimum
# signal for equally accessible peaks
reaches_same_threshold = (max_signal_face >= min_peak_val_same) | (
    max_signal_hindbrain >= min_peak_val_same
)
same_peaks_with_max = wins_has_same_peaks & reaches_same_threshold
same_peaks_with_max_ids = np.flatnonzero(same_peaks_with_max)

print('Same peaks with max val >= {}: {}'.format(min_peak_val_same, np.sum(same_peaks_with_max)))

Extracted 87129 windows from chr10 with a max value of 1.0.
Extracted 87129 windows from chr10 with a max value of 1.0.
Face peaks: 11769
Hindbrain peaks: 11257
Diff peaks: 644
Same peaks: 11191
Diff peaks with max val >= 0.5: 77
Same peaks with max val >= 1: 1572


### Preload Search DB with some Labels

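For each of the first `NUM_SEARCHES_TO_BE_PRELOADED` searches, preload at most `MAX_PRELOADED_LABELS` positive labels (randomly sampled differentially accessible peaks) and at most `MAX_PRELOADED_LABELS` negative labels (randomly sampled equally accessible peaks). We cap both numbers so that negative examples are not overrepresented, as there are many more equally accessible peaks than differentially accessible ones.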

In [7]:
from server.config import Config
from server.database import DB

db_path = os.path.join(base, settings["db_path"])

INSERT_LABEL_SQL = """
    INSERT INTO
        classification(search_id, window_id, is_positive)
    VALUES
        (?, ?, ?);
"""


def _sample_label_rows(search_id, window_ids, label):
    """Draw up to `MAX_PRELOADED_LABELS` windows and build their label rows.

    Args:
        search_id: Id of the search the labels belong to.
        window_ids: 1D NumPy array of candidate window indices.
        label: Value stored in `is_positive` (1 or -1 in this notebook).

    Returns:
        A list of `(search_id, window_id, label)` tuples of plain Python
        ints, sampled without replacement. Empty if there are no candidates.
    """
    num_samples = min(window_ids.size, MAX_PRELOADED_LABELS)
    if num_samples == 0:
        return []
    # NOTE: this draws from NumPy's global RNG; no seed is set in the visible
    # cells, so the preloaded labels differ between runs.
    sampled_ids = np.random.choice(window_ids, num_samples, replace=False)
    return [(int(search_id), int(window_id), label) for window_id in sampled_ids]


if CLEAR_DB:
    # On a fresh checkout there is no database file yet; that is fine.
    try:
        os.remove(db_path)
    except FileNotFoundError:
        pass
    DB(db_path=db_path, clear=True)
else:
    # Probe for the `classification` table and initialise the database only
    # if it is missing. The probe connection is closed before `DB` touches
    # the file; sqlite3's context manager would not close it.
    probe = sqlite3.connect(db_path)
    try:
        probe.execute("SELECT * FROM classification").fetchone()
        needs_init = False
    except sqlite3.OperationalError:
        needs_init = True
    finally:
        probe.close()

    if needs_init:
        DB(db_path=db_path, clear=CLEAR_DB)

with sqlite3.connect(db_path) as db:
    for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED):
        # Sample before touching the database so that a sampling error
        # cannot leave a search with its old labels deleted.
        positive_rows = _sample_label_rows(search_id, diff_peaks_with_max_ids, 1)
        negative_rows = _sample_label_rows(search_id, same_peaks_with_max_ids, -1)

        # Replace the labels of this search in a single transaction: the
        # deletion only becomes permanent together with the new labels.
        db.execute(
            "DELETE FROM classification WHERE search_id = ?;",
            (int(search_id),),
        )
        db.executemany(INSERT_LABEL_SQL, positive_rows + negative_rows)
        db.commit()