In [1]:
import torch
import torch.nn as nn

import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..')))

from models._data import ChromatinDataset


In [2]:
from utils.data_utils import process_bed

In [16]:
import pandas as pd

def adjust_bed_length(pos_bed_file, neg_bed_file, output_pos_file, output_neg_file, length=1000):
    """
    Adjusts the length of regions in positive and negative BED files to a fixed length.
    
    Parameters:
    - pos_bed_file: str, path to the positive BED file
    - neg_bed_file: str, path to the negative BED file
    - output_pos_file: str, path to save the adjusted positive BED file
    - output_neg_file: str, path to save the adjusted negative BED file
    - length: int, the desired length for each BED region (default: 1000)
    """
    # Load the BED files
    pos_df = pd.read_csv(pos_bed_file, sep='\t', header=None)
    pos_df = pos_df.iloc[:, :3]
    pos_df.columns = ['chr', 'start', 'end']
    print(pos_df.head())
    
    # Rename the columns
    
    neg_df = pd.read_csv(neg_bed_file, sep='\t')
    neg_df = neg_df.iloc[:, :3]
    neg_df.columns = ['chr', 'start', 'end']
    print(neg_df.head())

    

    def adjust_length(df):
        """Adjust the start and end positions of the BED regions."""
        adjusted_data = []
        for _, row in df.iterrows():
            midpoint = (row['start'] + row['end']) // 2
            start = midpoint - (length // 2)
            end = midpoint + (length // 2)
            adjusted_data.append([row['chr'], start, end])
        return pd.DataFrame(adjusted_data, columns=['chr', 'start', 'end'])

    # Adjust lengths for both positive and negative BED files
    adjusted_pos_df = adjust_length(pos_df)
    adjusted_neg_df = adjust_length(neg_df)

    # Save the adjusted BED files
    adjusted_pos_df.to_csv(output_pos_file, sep='\t', header=False, index=False)
    adjusted_neg_df.to_csv(output_neg_file, sep='\t', header=False, index=False)

    print(f"Adjusted positive BED file saved to: {output_pos_file}")
    print(f"Adjusted negative BED file saved to: {output_neg_file}")


In [17]:
adjust_bed_length('/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/peaks_no_blacklist.1000.bed', 
                  '/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/output_negatives.1000.bed', 
                  '/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/peaks_no_blacklist.1000.adjusted.bed',
                  '/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/output_negatives.1000.adjusted.bed',
)

    chr      start        end
0  chr1  100027916  100029541
1  chr1  100027916  100029541
2  chr1  100027916  100029541
3  chr1  100027916  100029541
4  chr1  100027916  100029541
    chr      start        end
0  chr3  102470000  102472114
1  chr1  107112000  107114114
2  chr1   13013000   13015114
3  chr1   88577000   88579114
4  chr6   57995000   57997114
Adjusted positive BED file saved to: /gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/peaks_no_blacklist.1000.adjusted.bed
Adjusted negative BED file saved to: /gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/output_negatives.1000.adjusted.bed


In [5]:
import pyfaidx
import pyBigWig
genome = pyfaidx.Fasta("/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/downloads/hg38.fa")


In [6]:
cts_bw = pyBigWig.open("/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/own_data/ENCFF217FGC.bigWig")


In [7]:
cts_bw

<pyBigWig.bigWigFile at 0x7f359f51a0c0>

In [23]:
genome.keys()

odict_keys(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random', 'chr1_KI270708v1_random', 'chr1_KI270709v1_random', 'chr1_KI270710v1_random', 'chr1_KI270711v1_random', 'chr1_KI270712v1_random', 'chr1_KI270713v1_random', 'chr1_KI270714v1_random', 'chr2_KI270715v1_random', 'chr2_KI270716v1_random', 'chr3_GL000221v1_random', 'chr4_GL000008v2_random', 'chr5_GL000208v1_random', 'chr9_KI270717v1_random', 'chr9_KI270718v1_random', 'chr9_KI270719v1_random', 'chr9_KI270720v1_random', 'chr11_KI270721v1_random', 'chr14_GL000009v2_random', 'chr14_GL000225v1_random', 'chr14_KI270722v1_random', 'chr14_GL000194v1_random', 'chr14_KI270723v1_random', 'chr14_KI270724v1_random', 'chr14_KI270725v1_random', 'chr14_KI270726v1_random', 'chr15_KI270727v1_random', 'chr16_KI270728v1_random', 'chr17_GL0

In [2]:
dataset = ChromatinDataset(
    peak_regions="/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/peaks_no_blacklist.1000.adjusted.bed",
    nonpeak_regions="/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/output_negatives.1000.adjusted.bed",
    genome_fasta="/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/data/downloads/hg38.fa",
    cts_bw_file="/gladstone/corces/lab/users/vishvak/chrombpnet_tutorial/own_data/ENCFF217FGC.bigWig",
    inputlen=100,
    outputlen=100,
    max_jitter=50,
    negative_sampling_ratio=0.5
)


['CTAGCTTGGCCATCGTCAGGTGTCTGAATCTGAACAGTGAGAGTAGGACCACACTGCTACCGCATACCTGACTGCCTCATTCTGTGTCCCTCCTGTTCGATATAGTCTGGTTTTGCAAACTGAGGCTTAAGCCTCGGCAGCCAAGAGCAAGCCCTTGTTCCTGTTCTGATGAGGATGTACTGAATTATTTTGTATTCAGT', 'GATTTTCTGTGCAAGTAAAGGCTGCTGAAGCTGAAGCTCATTAGTTTCAGGTAAATCTGCTTCCGATTTGCAGTTCTGCTAAGGTATTTCTTGTGTAGAATATCATTTGGGCTAGGAGGAGGTAGAAATAAAGATAGAAATCTACACAGATGATGATATTTTAAAATTTGGTAAAATAGGATTTAAGTCTATACTACCCA', 'GACACAATTTGTTTTCCCCAAATACTTTTGATCCATGGTTGAATCCACGGATGTAGAAGATACAGAATCACAGATACGTAGGCCCCACTGAAACACACACACACACACCACACACACACACATATGCACACACATCTTCTCTGAGCCACAGTTTCTTGATTAATTAATATAGCAAACATATATGGAGCGCCTCAATATGC', 'AACCTCAACGTACCAGTGATTTCCAGTTCCTTTGTGGCTCAGCAACTTGTGAAGACCAACTAAGTCCTAAGTACCCAGCCTGCAGTATCAGTGTGGGAGTCGGAAACAGATGTTCACGGGACAATGCCATTTGTTGTCACTGTCGATACTATGTGGCAACTTTCCCAAATGAAATTGTTTGTGGATGGAAAAGAATGTAA', 'AATAGTAACATGCTGTACAGGTTTGTAGCCCAGGAGCGATAGACTATTCCATATAGCCTAGGTTCATGGTAGGTGCTACCATCTAGGTTTGTGTAGGTTCACTCTCTGCTGTTCACACAAGGATGAAATCACCTAATGATGCATTTCTCAGAACATGTCCCCATCATTAAGCGAAGCATGAC

RuntimeError: Invalid interval bounds!