In [None]:
Convert the 147 bp DNA sequence centred on each nucleosome position of every chromosome into
a 16 × 146 one-hot matrix over the 16 dinucleotides, and save the matrices together with their labels as one HDF5 file per chromosome.

In [1]:
import h5py, numpy as np, pandas as pd
from tqdm import tqdm

In [2]:
# Chromosome 1: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH1n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch1.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr1-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 1682/1682 [00:00<00:00, 11411.04it/s]


In [3]:
# Chromosome 2: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH2n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch2.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr2-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 7484/7484 [00:00<00:00, 11655.43it/s]


In [4]:
# Chromosome 3: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH3n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch3.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr3-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 2449/2449 [00:00<00:00, 12096.56it/s]


In [5]:
# Chromosome 4: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH4n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch4.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr4-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|███████████████████████████████████████████████████████████████| 13784/13784 [00:01<00:00, 11974.81it/s]


In [6]:
# Chromosome 5: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH5n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch5.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr5-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 5150/5150 [00:00<00:00, 11746.66it/s]


In [7]:
# Chromosome 6: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH6n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch6.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr6-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 2364/2364 [00:00<00:00, 12410.16it/s]


In [8]:
# Chromosome 7: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH7n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch7.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr7-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 9833/9833 [00:00<00:00, 11617.95it/s]


In [9]:
# Chromosome 8: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH8n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch8.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr8-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 4846/4846 [00:00<00:00, 11949.98it/s]


In [10]:
# Chromosome 9: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH9n.txt'   # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch9.csv'            # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr9-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 3824/3824 [00:00<00:00, 12225.63it/s]


In [11]:
# Chromosome 10: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH10n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch10.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr10-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 6595/6595 [00:00<00:00, 11730.64it/s]


In [12]:
# Chromosome 11: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH11n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch11.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr11-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 6326/6326 [00:00<00:00, 12129.73it/s]


In [13]:
# Chromosome 12: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH12n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch12.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr12-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 9319/9319 [00:00<00:00, 12270.50it/s]


In [14]:
# Chromosome 13: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH13n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch13.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr13-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 8482/8482 [00:00<00:00, 11782.45it/s]


In [15]:
# Chromosome 14: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH14n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch14.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr14-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 7064/7064 [00:00<00:00, 12042.29it/s]


In [16]:
# Chromosome 15: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH15n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch15.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr15-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 9974/9974 [00:00<00:00, 12186.83it/s]


In [17]:
# Chromosome 16: one-hot encode the dinucleotides of every 147 bp nucleosome
# window and store the matrices with their labels in an HDF5 file.
GENOME   = 'SAC1 Genome Deletion of Spaces/CH16n.txt'  # chromosome sequence (plain text)
CSV_FILE = 'Classified Chromosomes/ch16.csv'           # needs 'site' and 'value' columns
H5_OUT   = 'Nucleosome Digital Matrix/chr16-Digital Matrix.h5'
BASE     = ['AA','AC','AG','AT','CA','CC','CG','CT',
            'GA','GC','GG','GT','TA','TC','TG','TT']
SEQ_LEN  = 147
DIN_LEN  = SEQ_LEN - 1        # overlapping dinucleotides in one window
HALF_WIN = SEQ_LEN // 2       # 73 bases on either side of the centre base
# Dinucleotide -> row index, built once so the inner loop needs a single dict
# lookup instead of a list membership test followed by list.index().
DIN_ROW  = {din: row for row, din in enumerate(BASE)}

df       = pd.read_csv(CSV_FILE)
sites    = df['site'].to_numpy(dtype=int)
labels   = df['value'].to_numpy(dtype=np.float32)
num_seq  = sites.size

with open(GENOME) as f:
    # Remove every whitespace character, not just leading/trailing ones, so a
    # string offset equals a base offset even if the file is line-wrapped.
    genome = ''.join(f.read().split()).upper()

# nuar[i, r, j] == 1 when dinucleotide BASE[r] starts at offset j of window i.
nuar = np.zeros((num_seq, len(BASE), DIN_LEN), dtype=np.float32)
for i, pos in enumerate(tqdm(sites, desc='Encoding')):
    start = pos - 1 - HALF_WIN      # the slice treats `site` as a 1-based centre
    if start < 0:
        # A negative slice start counts from the END of the string, so the
        # window would wrap around (and for very negative sites still be
        # SEQ_LEN long); reject it explicitly instead.
        raise ValueError(f'Position {pos} is too close to the sequence start '
                         f'for a {SEQ_LEN} bp window')
    seq = genome[start : pos + HALF_WIN]
    if len(seq) != SEQ_LEN:
        raise ValueError(f'Position {pos} length mismatch: {len(seq)}')
    for j in range(DIN_LEN):
        row = DIN_ROW.get(seq[j:j+2])
        if row is not None:         # pairs outside BASE leave the column all-zero
            nuar[i, row, j] = 1

with h5py.File(H5_OUT, 'w') as h5:
    h5.create_dataset('Nuar',  data=nuar,   dtype=np.float32)
    h5.create_dataset('label', data=labels, dtype=np.float32)

Encoding: 100%|█████████████████████████████████████████████████████████████████| 8569/8569 [00:00<00:00, 12216.54it/s]
