In [1]:
import os
import re
import sys
from io import StringIO

import h5py
import numpy as np
import seaborn as sns
from Bio import SeqIO

In [2]:
# Open the fast5 file (an HDF5 container) read-only with the h5py package.
# The handle `f` is reused by the cells below, so it is left open here.
fast5path = 'test/0003d072-ec5c-436e-9ac7-9b1f2b695e00.fast5'
f = h5py.File(fast5path, mode='r')
f

OSError: Unable to open file (unable to open file: name = 'test/0003d072-ec5c-436e-9ac7-9b1f2b695e00.fast5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [3]:
# Names of the top-level groups in the HDF5 file (iterating a group yields its member names).
list(f)

['Analyses', 'Raw', 'UniqueGlobalKey']

In [4]:
# Walk the whole file and record the path of every group, subgroup and dataset.
list_name = []


def _record(name):
    """Store one visited object path; returns None so the walk keeps going."""
    list_name.append(name)


f.visit(_record)
list_name

['Analyses',
 'Analyses/Basecall_1D_000',
 'Analyses/Basecall_1D_000/BaseCalled_template',
 'Analyses/Basecall_1D_000/BaseCalled_template/Fastq',
 'Analyses/Basecall_1D_001',
 'Analyses/Basecall_1D_001/BaseCalled_template',
 'Analyses/Basecall_1D_001/BaseCalled_template/Fastq',
 'Analyses/Basecall_1D_001/BaseCalled_template/Move',
 'Analyses/Basecall_1D_001/BaseCalled_template/Trace',
 'Analyses/Basecall_1D_001/Summary',
 'Analyses/Basecall_1D_001/Summary/basecall_1d_template',
 'Analyses/RawGenomeCorrected_000',
 'Analyses/RawGenomeCorrected_000/BaseCalled_template',
 'Analyses/RawGenomeCorrected_000/BaseCalled_template/Alignment',
 'Analyses/RawGenomeCorrected_000/BaseCalled_template/Events',
 'Analyses/RawGenomeCorrected_001',
 'Analyses/RawGenomeCorrected_001/BaseCalled_template',
 'Analyses/RawGenomeCorrected_001/BaseCalled_template/Alignment',
 'Analyses/RawGenomeCorrected_001/BaseCalled_template/Events',
 'Analyses/Segmentation_000',
 'Analyses/Segmentation_000/Summary',
 'Analy

In [5]:
# Two equivalent ways to reach a nested group: a single slash-separated path,
# or one lookup per level.
for group in (f['Analyses/RawGenomeCorrected_001'],
              f['Analyses']['RawGenomeCorrected_001']):
    print(group)

<HDF5 group "/Analyses/RawGenomeCorrected_001" (1 members)>
<HDF5 group "/Analyses/RawGenomeCorrected_001" (1 members)>


In [6]:
# Names of the re-squiggle group and basecall subgroup used throughout the notebook.
corr_group = 'RawGenomeCorrected_001'
basecall_subgroup = 'BaseCalled_template'
# Note: despite its name, corrgroup_path holds the opened HDF5 group, not a path string.
corrgroup_path = f['/Analyses/' + corr_group]
corrgroup_path

<HDF5 group "/Analyses/RawGenomeCorrected_001" (1 members)>

In [7]:
# A path ending in 'template' marks the template strand; anything else is treated
# as the complementary strand further down in the notebook.
test = '/Analyses/' + corr_group + '/' + basecall_subgroup
test.endswith('template')

True

In [8]:
# Read the mapped chromosome from the attributes of the Alignment entry.
strand_path = '/Analyses/{}/{}'.format(corr_group, basecall_subgroup)
align_data = f[strand_path + '/Alignment']
chrom = align_data.attrs['mapped_chrom']
chrom

'KJ477685.1'

In [11]:
# Confirm the kernel runs Python 3 or newer (tuple comparison on version_info).
sys.version_info >= (3,)

True

### Sequence after Guppy basecall

In [12]:
# Parse the basecalled read stored in the file as a single FASTQ record.
# Approach adapted from https://github.com/rsemeraro/PyPore/blob/master/lib/fastqparser.py
base_path = 'Analyses/Basecall_1D_001/BaseCalled_template/Fastq'
# The dataset is a scalar holding bytes: read it with [()] and decode it to text.
fastq_text = f[base_path][()].decode('UTF-8')
fastq = SeqIO.read(StringIO(fastq_text), "fastq")
fastq

SeqRecord(seq=Seq('AGTGTACTTCGTTCAGTTGCTTCAATTTAGGTGTTTAACCGTTTTCGCATTTAT...TTC', SingleLetterAlphabet()), id='0003d072-ec5c-436e-9ac7-9b1f2b695e00', name='0003d072-ec5c-436e-9ac7-9b1f2b695e00', description='0003d072-ec5c-436e-9ac7-9b1f2b695e00 runid=d91ad31c79c775c842fbd9190b78583d166c15a0 sampleid=19-li-001 read=344142 ch=276 start_time=2019-02-28T14:18:35Z', dbxrefs=[])

In [13]:
# Read identifier taken from the parsed FASTQ record.
fastq.id

'0003d072-ec5c-436e-9ac7-9b1f2b695e00'

In [14]:
# Check the Python type of the identifier.
type(fastq.id)

str

In [15]:
# Basecalled sequence carried by the FASTQ record.
fastq.seq

Seq('AGTGTACTTCGTTCAGTTGCTTCAATTTAGGTGTTTAACCGTTTTCGCATTTAT...TTC', SingleLetterAlphabet())

### Event after Tombo re-squiggle

In [25]:
# Open the Events entry under the corrected group and confirm that its path exists in the file.
event = []
events_key = strand_path + '/Events'
# Note: despite its name, event_path holds the opened HDF5 object, not a path string.
event_path = f[events_key]
events_key in f

True

In [26]:
# Inspect what kind of h5py object event_path is.
type(event_path)

h5py._hl.dataset.Dataset

In [27]:
# First ten event records (the whole dataset is materialised as a list before slicing).
list(event_path)[:10]

[(-1.13370124, nan, 0, 3, b'A'),
 (-0.91222203, nan, 3, 26, b'A'),
 (2.19862358, nan, 29, 22, b'T'),
 (0.08417984, nan, 51, 9, b'A'),
 (-1.36380601, nan, 60, 51, b'A'),
 (-0.88788829, nan, 111, 29, b'A'),
 (2.17762879, nan, 140, 46, b'T'),
 (0.0334348, nan, 186, 9, b'A'),
 (-1.37378748, nan, 195, 11, b'A'),
 (-0.96083065, nan, 206, 34, b'A')]

In [28]:
# Last event record (the whole dataset is materialised as a list before indexing).
list(event_path)[-1]

(0.96449606, nan, 484271, 3, b'C')

In [29]:
# Print the object type, shape and record dtype of the Events dataset, one per line.
for info in (type(event_path), event_path.shape, event_path.dtype):
    print(info)

<class 'h5py._hl.dataset.Dataset'>
(29750,)
[('norm_mean', '<f8'), ('norm_stdev', '<f8'), ('start', '<u4'), ('length', '<u4'), ('base', 'S1')]


In [31]:
# Field names of the structured Events dtype, i.e. the column header of the event table.
event_path.dtype.names

('norm_mean', 'norm_stdev', 'start', 'length', 'base')

In [32]:
# Rebuild the sequence from the per-event 'base' field.
# `seq` is built from scratch in one join, so the cell works on a fresh kernel
# and gives the same result when re-run (it no longer appends to an old value).
event = list(event_path)
seq = ''.join(row['base'].decode('UTF-8') for row in event)
seq[0:10]

'AATAAATAAA'

In [33]:
# Selecting a single field returns the 'length' value of every event as a NumPy array.
length = event_path['length']
length

array([ 3, 26, 22, ...,  3,  5,  3], dtype=uint32)

In [37]:
# Build (start, length, base) tuples for every event.
# Each start is shifted by the read_start_rel_to_raw attribute stored on the Events
# dataset (presumably converting it to raw-signal coordinates; confirm against Tombo docs).
read_start_rel_to_raw = event_path.attrs['read_start_rel_to_raw']
starts = [x + read_start_rel_to_raw for x in event_path['start']]
# np.int was only an alias of the builtin int and was removed in NumPy 1.24; use int.
lengths = event_path['length'].astype(int)
base = [x.decode("UTF-8") for x in event_path['base']]
# The three columns must line up one-to-one before zipping them together.
assert len(starts) == len(lengths)
assert len(lengths) == len(base)
events = list(zip(starts, lengths, base))
events[-1]

(485990, 3, 'C')

In [38]:
# Pick the first member of the Raw/Reads group.
reads_group = 'Raw/Reads'
raw_reads = f[reads_group]
read = list(raw_reads.values())[0]
print(read)

<HDF5 group "/Raw/Reads/Read_344142" (1 members)>


In [39]:
# np.array(dataset) and dataset[()] both load the full signal; compare them element-wise.
np.equal(np.array(read['Signal']), read['Signal'][()])

array([ True,  True,  True, ...,  True,  True,  True])

In [40]:
# Load the 'Signal' dataset of the selected read into an in-memory NumPy array.
raw_signal = np.array(read['Signal'])
raw_signal

array([505, 517, 516, ..., 595, 590, 601], dtype=int16)

In [53]:
# Re-open the read and summarise its re-squiggle events and its alignment.
# (`re` and `seaborn` are imported in the notebook's top import cell.)
fast5path = 'test/0003d072-ec5c-436e-9ac7-9b1f2b695e00.fast5'
fast5_data = h5py.File(fast5path, mode='r')

# Note: from here on event_path is a path string (earlier cells bound it to the dataset object).
event_path = '/'.join(['/Analyses', corr_group, basecall_subgroup, 'Events'])
# Start from empty values so the names exist even when Events is missing or empty,
# and so re-running the cell never appends to a sequence left over from another cell.
event = []
seq = ''
CGs = []
if event_path in fast5_data:
    # Read from the handle opened in this cell, not from the older handle `f`.
    event = list(fast5_data[event_path])
    seq = ''.join(row['base'].decode('UTF-8') for row in event)
    # Scan for CpG sites once, after the full sequence is built (not once per event).
    CGs = [m.start() for m in re.finditer('CG', seq)]

    print("We have", len(event), "events / bp in the reference genome")
    print(len(CGs), "CpGs detected in the sequence.")

strand_path = '/'.join(('/Analyses', corr_group, basecall_subgroup))
alignment_key = '/'.join([strand_path, 'Alignment'])
if alignment_key in fast5_data:
    # Alignment statistics are stored as attributes on the Alignment entry.
    align_data = fast5_data[alignment_key]
    chrom = align_data.attrs['mapped_chrom']
    strand = align_data.attrs['mapped_strand']
    start = align_data.attrs['mapped_start']
    end = align_data.attrs['mapped_end']
    num_deletions = align_data.attrs['num_deletions']
    num_insertions = align_data.attrs['num_insertions']
    num_matches = align_data.attrs['num_matches']
    num_mismatches = align_data.attrs['num_mismatches']
    BaseCalled_template_status = fast5_data[strand_path].attrs['status']
    # 't' = template strand, 'c' = complementary strand, decided by the path suffix.
    direction = 't' if strand_path.endswith('template') else 'c'

    print("Alignment:")
    print("chromosome:", chrom)
    print("start:", start)
    print("end:", end)
    print("strand:", strand)
    print("num_deletions:", num_deletions)
    print("num_insertions:", num_insertions)
    print("num_matches:", num_matches)
    print("num_mismatches:", num_mismatches)
    print("sequence_direction:", direction)
    print("Basecall template:", BaseCalled_template_status)

KeyboardInterrupt: 

In [None]:
          
# Collect the first field (the shifted start) of the event at each CpG position
# and plot the kernel density estimate of those values.
data = [events[cg_index][0] for cg_index in CGs]
# NOTE(review): `shade` was deprecated in favour of `fill` in seaborn 0.11 - switch once
# the seaborn version used by this notebook is confirmed to be >= 0.11.
p3 = sns.kdeplot(data, shade=True, color="b")

In [142]:
# Look up the Signal dataset of the first read by building its full path.
# read_group is defined here so the cell does not depend on a cell further down.
read_group = 'Raw/Reads'
readID = list(fast5_data[read_group])[0]
rawSignal = fast5_data['/'.join([read_group, readID, 'Signal'])]
rawSignal

<HDF5 dataset "Signal": shape (486005,), type "<i2">

In [161]:
# Pull the 'median_before' attribute and the Signal dataset from the first raw read.
read_group = 'Raw/Reads'
if read_group in fast5_data:
    raw_reads = fast5_data[read_group]
    # Take the first read in the group.
    raw_read = next(iter(raw_reads.values()))
    # .get() yields None when the attribute is absent.
    median_before = raw_read.attrs.get('median_before', None)
    signal = raw_read['Signal']
    print(median_before, signal)

203.82383728027344 <HDF5 dataset "Signal": shape (486005,), type "<i2">


In [157]:
# Read the read_start_rel_to_raw attribute stored on the Events dataset.
events_dataset = fast5_data[strand_path + '/Events']
read_start_rel_to_raw = events_dataset.attrs['read_start_rel_to_raw']
read_start_rel_to_raw

1719