In [1]:
import numpy as np
import pod5 as p5

In [9]:
def extract_signal_from_pod5(pod5_path):
    signals = {}
    with p5.Reader(pod5_path) as reader:
        for read_record in reader.reads():
            # print(read_record.read_id)
            # print(read_record.calibration.offset)#shift_dacs_to_pa 用于将原始信号转化为pA（皮安）
            # print(read_record.calibration.scale)#scale_dacs_to_pa
            # signals.update({read_record.read_id:read_record.signal})
            signals[str(read_record.read_id)] = {
                "signal": read_record.signal,
                "shift": read_record.calibration.offset,
                "scale": read_record.calibration.scale,
            }  # 不加str会变成UUID，很奇怪
    return signals

In [6]:
# remote版
pod5_path = "/homeb/xiaoyf/data/HG002/example/pod5/output.pod5"
signal = extract_signal_from_pod5(pod5_path)

71d1a7db-4491-479c-9fe8-7e21806eda35
-249.0
0.1462070643901825
71d6edf9-21e2-4286-b136-77b400817078
-257.0
0.1462070643901825
71e028f7-aa1a-4a30-b6f0-890ce4b2003c
-262.0
0.1462070643901825
71f5c444-4a01-4ecb-97c0-29dfd477bd12
-240.0
0.1462070643901825
7203e3d2-7560-49ae-aaa5-b225d4255112
-259.0
0.1462070643901825
72111336-5394-44b5-800b-3cc75b065b39
-236.0
0.1462070643901825
72175521-bd89-47f1-8d9b-2bb32c2c37d9
-221.0
0.1462070643901825
721e6073-9e64-4e0e-b3a5-8b7f7a796cb2
-244.0
0.1462070643901825
723bdbfb-a5da-46ef-b20a-3b0e15710f30
-224.0
0.1462070643901825
7257c8a5-1ef9-43ee-ba58-93cf02c0dd2b
-247.0
0.1462070643901825
7260220e-c612-4e5d-b7ba-5bcffab3b983
-253.0
0.1462070643901825
72797799-7a6d-48fd-a3a1-64ab3b55a643
-247.0
0.1462070643901825
72bd63ef-4b01-4fb6-a075-1736da517f97
-250.0
0.1462070643901825
72cb51b6-6cc4-439d-8f50-73a3d88df44e
-234.0
0.1462070643901825
72cc8352-0183-47af-83c9-6bd438f14aa2
-246.0
0.1462070643901825
72e277cf-6821-4df5-9a1b-b5c1b58c3c28
-262.0
0.146207064

In [18]:
signal["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]

{'signal': array([1115.61789042, 1215.61789042, 1259.61789042, ..., 1106.61789042,
        1123.61789042, 1310.61789042]),
 'shift': -242.0,
 'scale': 0.1462070643901825}

In [19]:
def convert_signal_pA(signal):
    for read_id in signal.keys():
        assert signal[read_id]["shift"] is not None
        assert signal[read_id]["scale"] is not None
        signal[read_id]["signal"] = (
            signal[read_id]["signal"] + signal[read_id]["shift"]
        ) * signal[read_id]["scale"]
    return signal

In [61]:
def norm_signal(signal, shift_scale):
    shift_scale_norm = {}
    for read_id in signal.keys():
        shift_scale_norm[read_id] = {}
        shift_scale_norm[read_id]["shift"] = (
            shift_scale[read_id]["shift"] / signal[read_id]["scale"]
        ) - signal[read_id]["shift"]
        shift_scale_norm[read_id]["scale"] = (
            shift_scale[read_id]["scale"] / signal[read_id]["scale"]
        )
        stride = shift_scale[read_id]["stride"]
        signal[read_id]["signal"] = (
            signal[read_id]["signal"][stride:] - shift_scale_norm[read_id]["shift"]
        ) / shift_scale_norm[read_id]["scale"]
    return signal

In [56]:
import pysam


def extract_shift_scale_from_bam(bam_path):
    read_dict = dict()
    bamfile = pysam.AlignmentFile(bam_path, "rb", check_sq=False)
    try:
        for read in bamfile.fetch():
            tags = dict(read.tags)
            ts_tag = tags["ts"]
            sm_tag = tags["sm"]
            sd_tag = tags["sd"]
            read_dict.update(
                {read.query_name: {"shift": sm_tag, "scale": sd_tag, "stride": ts_tag}}
            )
            # print(read.query_sequence)
    except ValueError:
        # print('没有构建索引，现在开始构建！')
        # bamfile_index = pysam.IndexedReads(bamfile)
        # bamfile_index.build()
        for read in bamfile.fetch(until_eof=True):
            tags = dict(read.tags)
            ts_tag = tags["ts"]
            sm_tag = tags["sm"]
            sd_tag = tags["sd"]
            read_dict[read.query_name] = {
                "shift": sm_tag,
                "scale": sd_tag,
                "stride": ts_tag,
            }
    return read_dict

In [58]:
# remote版
bam_path = "/homeb/xiaoyf/data/HG002/example/bam/has_moves.bam"
shift_scale = extract_shift_scale_from_bam(bam_path)

[E::idx_find_and_load] Could not retrieve index file for '/homeb/xiaoyf/data/HG002/example/bam/has_moves.bam'


In [59]:
shift_scale["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]

{'shift': 105.02491760253906, 'scale': 29.21363067626953, 'stride': 10}

In [62]:
n_signal = norm_signal(signal, shift_scale)
print(np.mean(n_signal["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]["signal"]))
print(np.var(n_signal["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]["signal"]))

-4.830391195153174
2.587317153717503e-19


In [11]:
import pysam


def extract_move_from_bam(bam_path):
    read_dict = dict()
    bamfile = pysam.AlignmentFile(bam_path, "rb", check_sq=False)
    try:
        for read in bamfile.fetch():
            tags = dict(read.tags)
            mv_tag = tags["mv"]
            ts_tag = tags["ts"]
            sm_tag = tags["sm"]
            sd_tag = tags["sd"]
            read_dict.update(
                {
                    read.query_name: {
                        "sequence": read.query_sequence,
                        "stride": mv_tag[0],
                        "mv_table": np.array(mv_tag[1:]),
                        "num_trimmed": ts_tag,
                        "shift": sm_tag,
                        "scale": sd_tag,
                    }
                }
            )
            # print(read.query_sequence)
    except ValueError:
        # print('没有构建索引，现在开始构建！')
        # bamfile_index = pysam.IndexedReads(bamfile)
        # bamfile_index.build()
        for read in bamfile.fetch(until_eof=True):
            tags = dict(read.tags)
            mv_tag = tags["mv"]
            ts_tag = tags["ts"]
            sm_tag = tags["sm"]
            sd_tag = tags["sd"]
            read_dict[read.query_name] = {
                "sequence": read.query_sequence,
                "stride": mv_tag[0],
                "mv_table": np.array(mv_tag[1:]),
                "num_trimmed": ts_tag,
                "shift": sm_tag,
                "scale": sd_tag,
            }
    return read_dict

In [4]:
def read_from_pod5_bam(pod5_path, bam_path, read_id=None):
    read = {}
    signal = extract_signal_from_pod5(pod5_path)
    seq_move = extract_move_from_bam(bam_path)
    if read_id is not None:
        if seq_move[read_id]["sequence"] is not None:
            if signal[read_id] is not None:
                read[read_id] = {
                    "sequence": seq_move[read_id]["sequence"],
                    "signal": signal[read_id]["signal"],
                    "mv_table": seq_move[read_id]["mv_table"],
                    "num_trimmed": seq_move[read_id]["num_trimmed"],
                    "to_norm_shift": seq_move[read_id]["shift"],
                    "to_norm_scale": seq_move[read_id]["scale"],
                    "stride": seq_move[read_id]["stride"],
                    "to_pA_shift": signal[read_id]["shift"],
                    "to_pA_scale": signal[read_id]["scale"],
                }
    else:
        for read_id in seq_move.keys():
            if seq_move[read_id]["sequence"] is not None:
                if signal[read_id] is not None:
                    read[read_id] = {
                        "sequence": seq_move[read_id]["sequence"],
                        "signal": signal[read_id]["signal"],
                        "mv_table": seq_move[read_id]["mv_table"],
                        "num_trimmed": seq_move[read_id]["num_trimmed"],
                        "to_norm_shift": seq_move[read_id]["shift"],
                        "to_norm_scale": seq_move[read_id]["scale"],
                        "stride": seq_move[read_id]["stride"],
                        "to_pA_shift": signal[read_id]["shift"],
                        "to_pA_scale": signal[read_id]["scale"],
                    }
    return read

In [5]:
def norm_signal_read_id(signal):
    shift_scale_norm = {}
    signal_norm = {}
    shift_scale_norm = {}
    shift_scale_norm["shift"] = (
        signal["to_norm_shift"] / signal["to_pA_scale"]
    ) - signal["to_pA_shift"]
    shift_scale_norm["scale"] = signal["to_norm_scale"] / signal["to_pA_scale"]
    num_trimmed = signal["num_trimmed"]
    signal_norm = (
        signal["signal"][num_trimmed:] - shift_scale_norm["shift"]
    ) / shift_scale_norm["scale"]

    return signal_norm

In [6]:
def caculate_feature_for_each_base(read, base_num=0):
    feature = {}
    for read_id in read.keys():
        feature[read_id] = []
        sequence = read[read_id]["sequence"]
        movetable = read[read_id]["mv_table"]
        stride = read[read_id]["stride"]
        # num_trimmed = read[read_id]['num_trimmed']
        trimed_signals = norm_signal_read_id(read[read_id])  # 筛掉背景信号,norm
        move_pos = np.append(np.argwhere(movetable == 1).flatten(), len(movetable))
        # print(len(move_pos))
        for move_idx in range(len(move_pos) - 1):
            start, end = move_pos[move_idx], move_pos[move_idx + 1]
            signal = trimed_signals[(start * stride) : (end * stride)].tolist()
            mean = np.mean(signal)
            std = np.std(signal)
            num = end - start
            feature[read_id].append(
                {"signal": signal, "std": std, "mean": mean, "num": num}
            )
    return feature

In [12]:
pod5_path = "/homeb/xiaoyf/data/HG002/example/pod5/output.pod5"
bam_path = "/homeb/xiaoyf/data/HG002/example/bam/has_moves.bam"
read = read_from_pod5_bam(pod5_path, bam_path)

[E::idx_find_and_load] Could not retrieve index file for '/homeb/xiaoyf/data/HG002/example/bam/has_moves.bam'


In [80]:
read["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]["stride"]
read["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]["signal"]

{'signal': array([ 909, 1009, 1053, ...,  900,  917, 1104], dtype=int16),
 'shift': -242.0,
 'scale': 0.1462070643901825}

In [13]:
feature = caculate_feature_for_each_base(read)

In [16]:
feature["0000b1ad-fdaf-49e6-bc11-cbe93270e3a3"]

[{'signal': [0.15850069241339143,
   -0.0817275479069563,
   -0.36699858328736923,
   -0.32696054323397794,
   -0.28191774817391274],
  'std': 0.19554699269319398,
  'mean': -0.17982074603776493,
  'num': 1}]

一组信号5个值

In [38]:
sig = []
for read_id in n_signal.keys():
    if len(sig) == 0:
        sig = n_signal[read_id]["signal"]
    for i in range(len(sig)):
        sig[i] = sig[i] + n_signal[read_id]["signal"][i]
print(np.mean(sig))
print(np.var(sig))

IndexError: index 13225 is out of bounds for axis 0 with size 13225

In [31]:
[1, 2][1] + [2, 3][0]

4