# CliqueSNV validation protocols

In [None]:
from pathlib import Path
import os
import subprocess
import pandas as pd
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import AlignInfo
import re
import json

In [None]:
# Haplotyping tools and configs:
# absolute paths to the tool binaries invoked by the shell cells of this notebook.
# CliqueSNV is a jar (run through `java -jar`); the other two are executed directly.
CliqueSNV_fn = "/home/code/cliquesnv/2.0.2/clique-snv.jar"
aBayesQR_fn = "/home/code/abayesqr/aBayesQR/aBayesQR"
PredictHaplo_fn = "/home/code/PredictHaplo-Paired-0.4/PredictHaplo-Paired"

In [None]:
# Other constants:
# directories handed to ReadGenerator.py through its -s and -p options.
# NOTE(review): samtools_dn points at a SimSeq directory — confirm name/path.
samtools_dn = "/home/code/simseq/SimSeq"
picard_dn = "/home/code/picard/picard-tools-1.119"

# Root of the working tree: result/config paths are built from it and the
# tool-running cells chdir back to it after each run.
base_dn = "/alina-data0/sergey/CliqueSNV"

# Reference genomes (relative paths) that the reads are aligned to with bwa.
HXB2_pol_ref_fn = "refs/HXB2_pol_ref/ref.fas"
HXB2_fl_ref_fn = "refs/HXB2_fl/HXB2_fl.fas"
IAV_ref_fn = "refs/IAV_ref/ref.fa"
HCV_ref_fn = "refs/HCV_ref/NC_004102_ref.fasta"
ZIKA_ref_fn = "refs/ZIKA_ref/NC_012532_ref.fasta"

# Helper scripts: read subsampling and simulated read generation.
subsample_fn = "scripts/Subsampler.py"
sim_read_generator_fn = "scripts/ReadGenerator.py"

# 1) Data preparation

In [None]:
# Dataset names, grouped by origin.
experimental_datasets = ["HIV9exp", "HIV2exp"]
reduced_experimental_datasets = ["{}_50k_reads".format(x) for x in experimental_datasets]
labmix_dataset = ["HIV5exp", "HIV5exp_fl"]
simulated_datasets = ["HIV7sim", "IAV10sim", "HCV10sim", "ZIKA3sim"]
fragment_datasets = ["HIV5exp2k","HIV5exp5k","HCV10sim1k","HCV10sim2k","HCV10sim5k","ZIKA3sim1k","ZIKA3sim2k","ZIKA3sim5k"]

# Paired-end read files (mate 1 / mate 2) for each group of datasets.
er1 = ["reads/{}_R1.fastq.gz".format(x) for x in experimental_datasets]
er2 = ["reads/{}_R2.fastq.gz".format(x) for x in experimental_datasets]
rer1 = ["reads/{}_R1.fastq.gz".format(x) for x in reduced_experimental_datasets]
rer2 = ["reads/{}_R2.fastq.gz".format(x) for x in reduced_experimental_datasets]
labmix1 = ["reads/SRR961514_1.fastq"]
# Mate 2 of the lab-mix run; previously this repeated the "_1" file, so
# `bwa mem` was given the same mate twice instead of a read pair.
labmix2 = ["reads/SRR961514_2.fastq"]

# Haplotype FASTA files the simulated reads are generated from, in the same
# order as simulated_datasets.
sim_hapl = ["relevant_haplotypes/HIV7sim.fasta", "relevant_haplotypes/IAV10sim.fasta", "sim_haplotypes/HCV10sim.fasta", "sim_haplotypes/ZIKA3sim.fasta"]
rs1 = ["reads/{}_R1.fastq.gz".format(x) for x in simulated_datasets]
rs2 = ["reads/{}_R2.fastq.gz".format(x) for x in simulated_datasets]

# Alignment outputs, one SAM per dataset.
exp_sams = ["alignment/{}.sam".format(x) for x in experimental_datasets]
reduced_exp_sams = ["alignment/{}.sam".format(x) for x in reduced_experimental_datasets]
labmix_sam = ["alignment/{}.sam".format(x) for x in labmix_dataset]
simulated_sams = ["alignment/{}.sam".format(x) for x in simulated_datasets]

## Prepare simulated HIV7sim, IAV10sim, HCV10sim and ZIKA3sim datasets

In [None]:
%%capture
for i in range(len(rs1)):
    op = Path("reads/tmp")
    op.mkdir(parents=True, exist_ok=True)
    od = str(op)
    iref = sim_hapl[i]
    !python3 $sim_read_generator_fn -c 50000 -s $samtools_dn -p $picard_dn -i $iref -o $od
    
    i1 = Path(od, "sim_reads.1.fastq")
    i2 = Path(od, "sim_reads.2.fastq")
    o1 = rs1[i]
    o2 = rs2[i]
    
    !gzip -c $i1 > $o1
    !gzip -c $i2 > $o2
    !rm -rf $od

## Reduce HIV9exp and HIV2exp so that each dataset consists of just 50k reads

In [None]:
for i in range(len(er1)):
    i1 = er1[i]
    i2 = er2[i]
    ir1 = rer1[i]
    ir2 = rer2[i]
    !python3 $subsample_fn --n-samples 50000 --fastq1 $i1 --fastq2 $i2 --fastq1_out $ir1 --fastq2_out $ir2

## Aligning reads

In [None]:
%%capture
for r in [HXB2_pol_ref_fn, HXB2_fl_ref_fn, IAV_ref_fn, HCV_ref_fn, ZIKA_ref_fn]:
    !bwa index $r
all_r1 = er1 + rer1 + labmix1*2 + rs1
all_r2 = er2 + rer2 + labmix2*2 + rs2
all_sams = exp_sams + reduced_exp_sams + labmix_sam + simulated_sams
refs = [HXB2_pol_ref_fn] * 5 + [HXB2_fl_ref_fn, HXB2_pol_ref_fn, IAV_ref_fn, HCV_ref_fn, ZIKA_ref_fn]

for i in range(len(all_r1)):
    i1 = all_r1[i]
    i2 = all_r2[i]
    o = all_sams[i]
    r = refs[i]
    !bwa mem -B 2 $r $i1 $i2 > $o

# 2) Protocol of Sensitivity and Specificity analysis for CliqueSNV

## Running CliqueSNV with different sensitivity thresholds

In [None]:
%%capture
# Run CliqueSNV (snv-illumina mode) on every dataset at several -tf
# thresholds; each (dataset, threshold) pair gets its own output directory.
datasets = experimental_datasets \
    + reduced_experimental_datasets \
    + labmix_dataset \
    + simulated_datasets \
    + fragment_datasets

# Fragment datasets: (start, end) positions passed to CliqueSNV as -sp / -ep.
fragments = {"HIV5exp2k":(1672,3671),
             "HIV5exp5k":(589,5588),
             "HCV10sim1k":(3800,4799),
             "HCV10sim2k":(3300,5299),
             "HCV10sim5k":(1800,6799),
             "ZIKA3sim1k":(4299,5298),
             "ZIKA3sim2k":(3799,5798),
             "ZIKA3sim5k":(2299,7298)}

for d in datasets:
    for x in [0.1, 0.05, 0.02, 0.01]:
        # x*100 is a float, so directories are named e.g. "<dataset>_2.0p_CliqueSNV".
        out_dir = str(Path(base_dn, "results", "{}_{}p_CliqueSNV".format(d, x*100)))
        if d in fragments:
            b,e = fragments[d]
            # d[:-2] strips the two-character length suffix ("1k", "2k", "5k"),
            # so a fragment reuses the alignment of its parent dataset.
            # NOTE(review): for HIV5exp2k/HIV5exp5k this resolves to
            # alignment/HIV5exp.sam, which the alignment cell maps to the pol
            # reference — confirm this is intended rather than HIV5exp_fl.sam.
            in_f = str(Path(base_dn, "alignment", "{}.sam".format(d[:-2])))
            !java -Xmx100g -jar $CliqueSNV_fn -m snv-illumina -tf $x -outDir $out_dir -in $in_f -sp $b -ep $e
        else:
            in_f = str(Path(base_dn, "alignment", "{}.sam".format(d)))
            !java -Xmx100g -jar $CliqueSNV_fn -m snv-illumina -tf $x -outDir $out_dir -in $in_f

# 3) Protocol of comparison of CliqueSNV with Consensus, PredictHaplo, aBayesQR, and 2SNV

### PredictHaplo
PredictHaplo failed on HIV7sim and HIV9exp.
HIV9exp region reduced from 1:1074 to 1:1065.
HIV7sim region reduced from 1:1074 to 25:1050.

In [None]:
%%capture
# Run PredictHaplo once per dataset, using the per-dataset config file under
# tool_configs/ and a dedicated results directory.
datasets = experimental_datasets \
    + reduced_experimental_datasets \
    + labmix_dataset \
    + simulated_datasets \
    + fragment_datasets

for d in datasets:
    config_fn = str(Path(base_dn,"tool_configs/{}_PredictHaplo.config".format(d)))
    out_dir = str(Path(base_dn,"results/{}_PredictHaplo".format(d)))
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    # The tool is launched from inside out_dir; return to base_dn afterwards
    # so the relative paths used by other cells keep working.
    os.chdir(out_dir)
    !$PredictHaplo_fn $config_fn
    os.chdir(base_dn)

### aBayesQR
aBayesQR did not finish on the full HIV2exp and HIV9exp datasets, so it is run on their 50k-read subsamples instead.

In [None]:
%%capture
# Run aBayesQR on the reduced experimental datasets, the first lab-mix
# dataset (HIV5exp) and the first two simulated datasets (HIV7sim, IAV10sim).
datasets = reduced_experimental_datasets \
    + labmix_dataset[0:1] \
    + simulated_datasets[0:2]
for d in datasets:
    config_fn = str(Path(base_dn,"tool_configs/{}_aBayesQR.config".format(d)))
    out_dir = str(Path(base_dn,"results/{}_aBayesQR".format(d)))
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    # The tool is launched from inside out_dir; return to base_dn afterwards.
    os.chdir(out_dir)
    !$aBayesQR_fn $config_fn
    os.chdir(base_dn)

### Consensus

In [None]:
%%capture
# Build a consensus sequence for every non-fragment dataset using CliqueSNV's
# consensus-illumina mode.
datasets = experimental_datasets \
    + reduced_experimental_datasets \
    + labmix_dataset \
    + simulated_datasets
for d in datasets:
    out_dir = str(Path(base_dn, "results/{}_consensus".format(d)))
    in_f = str(Path(base_dn,"alignment/{}.sam".format(d)))
    !java -jar $CliqueSNV_fn -m consensus-illumina -in $in_f -outDir $out_dir

# Create standard fasta for tools' results

In [None]:
# Per-dataset (begin, end) slice bounds applied to every tool's haplotypes.
regions = {
    "HIV9exp": (0, 1065),
    "HIV2exp": (4, 1074),
    "HIV5exp": (0, 1074),
    "HIV5exp_fl": (0, 9276),
    "HIV7sim": (24, 1050),
    "IAV10sim": (0, 2263),
    "HCV10sim": (0, 8992),
    "ZIKA3sim": (0, 9930),
    "HCV10sim2k": (0, 2000),
    "HCV10sim5k": (0, 5000),
    "ZIKA3sim2k": (0, 2000),
    "ZIKA3sim5k": (0, 5000),
}

# One row per dataset, one row per tool.
region_rows = [(name, begin, end) for name, (begin, end) in regions.items()]
datasets = pd.DataFrame(region_rows, columns=["dataset", "begin_pos", "end_pos"])
tools = pd.DataFrame({"tool": ["aBayesQR", "PredictHaplo", "CliqueSNV", "consensus"]})

# Cross product of datasets and tools, obtained by merging on a constant key.
datasets["key"] = 0
tools["key"] = 0
column_names = ["dataset", "tool", "begin_pos", "end_pos"]
results = (
    pd.merge(datasets, tools, on="key")
    .drop(columns=["key"])
    .reindex(columns=column_names)
)

In [None]:
# For every (dataset, tool) row derive the tool's raw output directory and the
# standardized FASTA it will be converted to.
tool_results_paths = list()
results_paths = list()
for dataset_name, tool_name in zip(results["dataset"], results["tool"]):
    if tool_name == "aBayesQR" and dataset_name in ("HIV9exp", "HIV2exp"):
        # aBayesQR results for these datasets come from the 50k-read subsample.
        parts = [dataset_name, "50k_reads", tool_name]
    elif tool_name == "CliqueSNV":
        # Use the CliqueSNV run with the 2% threshold.
        parts = [dataset_name, "2.0p", tool_name]
    else:
        parts = [dataset_name, tool_name]
    tool_results_paths.append("results/" + "_".join(parts))
    results_paths.append("results/" + "_".join([dataset_name, tool_name]) + '.fasta')
results["tool_out_path"] = tool_results_paths
results["output_fasta"] = results_paths

Scripts for converting tools' output to standard form

In [None]:
def standard_fasta_aBayesQR(indir, outf, begin_pos, end_pos):
    """Convert aBayesQR output found in *indir* into a standard FASTA file.

    Haplotypes are read one per line from ``test_Seq.txt`` and their
    frequencies from the whitespace-separated first line of
    ``test_Freq.txt``. Each haplotype is trimmed to ``[begin_pos:end_pos]``
    and written to *outf* with the id ``<index>_<frequency>``.

    The conversion is best-effort: if the tool output is missing or
    incomplete, or the FASTA cannot be written, a short message is printed
    and ``None`` is returned. Unlike a bare ``except``, interrupts and
    programming errors are no longer swallowed.
    """
    try:
        with open(indir + "/test_Seq.txt") as f:
            haplotypes = [line.strip()[begin_pos:end_pos] for line in f]
        with open(indir + "/test_Freq.txt") as f:
            # An empty frequency file raises StopIteration here.
            freqs = next(f).strip().split()
        records = []
        for i, haplotype in enumerate(haplotypes):
            # Fewer frequencies than haplotypes raises IndexError here.
            name = "{}_{}".format(i, freqs[i])
            records.append(SeqRecord(Seq(haplotype), id=name, description=name))
        SeqIO.write(records, outf, "fasta")
    except (OSError, StopIteration, IndexError) as err:
        print("standard_fasta_aBayesQR: skipping {}: {!r}".format(indir, err))
        return

def standard_fasta_PredictHaplo(indir, outf, begin_pos, end_pos):
    """Convert PredictHaplo output found in *indir* into a standard FASTA file.

    Among the ``ph_global_<begin>_<end>.fas`` files in *indir*, the one
    covering the widest range is used. Its reconstructed haplotypes are cut
    to the part of ``[begin_pos:end_pos]`` that the file covers and written
    to *outf* with ids of the form ``<index>_<frequency>``.

    Raises:
        FileNotFoundError: if *indir* holds no usable ``ph_global`` file.
        IndexError: if a reconstructed record lacks the expected
            ``;Freq:`` / ``;EndOfComments`` header fields.
    """
    # Match on the basename only, so dots or underscores elsewhere in the
    # path cannot corrupt the parsed range.
    name_pattern = re.compile(r"ph_global_(\d+)_(\d+)\.fas")
    seq_beg_pos = 0
    seq_end_pos = 0
    best_range = 0
    for path in glob.glob(indir + "/*"):
        match = name_pattern.search(os.path.basename(path))
        if match is None:
            continue
        b, e = int(match.group(1)), int(match.group(2))
        if e - b > best_range:
            best_range = e - b
            seq_beg_pos = b - 1  # file names are 1-based; slicing is 0-based
            seq_end_pos = e
    if best_range == 0:
        raise FileNotFoundError(
            "no usable ph_global_<begin>_<end>.fas file in {}".format(indir))

    # Translate the requested region into coordinates of the chosen file.
    new_beg_pos = max(0, begin_pos - seq_beg_pos)
    new_end_pos = min(seq_end_pos - seq_beg_pos, end_pos - seq_beg_pos)
    fn = indir + "/ph_global_{}_{}.fas".format(str(seq_beg_pos + 1), str(seq_end_pos))
    with open(fn) as f:
        # Drop the newlines so each record becomes one header+sequence string.
        r = f.read().replace("\n", "")

    # Each chunk after the first split starts right after ">reconstructed_";
    # pull out the frequency and everything following the comment block.
    hs = [
        re.findall(r".*;Freq:(\d+\.\d+).*;EndOfComments([^>]*)", chunk)[0]
        for chunk in r.split(">reconstructed_")[1:]
    ]
    seqs = list()
    for i, (freq, sequence) in enumerate(hs):
        n = "{}_{}".format(i, freq)
        seqs.append(SeqRecord(Seq(sequence[new_beg_pos:new_end_pos]), id=n, description=n))
    SeqIO.write(seqs, outf, "fasta")

def standard_fasta_CliqueSNV(indir, outf, begin_pos, end_pos):
    """Trim the haplotypes of a CliqueSNV run and write them to *outf*.

    The first ``*.fasta`` file returned by glob for *indir* is read, every
    record's sequence is cut to ``[begin_pos:end_pos]`` and the records are
    written to *outf* with their ids unchanged. Raises ``IndexError`` when
    *indir* contains no FASTA file.
    """
    fasta_path = glob.glob(indir + "/*.fasta")[0]
    records = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        record.seq = Seq(str(record.seq)[begin_pos:end_pos])
        records.append(record)
    SeqIO.write(records, outf, "fasta")

def standard_fasta_consensus(indir, outf, begin_pos, end_pos):
    """Trim the consensus sequence of a run and write it to *outf*.

    The first ``*.fasta`` file returned by glob for *indir* is read; its
    first record gets the id ``0_fr_1.0`` and is cut to
    ``[begin_pos:end_pos]`` before all records are written to *outf*.

    The conversion is best-effort: if no FASTA file (or no record) is found,
    or a file cannot be read or written, a short message is printed and
    ``None`` is returned. Unlike a bare ``except``, interrupts and
    programming errors are no longer swallowed.
    """
    try:
        # IndexError: no FASTA file in indir, or the file has no records.
        fi = glob.glob(indir + "/*.fasta")[0]
        seqs = list(SeqIO.parse(fi, "fasta"))
        seqs[0].id = "0_fr_1.0"
        seqs[0].seq = Seq(str(seqs[0].seq)[begin_pos:end_pos])
        SeqIO.write(seqs, outf, "fasta")
    except (OSError, IndexError) as err:
        print("standard_fasta_consensus: skipping {}: {!r}".format(indir, err))
        return

In [None]:
# Convert every tool's raw output to the standard FASTA layout, dispatching
# on the tool name through an explicit lookup table.
converters = {
    "aBayesQR": standard_fasta_aBayesQR,
    "PredictHaplo": standard_fasta_PredictHaplo,
    "CliqueSNV": standard_fasta_CliqueSNV,
    "consensus": standard_fasta_consensus,
}
for _, row in results.iterrows():
    convert = converters[row["tool"]]
    convert(row["tool_out_path"], row["output_fasta"], row["begin_pos"], row["end_pos"])

## Creating fasta with relevant haplotypes

In [None]:
# Trim each dataset's true haplotypes to the widest region used for that
# dataset and store them next to the originals as "<dataset>_trm.fasta".
for d in results.dataset.unique():
    rows_for_dataset = results[results.dataset == d]
    begin_pos = rows_for_dataset.begin_pos.min()
    end_pos = rows_for_dataset.end_pos.max()
    trimmed = []
    for record in SeqIO.parse("relevant_haplotypes/"+d+".fasta", "fasta"):
        record.seq = record[begin_pos:end_pos].seq
        trimmed.append(record)
    SeqIO.write(trimmed, "relevant_haplotypes/"+d+"_trm"+".fasta", 'fasta')

In [None]:
# Path of the trimmed true-haplotype FASTA belonging to each results row.
relevant_haplotypes = [
    "relevant_haplotypes/{}_trm.fasta".format(dataset_name)
    for dataset_name in results["dataset"]
]
results["relevant_haplotypes_fasta"] = relevant_haplotypes

## Collect statistics

In [None]:
# Score every standardized prediction against its trimmed true haplotypes.
# Results are keyed by "<dataset>_<tool>".
stat = dict()
for i,row in results.iterrows():
    n = "_".join([row["dataset"], row["tool"]])
    p = row["output_fasta"]
    r = row["relevant_haplotypes_fasta"]
    # `a` holds the script's captured output lines; the first line is parsed as JSON.
    a = !python scripts/analyze_prediction.py $p $r
    stat[n] = json.loads(a[0])

In [None]:
# Column names shown in the table, and the JSON field each one is read from.
stat_keys = ["EMD","TP","FP","Et->p","Et<-p"]
stat_keys_dict = {"EMD":"EMD","TP":"TP","FP":"FP","Et->p":"APE","Et<-p":"ADC"}

# One table row per (dataset, tool) prediction.
results_df = pd.DataFrame(columns=["prediction"] + stat_keys)
for i, row in results.iterrows():
    p = "{}_{}".format(row["dataset"], row["tool"])
    results_df.loc[i] = [p] + [stat[p][stat_keys_dict[s]] for s in stat_keys]

In [None]:
results_df

In [None]:
# Score the CliqueSNV predictions made at each frequency threshold.
datasets = experimental_datasets \
    + reduced_experimental_datasets \
    + labmix_dataset \
    + simulated_datasets
cliquesnv_stat = dict()
for d in datasets:
    for x in [0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001]:
        pf = str(Path(base_dn, "results", "{}_{}p_CliqueSNV/{}.fasta".format(d, x*100, d)))
        # Thresholds that were never run are skipped silently.
        if not os.path.exists(pf):
            continue
        # The reference file is looked up by the dataset name up to its first "_"
        # (e.g. "HIV9exp_50k_reads" -> "HIV9exp").
        # NOTE(review): "HIV5exp_fl" therefore maps to relevant_haplotypes/HIV5exp.fasta — confirm.
        rf = "relevant_haplotypes/{}.fasta".format(d.split("_")[0])
        # `a` holds the script's captured output lines; the first line is parsed as JSON.
        a = !python scripts/analyze_prediction.py $pf $rf
        p = "_".join([d, str(x*100)])
        cliquesnv_stat[p] = json.loads(a[0])

In [None]:
# Tabulate the per-threshold CliqueSNV statistics, keeping the dataset and
# threshold order and skipping combinations that were never scored.
cliquesnv_results_df = pd.DataFrame(columns=["prediction"] + stat_keys)
candidate_keys = [
    "_".join([d, str(x*100)])
    for d in datasets
    for x in [0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001]
]
scored_keys = [p for p in candidate_keys if p in cliquesnv_stat]
for i, p in enumerate(scored_keys):
    a = cliquesnv_stat[p]
    cliquesnv_results_df.loc[i] = [p] + [a[stat_keys_dict[s]] for s in stat_keys]

In [None]:
cliquesnv_results_df

In [None]:
# Score the CliqueSNV runs stored as "<dataset>_2p_<x>t_CliqueSNV" for a range
# of x values, then tabulate them.
datasets = experimental_datasets \
    + reduced_experimental_datasets \
    + labmix_dataset \
    + simulated_datasets
cliquesnv_t_stat = dict()
for d in datasets:
    for x in [10, 15, 20, 30, 50, 70, 100, 150, 200, 300, 500]:
        pf = str(Path(base_dn, "results", "{}_2p_{}t_CliqueSNV/{}.fasta".format(d, x, d)))
        # Combinations that were never run are skipped silently.
        if not os.path.exists(pf):
            continue
        # Reference file is looked up by the dataset name up to its first "_".
        rf = "relevant_haplotypes/{}.fasta".format(d.split("_")[0])
        # `a` holds the script's captured output lines; the first line is parsed as JSON.
        a = !python scripts/analyze_prediction.py $pf $rf
        p = "_".join([d, str(x)])
        cliquesnv_t_stat[p] = json.loads(a[0])

# One table row per scored (dataset, x) combination, numbered consecutively.
cliquesnv_t_results_df = pd.DataFrame(columns=["prediction"] + stat_keys)
i=0
for d in datasets:
    for x in [10, 15, 20, 30, 50, 70, 100, 150, 200, 300, 500]:
        p = "_".join([d, str(x)])
        if p not in cliquesnv_t_stat:
            continue
        stat_row = list()
        stat_row.append(p)
        a = cliquesnv_t_stat[p]
        for s in stat_keys:
            stat_row.append(a[stat_keys_dict[s]])
        cliquesnv_t_results_df.loc[i] = stat_row
        i+=1

In [None]:
pd.set_option('display.max_rows', cliquesnv_t_results_df.shape[0]+1)
print(cliquesnv_t_results_df)

### Export statistics as csv files

In [None]:
# Directory that receives all exported CSV files; created if missing.
stat_dir = Path("prediction_stats")
stat_dir.mkdir(parents=True, exist_ok=True)

#### Export csv for TP and FP plot (figure 2)

In [None]:
# Export one "<dataset>_ECT.csv" per dataset, holding every ECT value
# together with the method that produced it.
ect_rows = dict()
for s in stat:
    # Split on the LAST underscore only: dataset names may contain "_"
    # (e.g. "HIV5exp_fl"), tool names do not.
    d, t = s.rsplit("_", 1)
    if t == "consensus":
        t = "Consensus"
    # setdefault also registers datasets whose ECT list is empty, so they
    # still get a (header-only) CSV.
    ect_rows.setdefault(d, []).extend({"ECT": e, "Method": t} for e in stat[s]["ECT"])

# Build each frame in one go; DataFrame.append copied the frame on every
# call and no longer exists in pandas 2.x.
ect_dfs = {d: pd.DataFrame(rows, columns=["ECT","Method"]) for d, rows in ect_rows.items()}
for d, ect_df in ect_dfs.items():
    ect_df.to_csv(Path(stat_dir, d+"_ECT.csv"), index=False)

In [None]:
# Export one "<dataset>_ECP.csv" per dataset, holding every ECP value
# together with the method that produced it.
ecp_rows = dict()
for s in stat:
    # Split on the LAST underscore only: dataset names may contain "_"
    # (e.g. "HIV5exp_fl"), tool names do not.
    d, t = s.rsplit("_", 1)
    if t == "consensus":
        t = "Consensus"
    # setdefault also registers datasets whose ECP list is empty, so they
    # still get a (header-only) CSV.
    ecp_rows.setdefault(d, []).extend({"ECP": e, "Method": t} for e in stat[s]["ECP"])

# Build each frame in one go; DataFrame.append copied the frame on every
# call and no longer exists in pandas 2.x.
ecp_dfs = {d: pd.DataFrame(rows, columns=["ECP","Method"]) for d, rows in ecp_rows.items()}
for d, ecp_df in ecp_dfs.items():
    ecp_df.to_csv(Path(stat_dir, d+"_ECP.csv"), index=False)

#### Export csv for matching distances (figure 3)

In [None]:
# Export the matching distances (ADC / APE) of every prediction.
match_rows = []
for _, r in results_df.iterrows():
    # Split on the LAST underscore only: dataset names may contain "_"
    # (e.g. "HIV5exp_fl"), tool names do not.
    d, t = r["prediction"].rsplit("_", 1)
    if t == "consensus":
        t = "Consensus"
    match_rows.append({"ADC": r["Et<-p"], "APE": r["Et->p"], "Method": t, "Dataset": d})

# Build the frame in one go; DataFrame.append copied the frame on every call
# and no longer exists in pandas 2.x.
match_dist_df = pd.DataFrame(match_rows, columns=["ADC","APE","Method","Dataset"])
match_dist_df.to_csv(Path(stat_dir,"match_dist.csv"),index=False)

#### Export csv for EMD (figure 4)

In [None]:
# Export the EMD value of every prediction.
emd_rows = []
for _, r in results_df.iterrows():
    # Split on the LAST underscore only: dataset names may contain "_"
    # (e.g. "HIV5exp_fl"), tool names do not.
    d, t = r["prediction"].rsplit("_", 1)
    if t == "consensus":
        t = "Consensus"
    emd_rows.append({"EMD": r["EMD"], "Method": t, "Dataset": d})

# Build the frame in one go; DataFrame.append copied the frame on every call
# and no longer exists in pandas 2.x.
emd_df = pd.DataFrame(emd_rows, columns=["EMD","Method","Dataset"])
emd_df.to_csv(Path(stat_dir,"emd.csv"),index=False)

#### Export csv for precision and recall (table 2)

In [None]:
# Export precision (PPV) and recall (Sensitivity) per "<dataset>_<tool>"
# prediction, sorted by that key.
# The frame is built in one go; DataFrame.append copied the frame on every
# call and no longer exists in pandas 2.x.
precision_recall_df = pd.DataFrame(
    [{"Dataset": s,
      "Precision": stat[s]["PPV"],
      "Recall": stat[s]["Sensitivity"]} for s in sorted(stat)],
    columns=["Dataset", "Precision", "Recall"])
precision_recall_df.to_csv(Path(stat_dir,"precision_recall.csv"),index=False)

In [None]:
precision_recall_df

## Additional validation for NAR review

### Running tools on different fragment lengths for the HIV-1 labmix dataset

In [None]:
# Input alignment of the haplotypes and the gap-free references derived from it.
HIV5_full_length_aln_file = "refs/HIV5_var_length/HIV5_full_length_aln.fas"
HIV5_full_length_ref_file = "refs/HIV5_var_length/HIV5_full_length_ref.fas"
HIV5_2000_nt_ref_file = "refs/HIV5_var_length/HIV5_2000_nt_ref.fas"
HIV5_5000_nt_ref_file = "refs/HIV5_var_length/HIV5_5000_nt_ref.fas"
# SAM files produced by aligning the lab-mix reads to each of those references.
HIV5_full_length_sam = "alignment/HIV5_full_length.sam"
HIV5_2000_nt_sam = "alignment/HIV5_2000_nt.sam"
HIV5_5000_nt_sam = "alignment/HIV5_5000_nt.sam"

# Slices of the full-length gap-free reference that give the 2000 nt and
# 5000 nt fragment references.
HIV5_frag_2000 = slice(1795,3795)
HIV5_frag_5000 = slice(694,5694)

#### Create references of length 2000 nt, 5000 nt and full length (~9000 nt) that contain all insertions from all 5 haplotypes

Creating full length ref with no gaps

In [None]:
%%capture
# Build a gap-free reference from the multiple alignment by taking, for every
# alignment column, the most frequent non-gap nucleotide.
seqs = list(SeqIO.parse(HIV5_full_length_aln_file, 'fasta'))

ref = list()
for j in range(len(seqs[0])):
    # Count A/C/G/T in column j, ignoring gaps. Any other symbol
    # (e.g. "N" or lowercase letters) raises KeyError.
    nucls = {'A': 0, 'C': 0, 'G': 0, 'T':0}
    for i in range(len(seqs)):
        nuc = seqs[i][j]
        if nuc == '-':
            continue
        nucls[nuc]+=1
    # Stable sort by count, highest first: ties keep the A, C, G, T order.
    sa = sorted(nucls.items(), key=lambda item: item[1], reverse=True)
    if sa[0][1] == sa[1][1]:
        # Tie for the top count: prefer the nucleotide of the second record
        # in the alignment (presumably HXB2, judging by the variable name —
        # verify against the alignment file) unless that record has a gap here.
        hxb2_nuc = seqs[1][j]
        if hxb2_nuc != '-':
            ref.append(hxb2_nuc)
            continue
    ref.append(sa[0][0])

ref_fa = [SeqRecord(Seq("".join(ref)),id="HIV1_ref_no_gaps",description="HIV1_ref_no_gaps")]

SeqIO.write(ref_fa, HIV5_full_length_ref_file, 'fasta')

# The fragment references are slices of the same gap-free sequence.
ref_2000_fa = [SeqRecord(Seq("".join(ref[HIV5_frag_2000])),id="HIV1_2000nt_ref_no_gaps",description="HIV1_2000nt_ref_no_gaps")]
SeqIO.write(ref_2000_fa, HIV5_2000_nt_ref_file, 'fasta')

ref_5000_fa = [SeqRecord(Seq("".join(ref[HIV5_frag_5000])),id="HIV1_5000nt_ref_no_gaps",description="HIV1_5000nt_ref_no_gaps")]
SeqIO.write(ref_5000_fa, HIV5_5000_nt_ref_file, 'fasta')

Align reads to the full-length, 2000 nt and 5000 nt references

In [None]:
%%capture
HIV5_refs = [HIV5_full_length_ref_file, HIV5_2000_nt_ref_file, HIV5_5000_nt_ref_file]
HIV5_sams = [HIV5_full_length_sam, HIV5_2000_nt_sam, HIV5_5000_nt_sam]
for i in range(len(HIV5_refs)):
    r = HIV5_refs[i]
    !bwa index $r
    i1 = labmix1[0]
    i2 = labmix2[0]
    s = HIV5_sams[i]
    !bwa mem -B 2 $r $i1 $i2 > $s

Run CliqueSNV

In [None]:
%%capture
# Run CliqueSNV (snv-illumina mode, no -tf option given) on each HIV5
# alignment; all three runs share the output directory "tmp".
for i in [HIV5_2000_nt_sam, HIV5_5000_nt_sam, HIV5_full_length_sam]:
    !java -Xmx100g -jar $CliqueSNV_fn -m snv-illumina -outDir tmp -in $i

Run PredictHaplo

In [None]:
%%capture
# Run PredictHaplo on the three HIV5 alignments, using the config files
# stored under tmp/ and one output directory per dataset.
datasets = ["HIV5_2000_nt", "HIV5_5000_nt", "HIV5_full_length"]
for d in datasets:
    config_fn = str(Path(base_dn,"tmp/{}_PredictHaplo.config".format(d)))
    out_dir = str(Path(base_dn,"tmp/{}_PredictHaplo".format(d)))
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    # The tool is launched from inside out_dir; return to base_dn afterwards.
    os.chdir(out_dir)
    !$PredictHaplo_fn $config_fn
    os.chdir(base_dn)

In [None]:
# Convert the PredictHaplo output of the three HIV5 runs to standard FASTA.
datasets = ["HIV5_2000_nt", "HIV5_5000_nt", "HIV5_full_length"]
# (begin, end) region per dataset. The end positions used to be rotated by
# one entry (2000_nt -> 9809, 5000_nt -> 2000, full_length -> 5000), which
# truncated the 5000 nt output to 2000 nt and the full-length output to 5000 nt.
# NOTE(review): begin is 1 here while the main regions table starts at 0, so
# the first base is dropped — confirm that this is intended.
pos = {"HIV5_2000_nt":(1,2000), "HIV5_5000_nt":(1,5000), "HIV5_full_length":(1,9809)}
for d in datasets:
    # Same directory the PredictHaplo runs write to (base_dn/tmp/<dataset>_PredictHaplo).
    indir = str(Path(base_dn, "tmp", "{}_PredictHaplo".format(d)))
    outf = "tmp/{}_PH.fasta".format(d)
    begin_pos, end_pos = pos[d]
    standard_fasta_PredictHaplo(indir, outf, begin_pos, end_pos)

Consensus

In [None]:
%%capture
# Build a consensus for each HIV5 alignment with CliqueSNV's
# consensus-illumina mode; all three runs write to tmp/HIV5_consensus.
for i in [HIV5_2000_nt_sam, HIV5_5000_nt_sam, HIV5_full_length_sam]:
    !java -Xmx100g -jar $CliqueSNV_fn -m consensus-illumina -outDir tmp/HIV5_consensus -in $i