In [1]:
# Load and initialize nanoACT.
# NanoAct() reports ./temp/ as its temp folder; pass TEMP='your_temp_folder' to change it.
from nanoact import nanoact
dumb = nanoact.NanoAct()

[17:51:10] Temp folder is set to ./temp/, you can change it by NanoAct(TEMP='your_temp_folder')
[17:51:10] We recommend not to set the temp folder to nfs, ftp, samba, google drive, etc. as it may cause unexpected errors


In [2]:
# Merge the fastq.gz files into a single fastq file.
all_fastq = dumb.combine_fastq(
    src="./nanoact/sample/1_raw_reads/",
    des="./nanoact/sample/2_merged_fastq/",
    name="all.fastq",
)

[15:35:02] Found fastq file: FAW73888_d763411f_40013a17_1.fast5.fastq.gz
[15:35:02] Found fastq file: FAW73888_d763411f_40013a17_2.fast5.fastq.gz
[15:35:02] Found fastq file: FAW73888_d763411f_40013a17_0.fast5.fastq.gz


In [5]:
# Filter the fastq file by read quality and read length.
# See also nanofilt()
filtered_fastq = dumb.qualityfilt(
    src="./nanoact/sample/2_merged_fastq/all.fastq",
    des="./nanoact/sample/3_filtered_fastq/",
    name="all.fastq",
    # recommended 7-9
    QSCORE=7,
    # depends on the length of your reads
    MIN_LEN=2500,
    # depends on the length of your reads
    MAX_LEN=3500,
)

[15:35:24] Start Qualityfilt...
[15:35:24] QSCORE: 7, MIN_LEN: 2500, MAX_LEN: 3500
[15:35:30] 2481/12000 (20.67%) reads were passed quality filter


In [6]:
# Demultiplex: sort the reads of a fastq file into per-sample files using the barcode index.
# modified from Krehenwinkel, Henrik, Aaron Pomerantz, James B. Henderson, Susan R. Kennedy, Jun Ying Lim, Varun Swamy, Juan Diego Shoobridge, et al. 2019. “Nanopore Sequencing of Long Ribosomal DNA Amplicons Enables Portable and Simple Biodiversity Assessments with High Phylogenetic Resolution across Broad Taxonomic Scale.” GigaScience 8 (5). https://doi.org/10.1093/gigascience/giz006.
demultiplexed = dumb.singlebar(
    # Input: a single fastq file, e.g. all.fastq
    src="./nanoact/sample/3_filtered_fastq/all.fastq",
    # Output: a folder; fastq and/or fasta files named after each SampleID are written
    # into it (as selected by output_format), e.g. SampleID.fastq
    des="./nanoact/sample/4_demultiplex/",
    # BARCODE_INDEX_FILE: the barcode table, a csv or tsv file, e.g. barcode.csv.
    # It must contain the four columns SampleID, FwIndex, RvAnchor and ExpectedLength.
    BARCODE_INDEX_FILE="./nanoact/sample/barcode.csv",
    # mismatch_ratio_f: tolerated error rate for FwIndex, default 0.15.
    # E.g. for a 20 bp barcode, 0.15*20 = 3 bp of errors (edit distance) are tolerated.
    mismatch_ratio_f=0.1,
    # mismatch_ratio_r: tolerated error rate for RvAnchor, default 0.15.
    mismatch_ratio_r=0.1,
    # expected_length_variation: tolerated variation of the read length, default 0.3.
    # E.g. for an ExpectedLength of 300 bp, 0.3*300 = 90 bp of variation is tolerated.
    expected_length_variation=0.3,
    # search_range: window in which barcodes are searched, default 150 bp,
    # meaning the first 150 bp and the last 150 bp of each read.
    search_range=150,
    # output_format: format of the output files, default "both". May be fastq, fasta,
    # or fas (fasta format with a .fas extension). "both" writes fastq and fasta.
    output_format="both",
)

[15:36:49] BARCODE_INDEX_FILE loaded
[15:36:51] 498/2481 (20.07%) reads were demultiplexed successfully


In [8]:
# Orientation: use FwPrimer and RvPrimer to turn reverse-oriented reads into forward-oriented reads.
orientation = dumb.orientation(
    # Input: a folder containing fastq or fasta files named after each SampleID
    # (as selected by input_format), e.g. SampleID.fastq
    src="./nanoact/sample/4_demultiplex/",
    # Output: a folder; fastq and/or fasta files named after each SampleID are written
    # into it (as selected by output_format), e.g. SampleID.fastq
    des="./nanoact/sample/5_orientation/",
    # input_format: format of the input files, default fastq. May be fastq, fasta,
    # or fas (fasta-format files with a .fas extension).
    input_format="fastq",
    # output_format: format of the output files, default "both". May be fastq or fasta;
    # "both" writes fastq and fasta.
    output_format="both",
    # BARCODE_INDEX_FILE: the barcode table, a csv or tsv file, e.g. barcode.csv.
    BARCODE_INDEX_FILE="./nanoact/sample/barcode.csv",
    # FwPrimer: name of the forward-primer column, default FwPrimer.
    FwPrimer="FwPrimer",
    # RvPrimer: name of the reverse-primer column, default RvPrimer.
    RvPrimer="RvPrimer",
    # search_range: window in which FwPrimer and RvPrimer are searched, default 200 bp,
    # meaning the first 200 bp and the last 200 bp of each read.
    search_range=200,
)

[15:37:12] T24.fas is not in the accepted input format, skipping
[15:37:12] T4.fas is not in the accepted input format, skipping
[15:37:12] Processing T2.fastq
[15:37:12] T2.fas is not in the accepted input format, skipping
[15:37:12] Processing T6.fastq
[15:37:12] T15.fas is not in the accepted input format, skipping
[15:37:12] Processing T4.fastq
[15:37:12] T13.fas is not in the accepted input format, skipping
[15:37:12] T5.fas is not in the accepted input format, skipping
[15:37:12] T27.fas is not in the accepted input format, skipping
[15:37:12] Processing T14.fastq
[15:37:12] T17.fas is not in the accepted input format, skipping
[15:37:12] Processing T20.fastq
[15:37:12] T8.fas is not in the accepted input format, skipping
[15:37:12] Processing T18.fastq
[15:37:12] Processing T3.fastq
[15:37:12] T11.fas is not in the accepted input format, skipping
[15:37:12] T19.fas is not in the accepted input format, skipping
[15:37:12] Processing T25.fastq
[15:37:12] T28.fas is not in the acce

[15:37:13] T1.fas is not in the accepted input format, skipping
[15:37:13] Processing T31.fastq
[15:37:13] T22.fas is not in the accepted input format, skipping
[15:37:13] T30.fas is not in the accepted input format, skipping
[15:37:13] T23.fas is not in the accepted input format, skipping
[15:37:13] Processing T5.fastq
[15:37:13] Processing T28.fastq
[15:37:13] Processing T13.fastq
[15:37:13] Processing T12.fastq
[15:37:13] Processing T1.fastq
[15:37:13] T6.fas is not in the accepted input format, skipping
[15:37:13] T3.fas is not in the accepted input format, skipping
[15:37:13] Processing T9.fastq
[15:37:13] Processing T23.fastq
[15:37:13] T12.fas is not in the accepted input format, skipping
[15:37:13] 2_Singlebar_stat.csv is not in the accepted input format, skipping
[15:37:13] Processing T16.fastq
[15:37:13] Processing T19.fastq
[15:37:13] T9.fas is not in the accepted input format, skipping
[15:37:13] T20.fas is not in the accepted input format, skipping
[15:37:13] Processing T2

In [9]:
# Remove artificial sequences such as barcodes and primers.
# Two modes are available: "table" and "case".
# mode = table: a barcode_index_file is required, together with fw_col and rv_col.
#   The artificial sequences are removed using the SampleID, FwPrimer and RvPrimer
#   columns of the barcode_index_file.
# mode = case: singlebar() marks the FwIndex and RvAnchor it finds in the raw reads in
#   lower case while demultiplexing; trim_reads() removes the artificial sequences
#   based on the positions of those lower-case marks.
# NOTE(review): fw_col is "FwIndex" while rv_col is "RvPrimer". Both are ignored in
#   case mode; confirm the intended column before switching to table mode.
dumb.trim_reads(
    # Input: a folder containing fastq or fasta files named after each SampleID
    # (as selected by input_format), e.g. SampleID.fastq
    src="./nanoact/sample/5_orientation/",
    # Output: a folder; fastq and/or fasta files named after each SampleID are written
    # into it (as selected by output_format), e.g. SampleID.fastq
    des="./nanoact/sample/6_trimmed/",
    # The barcode table, a csv or tsv file, e.g. barcode.csv. Only needed in table mode.
    BARCODE_INDEX_FILE="./nanoact/sample/barcode.csv",
    # Name of the forward column, default FwPrimer. Only needed in table mode.
    fw_col="FwIndex",
    # Name of the reverse column, default RvPrimer. Only needed in table mode.
    rv_col="RvPrimer",
    # input_format: format of the input files, default fastq. May be fastq or fasta.
    input_format="fastq",
    # output_format: format of the output files, default "both". May be fastq or fasta;
    # "both" writes fastq and fasta.
    output_format="both",
    # mode: how the barcodes are supplied, default table. May be table or case.
    # table uses the barcode_index_file; case uses the lower-case marks left by singlebar.
    mode="case",
    # Number of bp to trim further towards the read end, counted from the detected cut
    # site; default 0, may be negative. E.g. fw_offset=-10 moves the cut 10 bp towards
    # the read start instead.
    fw_offset=0,
    # Number of bp to trim further towards the read start, counted from the detected cut
    # site; default 0, may be negative. E.g. rv_offset=-10 moves the cut 10 bp towards
    # the read end instead.
    rv_offset=0,
    # Tolerated error rate for FwIndex, default 0.15. E.g. for a 20 bp barcode,
    # 0.15*20 = 3 bp of errors (edit distance) are tolerated. Only needed in table mode.
    mismatch_ratio_f=0.1,
    # Tolerated error rate for RvAnchor, default 0.15. Only needed in table mode.
    mismatch_ratio_r=0.1,
    # Whether to discard a read when no barcode is found, default False.
    # Only needed in table mode.
    discard_no_match=False,
    # Whether to check both the forward and the reverse sequence; if the barcode is found
    # in the reverse sequence, the reverse sequence is output. Default True.
    # Only needed in table mode.
    check_both_directions=True,
    # Whether to reverse-complement the RvAnchor column before searching, default True.
    # Only needed in table mode.
    reverse_complement_rv_col=True,
    # Window in which FwIndex and RvAnchor are searched; the default is 200 bp, meaning
    # the first 200 bp and the last 200 bp of each read. Only needed in table mode.
    search_range=100,
)

Notice: mode is set to 'case', arguments other than src, des, fw_offset, rv_offset,input_format, output_format will be ignored
[15:37:20] T28.fasta is not in the accepted inpute format, skipping
[15:37:20] T24.fas is not in the accepted inpute format, skipping
[15:37:20] T4.fas is not in the accepted inpute format, skipping
[15:37:20] Tirmming T2.fastq
[15:37:20] T2.fas is not in the accepted inpute format, skipping
[15:37:20] Tirmming T6.fastq
[15:37:20] T15.fas is not in the accepted inpute format, skipping
[15:37:20] Tirmming T4.fastq
[15:37:20] T4.fasta is not in the accepted inpute format, skipping
[15:37:20] T13.fas is not in the accepted inpute format, skipping
[15:37:20] T5.fas is not in the accepted inpute format, skipping
[15:37:20] T22.fasta is not in the accepted inpute format, skipping
[15:37:20] T27.fas is not in the accepted inpute format, skipping
[15:37:20] Tirmming T14.fastq
[15:37:20] T15.fasta is not in the accepted inpute format, skipping
[15:37:20] T17.fas is not 

In [10]:
# Clustering: group the reads into clusters.
# The raw reads under a single barcode frequently contain heterogeneous sequences,
# e.g. 16S rRNA genes amplified from soil may come from many bacterial species
# (meta-barcoding), or the reads may include contamination, non-specific amplification
# products, chimeras and so on.
# Feeding heterogeneous raw reads straight into the downstream analysis gives noisy results.
# mmseqs is shown here; the tool also implements other clustering methods, see the
# nanoACT tutorial.
dumb.mmseqs_cluster(
    src="./nanoact/sample/6_trimmed/",
    des="./nanoact/sample/7_cluster/",
    # min_seq_id sets the minimum sequence identity threshold for prefiltering
    # (pre-clustering). Sequences with pairwise similarity below this value will not be
    # clustered together. Set higher values for more stringent prefiltering. (Save time)
    min_seq_id=0.8,
    # See mmseqs2 document: https://mmseqs.com/latest/userguide.pdf
    cluster_mode=0,
    # See mmseqs2 document: https://mmseqs.com/latest/userguide.pdf
    cov_mode=0,
    # k specifies the length of the k-mers used to compute sequence similarity.
    # A k-mer is a substring of length k found in a sequence.
    #   Increasing the value of k increases the specificity of sequence similarity, but
    #   also increases the computational (memory) cost.
    #   If you receive a memory error, try reducing the value of k.
    k=14,
    # Only used when cluster_mode="linclust", see mmseqs2 document:
    # https://mmseqs.com/latest/userguide.pdf
    # NOTE(review): cluster_mode is passed as the integer 0 above; confirm how
    # "linclust" is selected.
    kmer_per_seq=20,
    # s sets the sensitivity of the clustering. values between 1 and 8.5.
    #   Increasing the value of s increases the sensitivity of the clustering (more
    #   clusters), but also increases the false positive rate (more spurious clusters).
    #   If you receive too many clusters, try reducing the value of s.
    s=7.5,
    # min_read_num: This sets the minimum number of reads in a cluster.
    #    Clusters with fewer reads than this value will be discarded.
    min_read_num=0,
    # input_format: format of the input files, default fastq. May be fastq or fasta.
    input_format="fastq",
    # output_format: format of the output files, default "both". May be fastq or fasta;
    # "both" writes fastq and fasta.
    output_format="both",
)

[15:37:27] T24.fas is not in the accepted input format, skipping
[15:37:27] T4.fas is not in the accepted input format, skipping
[15:37:27] Clustering T2.fastq
[15:37:32] Number of clusters 5
[15:37:32] T2.fas is not in the accepted input format, skipping
[15:37:32] Clustering T6.fastq
[15:37:36] Number of clusters 1
[15:37:36] T15.fas is not in the accepted input format, skipping
[15:37:36] Clustering T4.fastq
[15:37:41] Number of clusters 3
[15:37:41] T13.fas is not in the accepted input format, skipping
[15:37:41] T5.fas is not in the accepted input format, skipping
[15:37:41] T27.fas is not in the accepted input format, skipping
[15:37:41] Clustering T14.fastq
[15:37:45] Number of clusters 5
[15:37:45] T17.fas is not in the accepted input format, skipping
[15:37:45] Clustering T20.fastq
[15:37:48] Number of clusters 2
[15:37:48] T8.fas is not in the accepted input format, skipping
[15:37:48] Clustering T18.fastq
[15:37:52] Number of clusters 4
[15:37:52] Clustering T3.fastq
[15:37:

'./nanoact/sample/7_cluster/'

In [3]:
# Run a multiple sequence alignment with mafft and derive the consensus sequence.
# Once this step has finished, it is recommended to inspect each alignment file (aln_)
# for heterogeneity (uneven lengths, poor alignment).
dumb.mafft_consensus(
    src="./nanoact/sample/7_cluster/",
    des="./nanoact/sample/8_consensus/",
    # minimal_reads: minimum number of reads, default 0. E.g. with minimal_reads=10 only
    # consensus sequences built from more than 10 reads are output.
    minimal_reads=0,
    input_format="fastq",
)


[16:00:56] Working on T10_cluster_0_r7 ...
[16:00:57] T11_cluster_4_r1.fas is not in the accepted input format, skipping
[16:00:57] Working on T3_cluster_1_r1 ...
[16:00:58] T13_cluster_1_r1.fas is not in the accepted input format, skipping
[16:00:58] Working on T2_cluster_4_r7 ...
[16:01:00] Working on T20_cluster_1_r7 ...
[16:01:02] Working on T9_cluster_0_r1 ...
[16:01:02] T31_cluster_1_r2.fas is not in the accepted input format, skipping
[16:01:02] T25_cluster_0_r5.fas is not in the accepted input format, skipping
[16:01:02] Working on T3_cluster_2_r1 ...
[16:01:03] T8_cluster_0_r2.fas is not in the accepted input format, skipping
[16:01:03] T11_cluster_2_r1.fas is not in the accepted input format, skipping
[16:01:03] Working on T4_cluster_0_r1 ...
[16:01:03] Working on T24_cluster_0_r1 ...
[16:01:04] Working on T31_cluster_0_r1 ...
[16:01:04] T25_cluster_1_r2.fas is not in the accepted input format, skipping
[16:01:04] T1_cluster_0_r8.fas is not in the accepted input format, skipp

'/home/raingel/桌面/nanopore_working/nanoACT/nanoact/sample/8_consensus'

In [2]:
# BLAST the consensus sequences and produce a csv file holding the BLAST result of each sequence.
# Input: A folder containing fasta files named in the specified format
# Format: con_{sampleID}_cluster_{number}_r{reads_number}
# e.g.: con_2523_cluster_1_r499
# Output: A csv file named {name} is saved in the {des} folder
# `funguild`: This parameter is a boolean value that indicates whether to perform a Funguild search or not.
#             Funguild is a web-based annotation tool that allows users to predict the ecological functions
#             of fungal communities based on their taxonomic composition.
# `startswith`: This parameter is a string indicating the prefix that the input fasta file names should
#               start with. It is used to filter out files that do not match the given prefix.
# `query_range`: The region of each sequence that is sent to BLAST. Staying below 500 bp is generally
#                recommended. Besides overloading the BLAST server, BLAST ranks hits by both coverage
#                and similarity, so an overly long query pushes high-coverage / low-similarity hits to
#                the top and crowds out medium-coverage / high-similarity hits.
#                The value is a tuple. Examples, assuming the sequence is AAATTTCCC:
#                query_range=(None,None) means no cropping at all
#                query_range=(0,None) also means no cropping at all
#                query_range=(0,-1) means from position 0 (the 1st bp; counting starts at 0) up to
#                                   (not including) the last one; the actual query is AAATTTCC
#                query_range=(2,5) means from position 2 up to (not including) position 5;
#                                  the actual query is ATT
#                query_range=(3,-3) means from position 3 up to (not including) the 3rd from the end;
#                                   the actual query is TTT
# `batch`: This parameter is an integer indicating the number of sequences to blast at a time.
#          The input sequences are divided into batches of size `batch`, and each batch is blasted separately.
#          This is done to avoid overloading the NCBI BLAST server with too many requests at once.
dumb.blast_2(
    # Input: a folder containing fasta files named after each SampleID, e.g. SampleID.fasta
    src="./nanoact/sample/8_consensus/",
    # Output: a folder; the csv file is written into it
    des="./nanoact/sample/8_consensus/",
    # name: file name of the output file, default blast.csv.
    name="blast.csv",
    # funguild: whether to run the funguild analysis, default True.
    funguild=True,
    # startswith: prefix carried by the input fasta file names, default con_.
    startswith="con_",
    # input_format: format of the input fasta files, default fasta; "fas" selects
    # fasta-format files with a .fas extension.
    input_format="fas",
    # query_range: region of each sequence sent to BLAST (explained in the header of this cell).
    query_range=(200, 400),
    # batch: number of sequences blasted per request (explained in the header of this cell).
    batch=30,
)

[16:12:00] Blasting 0 to 30 of 69
[16:12:02] Query D19UHBH9016 submitted.
[16:12:02] You can check the status at https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=D19UHBH9016
[16:12:02] And results here: https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&RID=D19UHBH9016
[16:12:34] Search complete, retrieving results...
[16:12:34] Retrieving results fromhttps://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&RID=D19UHBH9016&FORMAT_TYPE=XML[16:12:43] 'TaxaSet'
[16:15:05] string indices must be integers
[16:15:34] Blasting 30 to 60 of 69
[16:15:37] Query D1A18A55013 submitted.
[16:15:37] You can check the status at https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=D1A18A55013
[16:15:37] And results here: https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&RID=D1A18A55013
[16:16:08] Search complete, retrieving results...
[16:16:08] Retrieving results fromhttps://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&RID=D1A1

'./nanoact/sample/8_consensus//blast.csv'

In [2]:
# The trimmed raw reads can also be searched directly against databases such as RefSeq.
# Because that is computationally expensive, the BLAST algorithm cannot be used for it directly;
# the mmseqs2 algorithm is used for the search here instead.
# Among the output files, taxonomyResultReport can be visualized with pavian:
# https://fbreitwieser.shinyapps.io/pavian/
# taxonomyResultReport.html is the visualization produced with krona.
# taxonomyResult.tsv holds the search result and can be visualized with taxonomy_assign_visualizer().
# NOTE(review): this notebook later calls custom_taxonomy_sankey() on taxonomyResult.tsv;
# confirm which function name is current.
dumb.taxonomy_assign(
    # Input: a folder containing fastq or fasta files named after each SampleID
    # (as selected by input_format), e.g. SampleID.fastq
    src="./nanoact/sample/6_trimmed/",
    des="./nanoact/sample/6_1_taxonomy_assign/",
    # input_format: format of the input files, default fastq. May be fastq or fasta.
    input_format="fastq",
    # lca_mode: the explanation is fairly involved, see the mmseqs2 document:
    # https://mmseqs.com/latest/userguide.pdf
    lca_mode=3,
    # custom_acc is a user-supplied list of accession numbers that mmseqs2 adds to the search.
    custom_acc=[
        "LC729284",
        "LC729293",
        "LC729281",
        "LC729294",
        "LC729290",
        "LC729267",
        "LC729273",
    ],
    # custom_gbff holds user-supplied gbff files given as URLs, which mmseqs2 adds to the search.
    custom_gbff=[],
    # ref_db selects NCBI refdb databases; the currently supported ones are:
    #   archaea.16SrRNA
    #   archaea.23SrRNA
    #   archaea.5SrRNA
    #   bacteria.16SrRNA
    #   bacteria.23SrRNA
    #   bacteria.5SrRNA
    #   fungi.18SrRNA
    #   fungi.28SrRNA
    #   fungi.ITS
    ref_db=["fungi.ITS", "bacteria.16SrRNA"],
)

[17:51:16] Downloading custom database from NCBI...
[17:51:18] Getting taxinfo for each record...
[17:51:18] 6/6 taxid processed...
[17:51:18] Merging custom database and ref_db...
[17:51:18] Downloading ref_db: fungi.ITS[17:51:20] Downloading ref_db: bacteria.16SrRNA[17:51:25] Building ref_db from ref_db.fas...
[17:51:54] Processing file: T2.fastq
[17:51:54] Processing file: T6.fastq
[17:51:55] Processing file: T4.fastq
[17:51:56] Processing file: T14.fastq
[17:51:56] Processing file: T20.fastq
[17:51:57] Processing file: T18.fastq
[17:51:57] Processing file: T3.fastq
[17:51:58] Processing file: T25.fastq
[17:51:58] Processing file: T7.fastq
[17:51:58] Processing file: T24.fastq
[17:51:59] Processing file: T27.fastq
[17:51:59] Processing file: T29.fastq
[17:51:59] Processing file: T11.fastq
[17:52:00] Processing file: T21.fastq
[17:52:00] Processing file: T30.fastq
[17:52:00] Processing file: T31.fastq
[17:52:01] Processing file: T5.fastq
[17:52:01] Processing file: T28.fastq
[17:52:0

In [None]:
# custom_taxonomy_sankey draws a Sankey diagram from the taxonomyResult.tsv written by mmseqs2.
dumb.custom_taxonomy_sankey(
    # Input: a folder containing taxonomyResult.tsv files named after each SampleID,
    # e.g. SampleID.taxonomyResult.tsv
    src="./nanoact/sample/6_1_taxonomy_assign/",
    # Output: a folder; files named after each SampleID are written into it
    des="./nanoact/sample/6_1_taxonomy_assign/",
    # img_ext: format of the output files, default png. Any format supported by
    # matplotlib may be used, e.g. png, svg, jpg.
    img_ext="png",
    # A taxon is not shown when its number of reads is below minimal_reads.
    minimal_reads=2,
    # Vertical height of the figure; adjust as needed. If labels overlap in the output
    # image, increase vertical_scale.
    vertical_scale=0.8,
)