Skip to content

Commit

Permalink
Fix contig ID issue with MEGAHIT version
Browse files Browse the repository at this point in the history
  • Loading branch information
Vini2 committed Jul 20, 2020
1 parent 8e03bba commit dc6288c
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 8 deletions.
22 changes: 21 additions & 1 deletion graphbin
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ def main():
"--paths",
default=None,
required=False,
help="path to the contigs.paths file, only SPAdes need",
help="path to the contigs.paths file, only needed for SPAdes",
)
parser.add_argument(
"--contigs",
default=None,
required=False,
help="path to the final.contigs.fa file, only needed for MEGAHIT",
)

args = parser.parse_args()
Expand Down Expand Up @@ -100,6 +106,20 @@ def main():

print("\nExiting GraphBin...\nBye...!\n")
sys.exit(1)

# Check if the final.contigs.fa file is provided when the assembler type is MEGAHIT
if args.assembler.lower() == "megahit" and args.contigs is None:
print("\nPlease make sure to provide the path to the final.contigs.fa file.")

print("\nExiting GraphBin...\nBye...!\n")
sys.exit(1)

# Check final.contigs.fa file for MEGAHIT
if args.assembler.lower() == "megahit" and not os.path.isfile(args.contigs):
print("\nFailed to open the final.contigs.fa file.")

print("\nExiting GraphBin...\nBye...!\n")
sys.exit(1)

# Check the file with the initial binning output
if not os.path.isfile(args.binned):
Expand Down
37 changes: 31 additions & 6 deletions graphbin_utils/graphbin_MEGAHIT.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import re
import logging

from Bio import SeqIO
from igraph import *
from graphbin_utils.labelpropagation.labelprop import LabelProp
from graphbin_utils.bidirectionalmap.bidirectionalmap import BidirectionalMap
Expand Down Expand Up @@ -50,6 +51,7 @@ def run(args):


assembly_graph_file = args.graph
contigs_file = args.contigs
contig_bins_file = args.binned
output_path = args.output
prefix = args.prefix
Expand Down Expand Up @@ -104,11 +106,22 @@ def run(args):
logger.info("Constructing the assembly graph")


## Construct the assembly graph
# Get original contig IDs
#-------------------------------

original_contigs = {}

for index, record in enumerate(SeqIO.parse(contigs_file, "fasta")):
original_contigs[record.id] = str(record.seq)


# Construct the assembly graph
#-------------------------------

node_count = 0

graph_contigs = {}

links = []

my_map = BidirectionalMap()
Expand Down Expand Up @@ -152,6 +165,8 @@ def run(args):

my_map[node_count] = int(contig_num)

graph_contigs[contig_num] = strings[2]

node_count += 1

line = file.readline()
Expand Down Expand Up @@ -194,6 +209,18 @@ def run(args):
logger.info("Total number of edges in the assembly graph: "+str(len(edge_list)))


# Map original contig IDs to contig IDs of assembly graph
#--------------------------------------------------------

graph_to_contig_map = BidirectionalMap()

for (n,m), (n2,m2) in zip(graph_contigs.items(), original_contigs.items()):
if m==m2:
graph_to_contig_map[n] = n2

graph_to_contig_map_rev = graph_to_contig_map.inverse


# Get initial binning result
#----------------------------

Expand All @@ -205,9 +232,7 @@ def run(args):
with open(contig_bins_file) as contig_bins:
readCSV = csv.reader(contig_bins, delimiter=',')
for row in readCSV:
start = 'NODE_'
end = ''
contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))]
contig_num = contigs_map_rev[int(graph_to_contig_map_rev[row[0]])]

bin_num = int(row[1])-1
bins[bin_num].append(contig_num)
Expand Down Expand Up @@ -471,7 +496,7 @@ def run(args):
for k in range(n_bins):
if i in bins[k]:
line = []
line.append("NODE_"+str(contigs_map[i]))
line.append(graph_to_contig_map[contigs_map[i]])
line.append(k+1)
output_bins.append(line)

Expand All @@ -491,7 +516,7 @@ def run(args):
for i in range(node_count):
if i in remove_labels or i not in non_isolated:
line = []
line.append("NODE_"+str(contigs_map[i]))
line.append(graph_to_contig_map[contigs_map[i]])
unbinned_contigs.append(line)

if len(unbinned_contigs)!=0:
Expand Down
15 changes: 14 additions & 1 deletion support/prepResult.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,27 @@

try:

if assembler.lower() == "spades" or assembler.lower() == "megahit":
if assembler.lower() == "spades":

start_n = 'NODE_'
end_n = '_length'

contig_num = int(re.search('%s(.*)%s' % (start_n, end_n), contig_name).group(1))
line.append('NODE_'+str(contig_num))

elif assembler.lower() == "megahit":

start_k = 'k'
end_k = '_'

k_num = int(re.search('%s(.*)%s' % (start_k, end_k), contig_name).group(1))

start_n = '_'
end_n = ''

contig_num = int(re.search('%s(.*)%s' % (start_n, end_n), contig_name).group(1))
line.append('k'+str(k_num)+'_'+str(contig_num))

elif assembler.lower() == "sga":

start_n = 'contig-'
Expand Down

0 comments on commit dc6288c

Please sign in to comment.