/
contig_pairs.py
64 lines (45 loc) · 1.72 KB
/
contig_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#! /usr/bin/env python3
import argparse
from collections import defaultdict
import subprocess
import sys
parser = argparse.ArgumentParser()
parser.add_argument('--blastout', required=True, help='blast output from busco')
parser.add_argument('--contigs', required=True, help='folder with singulated contigs')
args = parser.parse_args()
def find_redundancy(blastout):
# takes the blastfile output from a busco run and returns a dict of
# possibly redundant contigs
hits = defaultdict(list)
with open(blastout, 'r') as blastfile:
for line in blastfile:
if line.startswith('#'):
continue
else:
l = line.split('\t')
if l[1] not in hits[l[0]]:
hits[l[0]].append(l[1])
return(hits)
def mummer_contigs(contigs, contig1, contig2):
# takes two contigs and runs mummer
name = contig1 + '_' + contig2
c1 = f"{contigs}/{contig1}.fa"
c2 = f"{contigs}/{contig2}.fa"
task = ["../../builds/MUMmer3.23/nucmer", "--maxmatch", "-p", name, c1, c2]
running = subprocess.Popen(' '.join(task), stdout=subprocess.PIPE, stderr=subprocess.PIPE,
encoding='utf-8', shell=True)
stdout, stderr = running.communicate()
print(stdout)
print(stderr)
task = ["../../builds/MUMmer3.23/mummerplot", "-layout", "-large", "-filter", "--png", "-p", f"{name}.plot", f"{name}.delta"]
running = subprocess.Popen(' '.join(task), stdout=subprocess.PIPE, stderr=subprocess.PIPE,
encoding='utf-8', shell=True)
stdout, stderr = running.communicate()
print(stdout)
print(stderr)
reduntigs = find_redundancy(blastout=args.blastout)
# check number of items in reduntigs elements
for gene, clist in reduntigs.items():
print(len(clist))
if len(clist) == 2:
mummer_contigs(contigs=args.contigs, contig1=clist[0], contig2=clist[1])