-
Notifications
You must be signed in to change notification settings - Fork 0
/
super_blast.py
124 lines (102 loc) · 4.48 KB
/
super_blast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import glob as gl
import pandas as pd
import re as re
import itertools as it
import xlsxwriter as xl
import textwrap as txtw
import numpy as np
import argparse
from blast import blaster
from blast import apply_leven as lev
import glob
from collections import defaultdict
###ОТРЕЗКИ
def intersec(a, b):
#Находит у двух отрезков факт пересечения или 'почти' пересечения (5 нуклеотидов разницы)
a1, a2, b1, b2 = int(a.split('-')[0]), int(a.split('-')[1]), int(b.split('-')[0]), int(b.split('-')[1])
one = (min(a1, a2), max(a1, a2))
two = (min(b1, b2), max(b1, b2))
left = one if min(one) <= min(two) else two
right = two if max(two) >= max(one) else one
intersec_fact = min(right) - max(left) < 5 or max(right) - max(left) == 0
return intersec_fact, left, right
def grouping(L):
#проходит через все отрезки списка, попарно смотрит пересечения и их объединяет
L = list(set(L))
FullCycle=len(L)<2
while not FullCycle:
pairs = it.combinations(L, 2)
changes = False
for a, b in pairs:
intersec_fact, left, right = intersec(a, b)
if intersec_fact:
changes = True
c = str(min(left))+'-'+str(max(right))
#print(L, left, right, a, b, (min(left), max(right)))
if c == a and b in L:
L.remove(b)
if c == b and a in L:
L.remove(a)
else:
L.remove(a) if a in L else False
L.remove(b) if b in L else False
L.append(c) if c not in L else False
if len(L)==1:
FullCycle = True
break
if not changes:
FullCycle = True
return('\n'.join(L))
##ПАРСИНГИ
def get_cyphers():
with open('cypher.csv', 'r') as file:
#Шифровки для образцов
cypherdict = defaultdict(None)
content = {x.split(',')[0]:x.split(',')[1:] for x in file.read().split('\n')}
cypherdict.update(content)
return cypherdict
def get_tareans(path, index):
repeats_paths = gl.glob(f'{path}/*TAREAN*')
names, seqs = [], []
for repeat_path in repeats_paths:
with open(repeat_path, 'r') as file:
splited_fastas = [*filter(None, file.read().split('>'))]
for fasta in splited_fastas:
lines = [*filter(None, fasta.split('\n'))]
names.append(index+'_'+lines[0])
seqs.append('\n'.join(txtw.wrap("".join(lines[1:]), width=80)))
cls = list(map(lambda x: re.findall('CL\d+',x)[0], names))
cls_zeroes = list(map(lambda x: "CL{:04d}".format(int(re.findall('\d+',x)[0])), cls))
pics = []
for x in cls_zeroes:
glob_search = gl.glob(f'{path}/seqclust/clustering/clusters/dir_{x}/graph_layout.png')
if glob_search:
pics.append(glob_search[0])
else:
pics.append('./No_image.png')
tareandf = pd.DataFrame({'cluster':names, 'seq':seqs, 'pic_path':pics, 'Cluster':cls})
return tareandf
def get_copies(path):
cltab = gl.glob(f'{path}/CLUSTER_TABLE.csv')[0]
copydf = pd.read_csv(cltab, skiprows=5, delimiter='\t')
with open(cltab, 'r') as file:
lines=file.readlines()
total_size=int(lines[4].split('\t')[1].strip())
copydf['Cluster']=copydf['Cluster'].apply(lambda x: 'CL'+str(x))
copydf['size, %']=copydf['Size'].apply(lambda x: 100*x/total_size)
return copydf
def get_comparative_data(index, path):
df = pd.read_csv(f'{path}/COMPARATIVE_ANALYSIS_COUNTS.csv', skiprows=2, delimiter='\t', nrows=500)
local_indexi = df.columns.drop(['cluster','supercluster']).to_list()
for local_index in local_indexi:
df[f'{local_index}, %'] = (100*df[local_index]/(df[local_indexi].sum(axis=1))).map('{:,.2f}'.format)
df['merger'] = df['cluster'].apply(lambda x: index + '_CL' + str(x))
return df.drop(['cluster', 'supercluster'], axis=1)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('index')
args = parser.parse_args()
indexi = [args.index]
dbs = ['comp'] if len(re.findall('[A-Z]+[0-9]+', args.index))>1 else ['local']
my_df = blaster(dbs=dbs, subjects=indexi, tasks=['megablast', 'dc-megablast', 'blastn'], verbose=False, num_threads=300)
my_df.to_csv(f"{indexi[0]}_blast_result.csv")