In [21]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import edlib

### DBSCAN

#### DBSCAN 15

In [47]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan15_labels_seq1000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq1000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4  5]


In [48]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 15 42 ... 48 42 43]
 [15  0 50 ... 56 51 51]
 [42 50  0 ... 19  2  1]
 ...
 [48 56 19 ...  0 22 21]
 [42 51  2 ... 22  0  3]
 [43 51  1 ... 21  3  0]]


In [49]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

79860


In [50]:
# Palette used to colour graph nodes by cluster label.  Index 0 ('black') is
# used for negative labels; every entry is distinct so that two clusters never
# share a colour while the number of clusters fits the palette.
c_names = ['black', 'green', 'yellow', 'red', 'firebrick', 'cyan', 'orange', 'chocolate',
           'purple', 'deepskyblue', 'turquoise', 'slategray', 'cornflowerblue', 'blueviolet',
           'violet', 'olive', 'magenta', 'hotpink', 'royalblue', 'azure', 'khaki', 'peru',
           'beige', 'lightgreen', 'thistle', 'oldlace']


def cluster_to_color(color):
    """Map an integer cluster label to a colour name from ``c_names``.

    Parameters
    ----------
    color : int
        Cluster label.  Negative labels (the DBSCAN label files contain -1,
        presumably noise) map to ``c_names[0]``.

    Returns
    -------
    str
        ``c_names[color + 1]`` for labels that fit the palette; larger labels
        wrap around the non-black entries instead of raising ``IndexError``.
    """
    if color < 0:
        return c_names[0]
    # Cycle over the palette without ever reusing the colour reserved at index 0.
    return c_names[1 + color % (len(c_names) - 1)]

In [51]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq1000_1_dbscan15.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [52]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan15_labels_seq2000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4]


In [53]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 45 15 ... 79  8 79]
 [45  0 52 ... 81 49 82]
 [15 52  0 ... 85 17 83]
 ...
 [79 81 85 ...  0 84 44]
 [ 8 49 17 ... 84  0 81]
 [79 82 83 ... 44 81  0]]


In [54]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

288689


In [55]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_1_dbscan15.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [56]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan15_labels_seq2000_2.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_2.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4  5  6]


In [57]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 80 25 ... 53 16 80]
 [80  0 83 ... 81 85 10]
 [25 83  0 ... 56 18 82]
 ...
 [53 81 56 ...  0 48 82]
 [16 85 18 ... 48  0 84]
 [80 10 82 ... 82 84  0]]


In [58]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

263097


In [59]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_2_dbscan15.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [61]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan15_labels_seq2000_3.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_3.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5]


In [62]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0  4 43 ... 44 20 81]
 [ 4  0 45 ... 46 22 82]
 [43 45  0 ... 12 53 82]
 ...
 [44 46 12 ...  0 55 80]
 [20 22 53 ... 55  0 82]
 [81 82 82 ... 80 82  0]]


In [63]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

295407


In [64]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_3_dbscan15.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

#### DBSCAN 5

In [66]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan5_labels_seq1000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq1000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]


In [67]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 15 42 ... 48 42 43]
 [15  0 50 ... 56 51 51]
 [42 50  0 ... 19  2  1]
 ...
 [48 56 19 ...  0 22 21]
 [42 51  2 ... 22  0  3]
 [43 51  1 ... 21  3  0]]


In [68]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

79860


In [69]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq1000_1_dbscan5.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [71]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan5_labels_seq2000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


In [72]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 45 15 ... 79  8 79]
 [45  0 52 ... 81 49 82]
 [15 52  0 ... 85 17 83]
 ...
 [79 81 85 ...  0 84 44]
 [ 8 49 17 ... 84  0 81]
 [79 82 83 ... 44 81  0]]


In [73]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

288689


In [74]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_1_dbscan5.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [76]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan5_labels_seq2000_2.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_2.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [77]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 80 25 ... 53 16 80]
 [80  0 83 ... 81 85 10]
 [25 83  0 ... 56 18 82]
 ...
 [53 81 56 ...  0 48 82]
 [16 85 18 ... 48  0 84]
 [80 10 82 ... 82 84  0]]


In [78]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

263097


In [79]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_2_dbscan5.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [81]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('dbscan5_labels_seq2000_3.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_3.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13]


In [82]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0  4 43 ... 44 20 81]
 [ 4  0 45 ... 46 22 82]
 [43 45  0 ... 12 53 82]
 ...
 [44 46 12 ...  0 55 80]
 [20 22 53 ... 55  0 82]
 [81 82 82 ... 80 82  0]]


In [83]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

295407


In [84]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_3_dbscan5.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

### Affinity propagation

In [96]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('affinity_labels_seq1000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq1000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5]


In [97]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 15 42 ... 48 42 43]
 [15  0 50 ... 56 51 51]
 [42 50  0 ... 19  2  1]
 ...
 [48 56 19 ...  0 22 21]
 [42 51  2 ... 22  0  3]
 [43 51  1 ... 21  3  0]]


In [98]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

79860


In [99]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq1000_1_affinity.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [100]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('affinity_labels_seq2000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5 6 7]


In [101]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 45 15 ... 79  8 79]
 [45  0 52 ... 81 49 82]
 [15 52  0 ... 85 17 83]
 ...
 [79 81 85 ...  0 84 44]
 [ 8 49 17 ... 84  0 81]
 [79 82 83 ... 44 81  0]]


In [102]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

288689


In [103]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_1_affinity.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [104]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('affinity_labels_seq2000_2.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_2.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5 6 7 8 9]


In [105]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 80 25 ... 53 16 80]
 [80  0 83 ... 81 85 10]
 [25 83  0 ... 56 18 82]
 ...
 [53 81 56 ...  0 48 82]
 [16 85 18 ... 48  0 84]
 [80 10 82 ... 82 84  0]]


In [106]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

263097


In [107]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_2_affinity.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [109]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('affinity_labels_seq2000_3.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_3.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5 6 7]


In [110]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0  4 43 ... 44 20 81]
 [ 4  0 45 ... 46 22 82]
 [43 45  0 ... 12 53 82]
 ...
 [44 46 12 ...  0 55 80]
 [20 22 53 ... 55  0 82]
 [81 82 82 ... 80 82  0]]


In [111]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

295407


In [112]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_3_affinity.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

### Уральский

In [113]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('ur_centr_seq1000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq1000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5]


In [114]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 15 42 ... 48 42 43]
 [15  0 50 ... 56 51 51]
 [42 50  0 ... 19  2  1]
 ...
 [48 56 19 ...  0 22 21]
 [42 51  2 ... 22  0  3]
 [43 51  1 ... 21  3  0]]


In [115]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

79860


In [116]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq1000_1_ur.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [117]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('ur_centr_seq2000_1.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_1.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5]


In [118]:
# Pairwise edit distances between all loaded sequences (edlib, mode 'HW').
# Only pairs with i < j are aligned; adding the transpose mirrors them into the
# lower triangle, and the diagonal stays 0 (a sequence matches itself exactly).
# task='distance' suffices because only 'editDistance' is read; the alignment
# path that task='path' also computes was never used.
# NOTE(review): 'HW' (infix) alignment is presumably not symmetric in
# query/target, yet a single orientation is mirrored -- confirm this is intended.
n_seqs = len(sequences)
dist_matrix = np.zeros((n_seqs, n_seqs), dtype='int')
for i in range(n_seqs):
    for j in range(i + 1, n_seqs):
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 45 15 ... 79  8 79]
 [45  0 52 ... 81 49 82]
 [15 52  0 ... 85 17 83]
 ...
 [79 81 85 ...  0 84 44]
 [ 8 49 17 ... 84  0 81]
 [79 82 83 ... 44 81  0]]


In [119]:
# Keep only "close" pairs as graph edges: entry (i, j) with i <= j holds the
# distance when it is strictly between 0 and 10; everything else stays 0.
# The lower triangle is left empty so each undirected edge is stored once.
upper = np.triu(dist_matrix)
is_edge = (upper > 0) & (upper < 10)
edges = np.zeros((len(sequences), len(sequences)), dtype='int')
edges[is_edge] = dist_matrix[is_edge]
count = int(is_edge.sum())
print(count)

288689


In [120]:
# Export the thresholded similarity graph as GraphML: one node per label in
# `colors`, coloured via cluster_to_color, and one weighted undirected edge per
# positive entry above the diagonal of `edges`.
# The context manager closes the file even if a write fails, and every element
# (including </edge>) ends with its own newline.
with open('graph_seq2000_1_ur.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                f.write('</edge>\n')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [121]:
# Read one cluster label per sequence (last whitespace-separated field of each
# non-blank line) and the sequences themselves from the FASTA file.
# The export cells index labels and sequences by the same node number, so a
# length mismatch would produce a mislabeled graph; fail early instead.
with open('ur_centr_seq2000_2.txt', 'r') as f_in:
    colors = [int(line.split()[-1]) for line in f_in if line.strip()]
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_2.fa', 'fasta')]
if len(colors) != len(sequences):
    raise ValueError(f'{len(colors)} labels but {len(sequences)} sequences')
print(np.unique(colors))

[0 1 2 3 4 5]


In [122]:
# Pairwise edit distances between all sequences.
# Only entries with j >= i are computed; adding the transpose mirrors them into the
# lower triangle. The diagonal is doubled by that sum, which is harmless because a
# sequence aligned against itself has distance 0.
# NOTE(review): edlib's 'HW' (infix) mode treats query and target differently, so
# align(a, b) and align(b, a) can differ; only the i <= j ordering is computed and
# then mirrored — confirm that this is intended.
n_seq = len(sequences)
dist_matrix = np.zeros((n_seq, n_seq), dtype='int')
for i in range(n_seq):
    for j in range(i, n_seq):
        # task='distance' yields the same editDistance as task='path' without
        # building the alignment path, which is never read here.
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 80 25 ... 53 16 80]
 [80  0 83 ... 81 85 10]
 [25 83  0 ... 56 18 82]
 ...
 [53 81 56 ...  0 48 82]
 [16 85 18 ... 48  0 84]
 [80 10 82 ... 82 84  0]]


In [123]:
# Build the edge-weight matrix from the pairwise distances.
# A pair (i, j) with j >= i becomes an edge when its distance is strictly between
# 0 and dist_cutoff; the edge weight is that distance. All other entries are 0.
# Only the upper triangle is filled so each undirected pair is stored once.
dist_cutoff = 10  # exclusive upper bound on the edit distance of a kept edge
upper = np.triu(dist_matrix)
is_close = (upper > 0) & (upper < dist_cutoff)
edges = np.where(is_close, upper, 0)
# Number of kept edges.
count = int(np.count_nonzero(is_close))
print(count)

263097


In [124]:
# Export the graph as GraphML: one node per entry of `colors` (coloured through
# cluster_to_color) and one undirected weighted edge per positive entry in the
# strict upper triangle of `edges`.
# The `with` block guarantees the file is closed even if a write or a colour lookup
# raises; the explicit encoding matches the XML declaration written below.
# NOTE(review): node ids come from `colors` and edge endpoints from `sequences`;
# this assumes both have the same length — confirm against the input files.
with open('graph_seq2000_2_ur.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                # No trailing newline after the closing tag: output kept byte-identical.
                f.write('</edge>')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [125]:
# Load the cluster label of every sequence and the sequences themselves.
# The label is the last whitespace-separated token of each line in the labels file.
colors = []
with open('ur_centr_seq2000_3.txt', 'r') as f_in:
    for line in f_in:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        colors.append(int(fields[-1]))
# Keep only the sequence text of each FASTA record, in file order.
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_3.fa', 'fasta')]
# Show which cluster labels occur.
print(np.unique(colors))

[0 1 2 3 4 5]


In [126]:
# Pairwise edit distances between all sequences.
# Only entries with j >= i are computed; adding the transpose mirrors them into the
# lower triangle. The diagonal is doubled by that sum, which is harmless because a
# sequence aligned against itself has distance 0.
# NOTE(review): edlib's 'HW' (infix) mode treats query and target differently, so
# align(a, b) and align(b, a) can differ; only the i <= j ordering is computed and
# then mirrored — confirm that this is intended.
n_seq = len(sequences)
dist_matrix = np.zeros((n_seq, n_seq), dtype='int')
for i in range(n_seq):
    for j in range(i, n_seq):
        # task='distance' yields the same editDistance as task='path' without
        # building the alignment path, which is never read here.
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0  4 43 ... 44 20 81]
 [ 4  0 45 ... 46 22 82]
 [43 45  0 ... 12 53 82]
 ...
 [44 46 12 ...  0 55 80]
 [20 22 53 ... 55  0 82]
 [81 82 82 ... 80 82  0]]


In [127]:
# Build the edge-weight matrix from the pairwise distances.
# A pair (i, j) with j >= i becomes an edge when its distance is strictly between
# 0 and dist_cutoff; the edge weight is that distance. All other entries are 0.
# Only the upper triangle is filled so each undirected pair is stored once.
dist_cutoff = 10  # exclusive upper bound on the edit distance of a kept edge
upper = np.triu(dist_matrix)
is_close = (upper > 0) & (upper < dist_cutoff)
edges = np.where(is_close, upper, 0)
# Number of kept edges.
count = int(np.count_nonzero(is_close))
print(count)

295407


In [128]:
# Export the graph as GraphML: one node per entry of `colors` (coloured through
# cluster_to_color) and one undirected weighted edge per positive entry in the
# strict upper triangle of `edges`.
# The `with` block guarantees the file is closed even if a write or a colour lookup
# raises; the explicit encoding matches the XML declaration written below.
# NOTE(review): node ids come from `colors` and edge endpoints from `sequences`;
# this assumes both have the same length — confirm against the input files.
with open('graph_seq2000_3_ur.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                # No trailing newline after the closing tag: output kept byte-identical.
                f.write('</edge>')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

### Уральский новый

In [129]:
# Load the cluster label of every sequence and the sequences themselves.
# The label is the last whitespace-separated token of each line in the labels file.
colors = []
with open('ur_centr_10_seq1000_1.txt', 'r') as f_in:
    for line in f_in:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        colors.append(int(fields[-1]))
# Keep only the sequence text of each FASTA record, in file order.
sequences = [str(record.seq) for record in SeqIO.parse('seq1000_1.fa', 'fasta')]
# Show which cluster labels occur.
print(np.unique(colors))

[-1  0  1  2  3  4  5]


In [130]:
# Pairwise edit distances between all sequences.
# Only entries with j >= i are computed; adding the transpose mirrors them into the
# lower triangle. The diagonal is doubled by that sum, which is harmless because a
# sequence aligned against itself has distance 0.
# NOTE(review): edlib's 'HW' (infix) mode treats query and target differently, so
# align(a, b) and align(b, a) can differ; only the i <= j ordering is computed and
# then mirrored — confirm that this is intended.
n_seq = len(sequences)
dist_matrix = np.zeros((n_seq, n_seq), dtype='int')
for i in range(n_seq):
    for j in range(i, n_seq):
        # task='distance' yields the same editDistance as task='path' without
        # building the alignment path, which is never read here.
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 15 42 ... 48 42 43]
 [15  0 50 ... 56 51 51]
 [42 50  0 ... 19  2  1]
 ...
 [48 56 19 ...  0 22 21]
 [42 51  2 ... 22  0  3]
 [43 51  1 ... 21  3  0]]


In [131]:
# Build the edge-weight matrix from the pairwise distances.
# A pair (i, j) with j >= i becomes an edge when its distance is strictly between
# 0 and dist_cutoff; the edge weight is that distance. All other entries are 0.
# Only the upper triangle is filled so each undirected pair is stored once.
dist_cutoff = 10  # exclusive upper bound on the edit distance of a kept edge
upper = np.triu(dist_matrix)
is_close = (upper > 0) & (upper < dist_cutoff)
edges = np.where(is_close, upper, 0)
# Number of kept edges.
count = int(np.count_nonzero(is_close))
print(count)

79860


In [132]:
# Export the graph as GraphML: one node per entry of `colors` (coloured through
# cluster_to_color) and one undirected weighted edge per positive entry in the
# strict upper triangle of `edges`.
# The `with` block guarantees the file is closed even if a write or a colour lookup
# raises; the explicit encoding matches the XML declaration written below.
# NOTE(review): node ids come from `colors` and edge endpoints from `sequences`;
# this assumes both have the same length — confirm against the input files.
with open('graph_seq1000_1_ur_new.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                # No trailing newline after the closing tag: output kept byte-identical.
                f.write('</edge>')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [133]:
# Load the cluster label of every sequence and the sequences themselves.
# The label is the last whitespace-separated token of each line in the labels file.
colors = []
with open('ur_centr_10_seq2000_1.txt', 'r') as f_in:
    for line in f_in:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        colors.append(int(fields[-1]))
# Keep only the sequence text of each FASTA record, in file order.
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_1.fa', 'fasta')]
# Show which cluster labels occur.
print(np.unique(colors))

[-1  0  1  2  3  4  5]


In [134]:
# Pairwise edit distances between all sequences.
# Only entries with j >= i are computed; adding the transpose mirrors them into the
# lower triangle. The diagonal is doubled by that sum, which is harmless because a
# sequence aligned against itself has distance 0.
# NOTE(review): edlib's 'HW' (infix) mode treats query and target differently, so
# align(a, b) and align(b, a) can differ; only the i <= j ordering is computed and
# then mirrored — confirm that this is intended.
n_seq = len(sequences)
dist_matrix = np.zeros((n_seq, n_seq), dtype='int')
for i in range(n_seq):
    for j in range(i, n_seq):
        # task='distance' yields the same editDistance as task='path' without
        # building the alignment path, which is never read here.
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 45 15 ... 79  8 79]
 [45  0 52 ... 81 49 82]
 [15 52  0 ... 85 17 83]
 ...
 [79 81 85 ...  0 84 44]
 [ 8 49 17 ... 84  0 81]
 [79 82 83 ... 44 81  0]]


In [135]:
# Build the edge-weight matrix from the pairwise distances.
# A pair (i, j) with j >= i becomes an edge when its distance is strictly between
# 0 and dist_cutoff; the edge weight is that distance. All other entries are 0.
# Only the upper triangle is filled so each undirected pair is stored once.
dist_cutoff = 10  # exclusive upper bound on the edit distance of a kept edge
upper = np.triu(dist_matrix)
is_close = (upper > 0) & (upper < dist_cutoff)
edges = np.where(is_close, upper, 0)
# Number of kept edges.
count = int(np.count_nonzero(is_close))
print(count)

288689


In [136]:
# Export the graph as GraphML: one node per entry of `colors` (coloured through
# cluster_to_color) and one undirected weighted edge per positive entry in the
# strict upper triangle of `edges`.
# The `with` block guarantees the file is closed even if a write or a colour lookup
# raises; the explicit encoding matches the XML declaration written below.
# NOTE(review): node ids come from `colors` and edge endpoints from `sequences`;
# this assumes both have the same length — confirm against the input files.
with open('graph_seq2000_1_ur_new.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                # No trailing newline after the closing tag: output kept byte-identical.
                f.write('</edge>')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [137]:
# Load the cluster label of every sequence and the sequences themselves.
# The label is the last whitespace-separated token of each line in the labels file.
colors = []
with open('ur_centr_10_seq2000_2.txt', 'r') as f_in:
    for line in f_in:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        colors.append(int(fields[-1]))
# Keep only the sequence text of each FASTA record, in file order.
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_2.fa', 'fasta')]
# Show which cluster labels occur.
print(np.unique(colors))

[-1  0  1  2  3  4  5]


In [138]:
# Pairwise edit distances between all sequences.
# Only entries with j >= i are computed; adding the transpose mirrors them into the
# lower triangle. The diagonal is doubled by that sum, which is harmless because a
# sequence aligned against itself has distance 0.
# NOTE(review): edlib's 'HW' (infix) mode treats query and target differently, so
# align(a, b) and align(b, a) can differ; only the i <= j ordering is computed and
# then mirrored — confirm that this is intended.
n_seq = len(sequences)
dist_matrix = np.zeros((n_seq, n_seq), dtype='int')
for i in range(n_seq):
    for j in range(i, n_seq):
        # task='distance' yields the same editDistance as task='path' without
        # building the alignment path, which is never read here.
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0 80 25 ... 53 16 80]
 [80  0 83 ... 81 85 10]
 [25 83  0 ... 56 18 82]
 ...
 [53 81 56 ...  0 48 82]
 [16 85 18 ... 48  0 84]
 [80 10 82 ... 82 84  0]]


In [139]:
# Build the edge-weight matrix from the pairwise distances.
# A pair (i, j) with j >= i becomes an edge when its distance is strictly between
# 0 and dist_cutoff; the edge weight is that distance. All other entries are 0.
# Only the upper triangle is filled so each undirected pair is stored once.
dist_cutoff = 10  # exclusive upper bound on the edit distance of a kept edge
upper = np.triu(dist_matrix)
is_close = (upper > 0) & (upper < dist_cutoff)
edges = np.where(is_close, upper, 0)
# Number of kept edges.
count = int(np.count_nonzero(is_close))
print(count)

263097


In [140]:
# Export the graph as GraphML: one node per entry of `colors` (coloured through
# cluster_to_color) and one undirected weighted edge per positive entry in the
# strict upper triangle of `edges`.
# The `with` block guarantees the file is closed even if a write or a colour lookup
# raises; the explicit encoding matches the XML declaration written below.
# NOTE(review): node ids come from `colors` and edge endpoints from `sequences`;
# this assumes both have the same length — confirm against the input files.
with open('graph_seq2000_2_ur_new.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                # No trailing newline after the closing tag: output kept byte-identical.
                f.write('</edge>')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')

In [141]:
# Load the cluster label of every sequence and the sequences themselves.
# The label is the last whitespace-separated token of each line in the labels file.
colors = []
with open('ur_centr_10_seq2000_3.txt', 'r') as f_in:
    for line in f_in:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        colors.append(int(fields[-1]))
# Keep only the sequence text of each FASTA record, in file order.
sequences = [str(record.seq) for record in SeqIO.parse('seq2000_3.fa', 'fasta')]
# Show which cluster labels occur.
print(np.unique(colors))

[-1  0  1  2  3  4  5]


In [142]:
# Pairwise edit distances between all sequences.
# Only entries with j >= i are computed; adding the transpose mirrors them into the
# lower triangle. The diagonal is doubled by that sum, which is harmless because a
# sequence aligned against itself has distance 0.
# NOTE(review): edlib's 'HW' (infix) mode treats query and target differently, so
# align(a, b) and align(b, a) can differ; only the i <= j ordering is computed and
# then mirrored — confirm that this is intended.
n_seq = len(sequences)
dist_matrix = np.zeros((n_seq, n_seq), dtype='int')
for i in range(n_seq):
    for j in range(i, n_seq):
        # task='distance' yields the same editDistance as task='path' without
        # building the alignment path, which is never read here.
        dist_matrix[i, j] = edlib.align(sequences[i], sequences[j], mode='HW', task='distance')['editDistance']
dist_matrix = dist_matrix + dist_matrix.T
print(dist_matrix)

[[ 0  4 43 ... 44 20 81]
 [ 4  0 45 ... 46 22 82]
 [43 45  0 ... 12 53 82]
 ...
 [44 46 12 ...  0 55 80]
 [20 22 53 ... 55  0 82]
 [81 82 82 ... 80 82  0]]


In [143]:
# Build the edge-weight matrix from the pairwise distances.
# A pair (i, j) with j >= i becomes an edge when its distance is strictly between
# 0 and dist_cutoff; the edge weight is that distance. All other entries are 0.
# Only the upper triangle is filled so each undirected pair is stored once.
dist_cutoff = 10  # exclusive upper bound on the edit distance of a kept edge
upper = np.triu(dist_matrix)
is_close = (upper > 0) & (upper < dist_cutoff)
edges = np.where(is_close, upper, 0)
# Number of kept edges.
count = int(np.count_nonzero(is_close))
print(count)

295407


In [144]:
# Export the graph as GraphML: one node per entry of `colors` (coloured through
# cluster_to_color) and one undirected weighted edge per positive entry in the
# strict upper triangle of `edges`.
# The `with` block guarantees the file is closed even if a write or a colour lookup
# raises; the explicit encoding matches the XML declaration written below.
# NOTE(review): node ids come from `colors` and edge endpoints from `sequences`;
# this assumes both have the same length — confirm against the input files.
with open('graph_seq2000_3_ur_new.graphml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    f.write('xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">\n')
    f.write('<key id="d0" for="node" attr.name="color" attr.type="string">\n')
    f.write('<default>yellow</default>\n')
    f.write('</key>\n')
    f.write('<key id="d1" for="edge" attr.name="weight" attr.type="double"/>\n')
    f.write('<graph id="G" edgedefault="undirected">\n')
    for i, label in enumerate(colors):
        f.write(f'<node id="n{i}">\n')
        f.write(f'<data key="d0">{cluster_to_color(label)}</data>\n')
        f.write('</node>\n')
    count = 0  # running edge id
    for i in range(len(sequences)):
        for j in range(i + 1, len(sequences)):
            if edges[i, j] > 0:
                f.write(f'<edge id="e{count}" source="n{i}" target="n{j}">\n')
                f.write(f'<data key="d1">{edges[i, j]}</data>\n')
                # No trailing newline after the closing tag: output kept byte-identical.
                f.write('</edge>')
                count += 1
    f.write('</graph>\n')
    f.write('</graphml>')