In [251]:
import edlib
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [279]:
def radius(consensus_strings, X, labels):
    radii = []
    for label in np.unique(labels):
        one_cluster_points = [X[i] for i in range(len(X)) if labels[i] == label]
        consensus = consensus_strings[label]
        distances = [edlib.align(one_cluster_points[i], consensus, mode='HW', task='path')['editDistance'] for i in range(len(one_cluster_points))]
        radii.append(np.max(distances))
    return radii

In [280]:
#############
def mean_square(consensus_strings, X, labels):
    results = []
    for label in np.unique(labels):
        one_cluster_points = [X[i] for i in range(len(X)) if labels[i] == label]
        consensus = consensus_strings[label]
        distances = np.array([edlib.align(one_cluster_points[i], consensus, mode='HW', task='path')['editDistance'] for i in range(len(one_cluster_points))])
        results.append(np.round(np.sqrt((distances**2).mean()), 2))
    return results

In [254]:
def min_dist_between_clusters(X, labels):
    result = 0
    distances = []
    one_cluster_points = [[X[i] for i in range(len(X)) if labels[i] == label] for label in np.unique(labels)]
    for i in range(len(one_cluster_points)):
        for j in range(i + 1, len(one_cluster_points)):
            distances.append(np.min([edlib.align(one_cluster_points[i][k], one_cluster_points[j][l], mode='HW', task='path')['editDistance'] for k in range(len(one_cluster_points[i])) for l in range(len(one_cluster_points[j]))]))
    distances = np.array(distances)
    result = np.min(distances[np.nonzero(distances)])
    return result

In [255]:
###########
def centromere_cluster_distribution(X, labels, chromosomes):
    chromosomes = np.array(chromosomes)
    chromo_distribution = []
    for label in np.unique(labels):
        one_cluster_points_indices = [i for i in range(len(X)) if labels[i] == label]
        one_cluster_chromo = chromosomes[one_cluster_points_indices]
        cluster_chromo_distr = {}
        for chromo in np.unique(chromosomes):
            cluster_chromo_distr[chromo] = len([one_cluster_chromo[i] for i in range(len(one_cluster_chromo)) if one_cluster_chromo[i] == chromo])
        chromo_distribution.append(cluster_chromo_distr)
    return chromo_distribution

In [256]:
def consensus_distance(consensus_strings):
    distances = []
    for i in range(len(consensus_strings)):
        for j in range(i + 1, len(consensus_strings)):
            distances.append(edlib.align(consensus_strings[i], consensus_strings[j], mode='HW', task='path')['editDistance'])
    return distances

In [257]:
#################
def min_consensus_distance(consensus_strings):
    min_distances = []
    distances = []
    for i in range(len(consensus_strings)):
        for j in range(i + 1, len(consensus_strings)):
            distances.append(edlib.align(consensus_strings[i], consensus_strings[j], mode='HW', task='path')['editDistance'])
        min_distances.append(np.min(distances[(i + 1):]))
    return min_distances

In [258]:
###############
def edge_density(X, labels):
    density = []
    dist_matrix = np.zeros((len(X), len(X)), dtype='int')
    for i in range(len(X)):
        for j in range(i, len(X)):
            dist_matrix[i, j] = edlib.align(X[i], X[j], mode='HW', task='path')['editDistance']
    dist_matrix = dist_matrix + dist_matrix.T
    edges = np.zeros((len(X), len(X)), dtype='float')
    for i in range(len(X)):
            for j in range(i, len(X)):
                if 0 < dist_matrix[i, j] < 10:
                    edges[i, j] = dist_matrix[i, j]
    for label in np.unique(labels):
        one_cluster_points_indices = np.array([i for i in range(len(X)) if labels[i] == label])
        one_cluster_edges = edges[np.ix_(one_cluster_points_indices, one_cluster_points_indices)]
        count = len(one_cluster_edges[np.nonzero(one_cluster_edges)])
        max_edges = (len(one_cluster_points_indices) * (len(one_cluster_points_indices) - 1)) / 2
        density.append(np.round(count / max_edges, 2))
    return density

### DBSCAN5 1000

In [259]:
f_in = open('/Users/olga/Downloads/dbscan5_labels_seq1000_1.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq1000_1.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/dbscan5_consensuses_seq1000_1.txt', 'r')
consensus_strings = []
c = 0
for line in f:
    c += 1
    if c % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]


In [260]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[67.84, 1.65, 1.99, 7.99, 6.51, 2.43, 1.94, 5.01, 5.38, 5.84, 2.28, 2.34, 10.27, 4.06, 1.2, 1.1, 1.83, 2.37]


In [261]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 8, 5: 4, 19: 24},
 {1: 0, 5: 37, 19: 0},
 {1: 0, 5: 23, 19: 0},
 {1: 90, 5: 62, 19: 177},
 {1: 88, 5: 29, 19: 159},
 {1: 9, 5: 0, 19: 0},
 {1: 9, 5: 0, 19: 0},
 {1: 15, 5: 18, 19: 2},
 {1: 18, 5: 14, 19: 4},
 {1: 55, 5: 0, 19: 0},
 {1: 0, 5: 0, 19: 5},
 {1: 15, 5: 0, 19: 2},
 {1: 47, 5: 0, 19: 0},
 {1: 14, 5: 33, 19: 3},
 {1: 9, 5: 0, 19: 0},
 {1: 10, 5: 0, 19: 0},
 {1: 0, 5: 0, 19: 9},
 {1: 8, 5: 0, 19: 0}]

In [262]:
min_consensus_distance(consensus_strings)

[61, 11, 11, 11, 9, 9, 9, 9, 9, 9, 6, 6, 6, 6, 6, 6, 6, 6]

In [263]:
edge_density(sequences, labels)

[0.03,
 0.8,
 0.73,
 0.8,
 0.76,
 0.92,
 0.92,
 0.89,
 0.91,
 0.87,
 1.0,
 0.91,
 0.69,
 0.93,
 0.69,
 0.53,
 0.89,
 0.89]

### DBSCAN5 2000_1

In [264]:
f_in = open('/Users/olga/Downloads/dbscan5_labels_seq2000_1.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_1.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/dbscan5_consensuses_seq2000_1.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


In [265]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[77.07, 7.62, 8.96, 2.76, 5.86, 7.06, 16.48, 1.7, 2.39, 5.96, 3.86, 1.6, 2.82, 1.35, 2.0, 1.54]


In [266]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 11, 5: 2, 19: 33},
 {1: 158, 5: 61, 19: 320},
 {1: 176, 5: 104, 19: 342},
 {1: 2, 5: 54, 19: 0},
 {1: 105, 5: 0, 19: 0},
 {1: 41, 5: 99, 19: 7},
 {1: 95, 5: 0, 19: 0},
 {1: 0, 5: 89, 19: 0},
 {1: 26, 5: 0, 19: 0},
 {1: 43, 5: 28, 19: 7},
 {1: 31, 5: 28, 19: 1},
 {1: 31, 5: 0, 19: 0},
 {1: 27, 5: 0, 19: 0},
 {1: 24, 5: 0, 19: 0},
 {1: 34, 5: 0, 19: 0},
 {1: 21, 5: 0, 19: 0}]

In [267]:
min_consensus_distance(consensus_strings)

[77, 13, 13, 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11]

In [268]:
edge_density(sequences, labels)

[0.03,
 0.71,
 0.79,
 0.91,
 0.86,
 0.92,
 0.68,
 0.88,
 0.9,
 0.92,
 0.87,
 0.83,
 0.96,
 0.71,
 0.93,
 0.97]

### DBSCAN5 2000_2

In [269]:
f_in = open('/Users/olga/Downloads/dbscan5_labels_seq2000_2.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_2.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/dbscan5_consensuses_seq2000_2.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [270]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[64.89, 6.72, 2.17, 7.12, 5.4, 1.92, 6.13, 10.28, 1.97, 2.15, 5.18, 4.86, 2.57, 1.02, 2.28, 1.29, 1.59]


In [271]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 17, 5: 6, 19: 31},
 {1: 165, 5: 52, 19: 311},
 {1: 29, 5: 0, 19: 0},
 {1: 147, 5: 106, 19: 326},
 {1: 119, 5: 0, 19: 0},
 {1: 0, 5: 100, 19: 0},
 {1: 42, 5: 85, 19: 2},
 {1: 92, 5: 0, 19: 0},
 {1: 33, 5: 0, 19: 0},
 {1: 33, 5: 0, 19: 0},
 {1: 49, 5: 30, 19: 10},
 {1: 41, 5: 20, 19: 3},
 {1: 0, 5: 54, 19: 0},
 {1: 20, 5: 0, 19: 0},
 {1: 0, 5: 0, 19: 15},
 {1: 35, 5: 0, 19: 0},
 {1: 27, 5: 0, 19: 0}]

In [272]:
min_consensus_distance(consensus_strings)

[66, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

In [273]:
edge_density(sequences, labels)

[0.03,
 0.71,
 0.79,
 0.76,
 0.84,
 0.81,
 0.94,
 0.68,
 0.8,
 0.89,
 0.92,
 0.91,
 0.92,
 0.45,
 1.0,
 0.91,
 0.94]

### DBSCAN5 2000_3

In [274]:
f_in = open('/Users/olga/Downloads/dbscan5_labels_seq2000_3.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_3.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/dbscan5_consensuses_seq2000_3.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13]


In [275]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[43.86, 7.09, 1.98, 6.96, 6.12, 0.9, 5.52, 6.03, 2.14, 1.43, 9.46, 5.23, 0.87, 1.67, 1.88]


In [276]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 6, 5: 4, 19: 17},
 {1: 153, 5: 116, 19: 358},
 {1: 50, 5: 0, 19: 0},
 {1: 158, 5: 82, 19: 350},
 {1: 39, 5: 21, 19: 4},
 {1: 27, 5: 0, 19: 0},
 {1: 41, 5: 25, 19: 8},
 {1: 43, 5: 80, 19: 14},
 {1: 0, 5: 48, 19: 0},
 {1: 0, 5: 76, 19: 0},
 {1: 104, 5: 0, 19: 0},
 {1: 101, 5: 0, 19: 0},
 {1: 25, 5: 0, 19: 0},
 {1: 31, 5: 0, 19: 0},
 {1: 19, 5: 0, 19: 0}]

In [277]:
min_consensus_distance(consensus_strings)

[28, 22, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]

In [278]:
edge_density(sequences, labels)

[0.04,
 0.78,
 0.87,
 0.66,
 0.9,
 0.66,
 0.91,
 0.91,
 0.87,
 0.84,
 0.54,
 0.89,
 0.76,
 0.77,
 0.92]

### Уральский 1000

In [336]:
f_in = open('/Users/olga/Downloads/ur_centr_seq1000_1.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq1000_1.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
consensus_strings = []
for record in SeqIO.parse('/Users/olga/Downloads/cen1_mn.fa', 'fasta'):
    consensus_strings.append(record.seq)
print(np.unique(labels))

[0 1 2 3 4 5]


In [337]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[60.17, 13.84, 22.88, 36.79, 16.24, 11.86]


In [338]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 76, 5: 34, 19: 4},
 {1: 17, 5: 42, 19: 2},
 {1: 21, 5: 14, 19: 4},
 {1: 161, 5: 31, 19: 185},
 {1: 104, 5: 62, 19: 185},
 {1: 16, 5: 37, 19: 5}]

In [339]:
min_consensus_distance(consensus_strings)

[16, 16, 14, 14, 14, 14]

In [340]:
edge_density(sequences, labels)

[0.3, 0.43, 0.77, 0.45, 0.71, 0.4]

### Уральский 2000_1

In [341]:
f_in = open('/Users/olga/Downloads/ur_centr_seq2000_1.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_1.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
consensus_strings = []
for record in SeqIO.parse('/Users/olga/Downloads/cen1_mn.fa', 'fasta'):
    consensus_strings.append(record.seq)
print(np.unique(labels))

[0 1 2 3 4 5]


In [342]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[54.85, 14.72, 33.72, 40.03, 18.76, 12.22]


In [343]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 162, 5: 99, 19: 7},
 {1: 35, 5: 84, 19: 2},
 {1: 62, 5: 28, 19: 7},
 {1: 314, 5: 61, 19: 339},
 {1: 214, 5: 105, 19: 349},
 {1: 38, 5: 88, 19: 6}]

In [344]:
min_consensus_distance(consensus_strings)

[16, 16, 14, 14, 14, 14]

In [345]:
edge_density(sequences, labels)

[0.35, 0.42, 0.61, 0.43, 0.69, 0.44]

### Уральский 2000_2

In [291]:
f_in = open('/Users/olga/Downloads/ur_centr_seq2000_2.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_2.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/cen1_mn.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[0 1 2 3 4 5]


In [292]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[113.42, 121.68, 115.79, 122.91, 113.1, 122.14]


In [293]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 174, 5: 85, 19: 2},
 {1: 44, 5: 76, 19: 5},
 {1: 72, 5: 31, 19: 13},
 {1: 337, 5: 56, 19: 339},
 {1: 188, 5: 107, 19: 332},
 {1: 34, 5: 98, 19: 7}]

In [294]:
min_consensus_distance(consensus_strings)

[5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1]

In [295]:
edge_density(sequences, labels)

[0.31, 0.44, 0.58, 0.41, 0.66, 0.44]

### Уральский 2000_3

In [296]:
f_in = open('/Users/olga/Downloads/ur_centr_seq2000_3.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_3.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/cen1_mn.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[0 1 2 3 4 5]


In [297]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[113.16, 121.72, 116.12, 123.0, 113.11, 122.16]


In [298]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 167, 5: 80, 19: 14},
 {1: 43, 5: 73, 19: 9},
 {1: 52, 5: 21, 19: 5},
 {1: 301, 5: 84, 19: 358},
 {1: 181, 5: 117, 19: 360},
 {1: 53, 5: 77, 19: 5}]

In [299]:
min_consensus_distance(consensus_strings)

[5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1]

In [300]:
edge_density(sequences, labels)

[0.31, 0.45, 0.63, 0.44, 0.71, 0.39]

### Affinity prop 1000

In [301]:
f_in = open('/Users/olga/Downloads/affinity_labels_seq1000_1.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq1000_1.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/affinity_consensuses_seq1000_1.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[0 1 2 3 4 5]


In [302]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[19.7, 37.26, 57.77, 25.67, 29.39, 28.21]


In [303]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 79, 5: 10, 19: 166},
 {1: 117, 5: 120, 19: 174},
 {1: 101, 5: 0, 19: 1},
 {1: 45, 5: 1, 19: 0},
 {1: 34, 5: 44, 19: 41},
 {1: 19, 5: 45, 19: 3}]

In [304]:
min_consensus_distance(consensus_strings)

[45, 21, 21, 21, 21, 21]

In [305]:
edge_density(sequences, labels)

[0.6, 0.4, 0.27, 0.33, 0.25, 0.27]

### Affinity prop 2000_1

In [315]:
f_in = open('/Users/olga/Downloads/affinity_labels_seq2000_1.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_1.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/affinity_consensuses_seq2000_1.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[0 1 2 3 4 5 6 7]


In [316]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[28.91, 23.17, 36.44, 68.52, 14.24, 24.48, 31.94, 14.5]


In [317]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 119, 5: 1, 19: 0},
 {1: 70, 5: 1, 19: 0},
 {1: 67, 5: 0, 19: 0},
 {1: 407, 5: 374, 19: 564},
 {1: 42, 5: 10, 19: 50},
 {1: 25, 5: 10, 19: 22},
 {1: 20, 5: 69, 19: 73},
 {1: 75, 5: 0, 19: 1}]

In [318]:
min_consensus_distance(consensus_strings)

[34, 23, 23, 23, 23, 23, 23, 23]

In [309]:
edge_density(sequences, labels)

[0.42, 0.35, 0.28, 0.23, 0.6, 0.45, 0.26, 0.71]

### Affinity prop 2000_2

In [319]:
f_in = open('/Users/olga/Downloads/affinity_labels_seq2000_2.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_2.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/affinity_consensuses_seq2000_2.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[0 1 2 3 4 5 6 7 8 9]


In [320]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[33.03, 33.67, 23.7, 59.44, 35.58, 58.82, 22.54, 18.51, 19.46, 34.39]


In [321]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 87, 5: 102, 19: 112},
 {1: 130, 5: 197, 19: 219},
 {1: 45, 5: 34, 19: 38},
 {1: 34, 5: 59, 19: 89},
 {1: 181, 5: 0, 19: 1},
 {1: 117, 5: 51, 19: 126},
 {1: 88, 5: 0, 19: 1},
 {1: 78, 5: 8, 19: 112},
 {1: 32, 5: 1, 19: 0},
 {1: 57, 5: 1, 19: 0}]

In [322]:
min_consensus_distance(consensus_strings)

[39, 23, 23, 23, 23, 23, 23, 23, 23, 27]

In [323]:
edge_density(sequences, labels)

[0.28, 0.29, 0.36, 0.35, 0.4, 0.31, 0.63, 0.74, 0.77, 0.27]

### Affinity prop 2000_3

In [324]:
f_in = open('/Users/olga/Downloads/affinity_labels_seq2000_3.txt', 'r')
labels = []
sequences = []
chromosomes = []
for line in f_in:
    labels.append(int(line.strip().split()[-1]))
for record in SeqIO.parse('/Users/olga/Downloads/seq2000_3.fa', 'fasta'):
    sequences.append(str(record.seq))
    chromosomes.append(int(record.id.split('_')[1][3:]))
f_in.close()
f = open('/Users/olga/Downloads/affinity_consensuses_seq2000_3.txt', 'r')
consensus_strings = []
count = 0
for line in f:
    count += 1
    if count % 2 == 0:
        consensus_strings.append(line.strip())
f.close()
print(np.unique(labels))

[0 1 2 3 4 5 6 7]


In [325]:
ms = mean_square(consensus_strings, sequences, labels)
print(ms)

[16.77, 9.5, 27.05, 59.34, 26.54, 20.11, 24.9, 17.52]


In [326]:
centromere_cluster_distribution(sequences, labels, chromosomes)

[{1: 146, 5: 223, 19: 363},
 {1: 77, 5: 23, 19: 197},
 {1: 58, 5: 3, 19: 55},
 {1: 130, 5: 48, 19: 45},
 {1: 143, 5: 1, 19: 0},
 {1: 8, 5: 57, 19: 14},
 {1: 75, 5: 96, 19: 77},
 {1: 160, 5: 1, 19: 0}]

In [327]:
min_consensus_distance(consensus_strings)

[54, 27, 27, 27, 27, 27, 27, 27]

In [328]:
edge_density(sequences, labels)

[0.47, 0.72, 0.29, 0.21, 0.46, 0.37, 0.27, 0.27]