In [1]:
import my_kmeans
import random_forest

In [33]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
from sklearn.cluster import KMeans
import numpy as np

file = 'data/good_introns50+3.fasta'
with open(file, "r") as handle:
    sequences = list(SimpleFastaParser(handle))

types = np.zeros((len(sequences),), dtype=int)
for i, s in enumerate(sequences):
    left_anchor = s[1][3:5]
    right_anchor = s[1][-5:-3]
    if left_anchor == 'GT' and right_anchor == 'AG' or left_anchor == 'CT' and right_anchor == 'AC':
        # 0  forcovetional intron
        types[i] = (0)
    else:
        # 1 for nonconventional
        types[i] = (1)

### KMeans clustering of the introns with two different ways of sequence representation, using 4- and 7-mers converted to TF-IDF and different number of clusters

In [8]:
seq4 = my_kmeans.preprocess(0, 4, sequences)
clusters4 = np.zeros((7, len(sequences)))
for x in range(1,8):
    n_clusters = 2**x
    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=10, random_state=0, n_jobs=12)
    kmeans_model.fit(seq4)
    clusters4[x - 1] = kmeans_model.labels_
print(clusters4)

[[  0.   1.   1. ...   1.   0.   0.]
 [  1.   3.   3. ...   3.   3.   1.]
 [  0.   1.   1. ...   1.   3.   6.]
 ...
 [  3.  18.   7. ...  23.   8.  13.]
 [ 25.  44.  20. ...  43.  37.  61.]
 [ 73.  20. 111. ...  32. 101.  60.]]


In [9]:
seq7 = my_kmeans.preprocess(0, 7, sequences)
clusters7 = np.zeros((7, len(sequences)))
for x in range(1,8):
    n_clusters = 2**x
    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=10, random_state=0, n_jobs=12)
    kmeans_model.fit(seq4)
    clusters7[x - 1] = kmeans_model.labels_
print(clusters7)

[[  0.   1.   1. ...   1.   0.   0.]
 [  1.   3.   3. ...   3.   3.   1.]
 [  0.   1.   1. ...   1.   3.   6.]
 ...
 [  3.  18.   7. ...  23.   8.  13.]
 [ 25.  44.  20. ...  43.  37.  61.]
 [ 73.  20. 111. ...  32. 101.  60.]]


In [29]:
def assess_cluster(clusters, n_of_clusters, true_classes):
    clus_dict = dict([(x, [0, 0]) for x in range(n_of_clusters)])
    for i, x in enumerate(clusters):
        true_type = types[i]
        clus_dict[int(x)][true_type] += 1    
    
    results = [[], []]
    for i, x in enumerate(types):
        cluster = clusters[i]
        res = clus_dict[cluster][x] / sum(clus_dict[cluster])
        results[int(x)].append(res)
    print('conv: ')
    print(sum(results[0]) / len(results[0]))
    print('nonconv: ')
    print(sum(results[1]) / len(results[1]))
    return clus_dict

### Each clustering is evaluated by measuring homogeneity of the clusters

In [30]:
for x in range(1,8):
    n_of_clusters =  2**x
    print('Word length: 4, number of clusters: %d' % (n_of_clusters))
    assess_cluster(clusters4[x-1], n_of_clusters, types)
    print('\n')

for x in range(1,8):
    n_of_clusters =  2**x
    print('Word length: 7, number of clusters: %d' % (n_of_clusters))
    assess_cluster(clusters7[x-1], n_of_clusters, types)
    print('\n') 

Word length: 4, number of clusters: 2
conv: 
0.5034937552404394
nonconv: 
0.5019001466579418


Word length: 4, number of clusters: 4
conv: 
0.5034765844025446
nonconv: 
0.501882920707725


Word length: 4, number of clusters: 8
conv: 
0.5076849803710132
nonconv: 
0.5061048241314562


Word length: 4, number of clusters: 16
conv: 
0.5136950724385081
nonconv: 
0.5121342064584714


Word length: 4, number of clusters: 32
conv: 
0.5165936317112962
nonconv: 
0.5150420690762714


Word length: 4, number of clusters: 64
conv: 
0.5284029415781221
nonconv: 
0.5268892826306588


Word length: 4, number of clusters: 128
conv: 
0.5286695079220537
nonconv: 
0.5271567045577814


Word length: 7, number of clusters: 2
conv: 
0.5034937552404394
nonconv: 
0.5019001466579418


Word length: 7, number of clusters: 4
conv: 
0.5034765844025446
nonconv: 
0.501882920707725


Word length: 7, number of clusters: 8
conv: 
0.5076849803710132
nonconv: 
0.5061048241314562


Word length: 7, number of clusters: 16
conv: 
0

### Then the sequences are classified by random forests. K-mers ranging from 4 to 11 are used for representation and for each length the classification is done twice - using whole sequences or without the junctions that were used to divide them into conventional or nonconventional.

In [32]:
file = 'data/good_introns50+3.fasta'
with open(file, "r") as handle:
    sequences = list(SimpleFastaParser(handle))

types = []
for s in sequences:
    left_anchor = s[1][3:5]
    right_anchor = s[1][-5:-3]
    if left_anchor == 'GT' and right_anchor == 'AG' or left_anchor == 'CT' and right_anchor == 'AC':
        types.append(0)
    else:
        types.append(1)


for n in range(4, 12):
    print('\nlen of ogligonucleotides: ', n)
    for cut in range(2):
        if cut == 0:
            print('uncut sequences: ')
        if cut == 1:
            print('conventional splices cut: ')
        data = random_forest.preprocess(cut, n, sequences)
        acc = random_forest.forest(10, data, types)
        print(str(acc))


len of ogligonucleotides:  4
uncut sequences: 
0.645330207112153
conventional splices cut: 
0.6483261690764621

len of ogligonucleotides:  5
uncut sequences: 
0.6641266119577961
conventional splices cut: 
0.66140419434675

len of ogligonucleotides:  6
uncut sequences: 
0.7303764491337761
conventional splices cut: 
0.7254005470887066

len of ogligonucleotides:  7
uncut sequences: 
0.819304415787417
conventional splices cut: 
0.8147192913898659

len of ogligonucleotides:  8
uncut sequences: 
0.8461247883287744
conventional splices cut: 
0.8500195388823759

len of ogligonucleotides:  9
uncut sequences: 
0.8514523902566108
conventional splices cut: 
0.8529112934740132

len of ogligonucleotides:  10
uncut sequences: 
0.8625113976813858
conventional splices cut: 
0.85761365116582

len of ogligonucleotides:  11
uncut sequences: 
0.8722157092614303
conventional splices cut: 
0.8688680474143545
