In [1]:
import sys  
sys.path.insert(0, '/scratch/szym/introns/noncanonical_introns')

import kmeans
import random_forest

In [2]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
from sklearn.cluster import KMeans
import numpy as np

import time

In [3]:
#head -n 20000 selected_introns.fasta > subset.fasta

started = time.time()
file = '/scratch/szym/introns/all_introns.fasta'
with open(file, "r") as handle:
    sequences = list(SimpleFastaParser(handle))

ended = time.time()
types = np.zeros((len(sequences),), dtype=int)
for i, s in enumerate(sequences):
    #left_anchor = s[1][3:5]
    #right_anchor = s[1][-5:-3]
    #if left_anchor == 'GT' and right_anchor == 'AG' or left_anchor == 'CT' and right_anchor == 'AC':
    class_signature=s[0][-2:]
    if class_signature == "KX":    
        # 0  for conventional intron
        types[i] = (0)
    else:
        # 1 for nonconventional
        types[i] = (1)

print(sum(types), len(types), "read: ", ended-started, "seconds, all:", time.time()-started, "seconds")

1637851 1910948 read:  6.31156849861145 seconds, all: 7.870845317840576 seconds


### KMeans clustering of the introns with two different ways of sequence representation, using 4- and 7-mers converted to TF-IDF and different number of clusters

In [4]:
_,_,seq4, _ = kmeans.preprocess(0, 4, sequences)
clusters4 = np.zeros((7, len(sequences)))
for x in range(1,8):
    n_clusters = 2**x
    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=10, random_state=0, n_jobs=12)
    kmeans_model.fit(seq4)
    clusters4[x - 1] = kmeans_model.labels_
print(clusters4)

<class 'scipy.sparse.csr.csr_matrix'>




[[  1.   1.   1. ...   1.   1.   1.]
 [  3.   3.   3. ...   3.   3.   3.]
 [  1.   1.   1. ...   5.   5.   1.]
 ...
 [ 20.  26.  26. ...   0.   0.  29.]
 [ 11.  26.  26. ...   8.   8.  32.]
 [100.  43.  43. ...  61.  61.  93.]]


In [5]:
_,_,seq7, _ = kmeans.preprocess(0, 7, sequences)
clusters7 = np.zeros((7, len(sequences)))
for x in range(1,8):
    n_clusters = 2**x
    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=10, random_state=0, n_jobs=12)
    kmeans_model.fit(seq4)
    clusters7[x - 1] = kmeans_model.labels_
print(clusters7)

<class 'scipy.sparse.csr.csr_matrix'>




[[  1.   1.   1. ...   1.   1.   1.]
 [  3.   3.   3. ...   3.   3.   3.]
 [  1.   1.   1. ...   5.   5.   1.]
 ...
 [ 20.  26.  26. ...   0.   0.  29.]
 [ 11.  26.  26. ...   8.   8.  32.]
 [100.  43.  43. ...  61.  61.  93.]]


In [6]:
def assess_cluster(clusters, n_of_clusters, true_classes):
    clus_dict = dict([(x, [0, 0]) for x in range(n_of_clusters)])
    for i, x in enumerate(clusters):
        true_type = types[i]
        clus_dict[int(x)][true_type] += 1    
    
    results = [[], []]
    for i, x in enumerate(types):
        cluster = clusters[i]
        res = clus_dict[cluster][x] / sum(clus_dict[cluster])
        results[int(x)].append(res)
    print('conv: ')
    print(sum(results[0]) / len(results[0]))
    print('nonconv: ')
    print(sum(results[1]) / len(results[1]))
    return clus_dict

### Each clustering is evaluated by measuring homogeneity of the clusters

In [7]:
for x in range(1,8):
    n_of_clusters =  2**x
    print('Word length: 4, number of clusters: %d' % (n_of_clusters))
    assess_cluster(clusters4[x-1], n_of_clusters, types)
    print('\n')

for x in range(1,8):
    n_of_clusters =  2**x
    print('Word length: 7, number of clusters: %d' % (n_of_clusters))
    assess_cluster(clusters7[x-1], n_of_clusters, types)
    print('\n') 

Word length: 4, number of clusters: 2
conv: 
0.327519030929496
nonconv: 
0.6748827357661099


Word length: 4, number of clusters: 4
conv: 
0.32790181655515566
nonconv: 
0.6750677970854801


Word length: 4, number of clusters: 8
conv: 
0.33194712671173565
nonconv: 
0.6770235404173909


Word length: 4, number of clusters: 16
conv: 
0.35241632118945776
nonconv: 
0.6869195654586138


Word length: 4, number of clusters: 32
conv: 
0.3638433227023758
nonconv: 
0.6924440570667593


Word length: 4, number of clusters: 64
conv: 
0.37339158179982307
nonconv: 
0.6970602529425365


Word length: 4, number of clusters: 128
conv: 
0.40776065177765697
nonconv: 
0.7136763038337576


Word length: 7, number of clusters: 2
conv: 
0.327519030929496
nonconv: 
0.6748827357661099


Word length: 7, number of clusters: 4
conv: 
0.32790181655515566
nonconv: 
0.6750677970854801


Word length: 7, number of clusters: 8
conv: 
0.33194712671173565
nonconv: 
0.6770235404173909


Word length: 7, number of clusters: 16
c

### Then the sequences are classified by random forests. K-mers ranging from 4 to 11 are used for representation and for each length the classification is done twice - using whole sequences or without the junctions that were used to divide them into conventional or nonconventional.

In [8]:
#file = 'data/good_introns50+3.fasta'
#with open(file, "r") as handle:
#    sequences = list(SimpleFastaParser(handle))

#types = []
#for s in sequences:
#    left_anchor = s[1][3:5]
#    right_anchor = s[1][-5:-3]
#    if left_anchor == 'GT' and right_anchor == 'AG' or left_anchor == 'CT' and right_anchor == 'AC':
#        types.append(0)
#    else:
#        types.append(1)


for n in range(4, 6):
    print('\nlen of ogligonucleotides: ', n)
    for cut in range(2):
        if cut == 0:
            print('uncut sequences: ')
        if cut == 1:
            print('conventional splices cut: ')
        data = random_forest.preprocess(cut, n, sequences)
        acc = random_forest.forest(10, data, types)
        print(str(acc))
        


len of ogligonucleotides:  4
uncut sequences: 
0.8581666666666665
conventional splices cut: 
0.8604666666666667

len of ogligonucleotides:  5
uncut sequences: 
0.8584333333333335
conventional splices cut: 
0.8606


In [10]:
###VISUALISATION OF WHAT IS HAPPENNING
started = time.time()

sequences_cut, split_seqs, seq4, vectorizer = kmeans.preprocess(0, 7, sequences)

print(time.time()-started, "seconds")

print(split_seqs[1])
print(sequences_cut[1])
print(seq4)

<class 'scipy.sparse.csr.csr_matrix'>
2074.7414848804474 seconds
GTGCGTT TGCGTTG GCGTTGC CGTTGCC GTTGCCT TTGCCTG TGCCTGT GCCTGTG CCTGTGT CTGTGTG TGTGTGA GTGTGAC TGTGACC GTGACCC TGACCCC GACCCCA ACCCCAA CCCCAAT CCCAATT CCAATTC CAATTCC AATTCCA ATTCCAG TTCCAGA TCCAGAT CCAGATT CAGATTT AGATTTT GATTTTG ATTTTGG TTTTGGG TTTGGGA TTGGGAA TGGGAAT GGGAATG GGAATGC GAATGCC AATGCCA ATGCCAA TGCCAAC GCCAACG CCAACGT CAACGTA AACGTAA ACGTAAT CGTAATG GTAATGC TAATGCC AATGCCA ATGCCAA TGCCAAC GCCAACA CCAACAT CAACATG AACATGG ACATGGG CATGGGA ATGGGAA TGGGAAA GGGAAAA GGAAAAA GAAAAAA AAAAAAT AAAAATC AAAATCA AAATCAC AATCACC ATCACCC TCACCCT CACCCTT ACCCTTG CCCTTGA CCTTGAA CTTGAAA TTGAAAA TGAAAAG GAAAAGA AAAAGAT AAAGATG AAGATGG AGATGGA GATGGAG ATGGAGA TGGAGAA GGAGAAT GAGAATT AGAATTG GAATTGG AATTGGG ATTGGGT TTGGGTT TGGGTTC GGGTTCC GGTTCCG GTTCCGG TTCCGGA TCCGGAC CCGGACC CGGACCC GGACCCA GACCCAA ACCCAAT CCCAATT CCAATTT CAATTTT AATTTTA ATTTTAC TTTTACC TTTACCC TTACCCT TACCCTA ACCCTAC CCCTACA CCTACAC CTACACC TACACCC ACACCCT

In [37]:
###VISUALISATION OF LIBRARY WORKING

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)
print(vectorizer.get_feature_names())
for s in range(4):
    for ind, word in enumerate(vectorizer.get_feature_names()):
         if X[s, ind] != 0:
             print(s, word, X[s, ind])

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
0 document 0.46979138557992045
0 first 0.5802858236844359
0 is 0.38408524091481483
0 the 0.38408524091481483
0 this 0.38408524091481483
1 document 0.6876235979836938
1 is 0.281088674033753
1 second 0.5386476208856763
1 the 0.281088674033753
1 this 0.281088674033753
2 and 0.511848512707169
2 is 0.267103787642168
2 one 0.511848512707169


In [6]:
seq4

<1910948x506 sparse matrix of type '<class 'numpy.float64'>'
	with 369977091 stored elements in Compressed Sparse Row format>

In [31]:
X

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [9]:
print(vectorizer.get_feature_names())

['AA', 'AAA', 'AAAA', 'AAAC', 'AAAG', 'AAAN', 'AAAT', 'AAC', 'AACA', 'AACC', 'AACG', 'AACN', 'AACT', 'AAG', 'AAGA', 'AAGC', 'AAGG', 'AAGN', 'AAGT', 'AANN', 'AAT', 'AATA', 'AATC', 'AATG', 'AATN', 'AATT', 'AC', 'ACA', 'ACAA', 'ACAC', 'ACAG', 'ACAN', 'ACAT', 'ACC', 'ACCA', 'ACCC', 'ACCG', 'ACCN', 'ACCT', 'ACG', 'ACGA', 'ACGC', 'ACGG', 'ACGN', 'ACGT', 'ACNN', 'ACT', 'ACTA', 'ACTC', 'ACTG', 'ACTN', 'ACTT', 'AG', 'AGA', 'AGAA', 'AGAC', 'AGAG', 'AGAN', 'AGAT', 'AGC', 'AGCA', 'AGCC', 'AGCG', 'AGCN', 'AGCT', 'AGG', 'AGGA', 'AGGC', 'AGGG', 'AGGN', 'AGGT', 'AGNN', 'AGT', 'AGTA', 'AGTC', 'AGTG', 'AGTN', 'AGTT', 'ANNN', 'AT', 'ATA', 'ATAA', 'ATAC', 'ATAG', 'ATAN', 'ATAT', 'ATC', 'ATCA', 'ATCC', 'ATCG', 'ATCN', 'ATCT', 'ATG', 'ATGA', 'ATGC', 'ATGG', 'ATGN', 'ATGT', 'ATNN', 'ATT', 'ATTA', 'ATTC', 'ATTG', 'ATTN', 'ATTT', 'CA', 'CAA', 'CAAA', 'CAAC', 'CAAG', 'CAAN', 'CAAT', 'CAC', 'CACA', 'CACC', 'CACG', 'CACN', 'CACT', 'CAG', 'CAGA', 'CAGC', 'CAGG', 'CAGN', 'CAGT', 'CANN', 'CAT', 'CATA', 'CATC', 'CATG