In [1]:
import sys  
sys.path.insert(0, '/scratch/szym/introns/noncanonical_introns')

from representation_wrapper import RepresentationWrapper
from Bio.SeqIO.FastaIO import SimpleFastaParser
import time
import pickle
import pandas

In [2]:
started = time.time()
file = '/scratch/szym/introns/subset.fasta'
with open(file, "r") as handle:
    sequences = list(SimpleFastaParser(handle))
    
types = [0]*len(sequences)
for i, s in enumerate(sequences):
    class_signature=s[0][-2:]
    if class_signature == "KX":    
        # 0  for conventional intron
        types[i] = (0)
    else:
        # 1 for nonconventional
        types[i] = (1)

print("Elapsed",time.time()-started, "seconds")

Elapsed 0.04564476013183594 seconds


In [3]:
started = time.time()
representations = RepresentationWrapper(sequences, "list_of_FASTA_pairs", "ACTG", '_', types)
print("Elapsed",time.time()-started, "seconds")

Elapsed 0.0075871944427490234 seconds


In [5]:
started = time.time()
x=representations.to_pandas_dataframe()
print("Elapsed",time.time()-started, "seconds")

Elapsed 47.023966550827026 seconds


In [6]:
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30260,30261,30262,30263,30264,30265,30266,30267,class,length
Elonga_22251:22919_KX,G,T,G,T,T,G,C,T,T,T,...,,,,,,,,,0,668
Elonga_22976:24001_KX,G,T,G,C,G,T,T,G,C,C,...,,,,,,,,,0,1025
Elonga_25370:26395_KX,G,T,G,C,G,T,T,G,C,C,...,,,,,,,,,0,1025
Elonga_26452:27102_KX,G,T,G,T,T,G,C,T,T,T,...,,,,,,,,,0,650
Elonga_25370:26395_KX,G,T,G,C,G,T,T,G,C,C,...,,,,,,,,,0,1025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781


In [9]:
representations.sanity_check()

(True, 'Everything OK')

In [5]:
representations.to_kmer(6, "../DNABERT/subset.6mer")

0


In [5]:
started = time.time()
x=representations.to_TfIdf(space_treatment = "exclude")
print("Elapsed",time.time()-started, "seconds")

Elapsed 5.671555757522583 seconds


In [7]:
started = time.time()
x=representations.to_TfIdf(space_treatment = "include")
print("Elapsed",time.time()-started, "seconds")

Elapsed 5.545055866241455 seconds


In [5]:
started = time.time()
x=representations.to_bag_of_words(space_treatment = "exclude")
print("Elapsed",time.time()-started, "seconds")

Elapsed 5.525346755981445 seconds


In [16]:
started = time.time()
x=representations.to_bag_of_words(space_treatment = "include")
print("Elapsed",time.time()-started, "seconds")

Elapsed 849.6205141544342 seconds


In [8]:
col_count = len(representations.to_pandas_dataframe().columns)
representations.to_pandas_dataframe().columns = ["c%d"%i for i in range(col_count)]
started = time.time()
representations.to_pandas_dataframe().to_parquet("/scratch/szym/subset.parquet")
print("Elapsed",time.time()-started, "seconds")

TypeError: Sparse pandas data (column c0) not supported.

In [3]:
###VISUALISATION OF LIBRARY WORKING

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
corpus = ['AAAAAAAAAAAAAA', 'BBBBBBBBBBBBBBBBBBBBBBBB',
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]
vectorizer = CountVectorizer(analyzer = "char_wb", ngram_range=(4,4), lowercase=True)
#token_pattern="(?=([%s]{%d}))"%("document", 4)
X = vectorizer.fit_transform(corpus )
print(X)
print(vectorizer.get_feature_names())
for s in range(4):
    for ind, word in enumerate(vectorizer.get_feature_names()):
         if X[s, ind] != 0:
             print(s, word, X[s, ind])

  (0, 0)	1
  (0, 11)	11
  (0, 10)	1
  (1, 2)	1
  (1, 14)	21
  (1, 13)	1
  (2, 9)	1
  (2, 38)	1
  (2, 24)	1
  (2, 5)	1
  (2, 8)	1
  (2, 36)	1
  (2, 4)	1
  (2, 22)	1
  (2, 26)	1
  (2, 34)	1
  (2, 3)	1
  (2, 17)	1
  (2, 31)	1
  (2, 16)	1
  (2, 39)	1
  (2, 27)	1
  (2, 20)	1
  (2, 29)	1
  (3, 9)	1
  :	:
  (4, 12)	1
  (4, 37)	1
  (4, 23)	1
  (4, 25)	1
  (4, 6)	1
  (4, 33)	1
  (4, 28)	1
  (5, 9)	1
  (5, 38)	1
  (5, 24)	1
  (5, 5)	1
  (5, 8)	1
  (5, 36)	1
  (5, 4)	1
  (5, 22)	1
  (5, 26)	1
  (5, 34)	1
  (5, 3)	1
  (5, 17)	1
  (5, 31)	1
  (5, 16)	1
  (5, 39)	1
  (5, 27)	1
  (5, 21)	1
  (5, 30)	1
[' aaa', ' and', ' bbb', ' doc', ' fir', ' is ', ' one', ' sec', ' the', ' thi', 'aaa ', 'aaaa', 'and ', 'bbb ', 'bbbb', 'cond', 'cume', 'docu', 'econ', 'ent ', 'ent.', 'ent?', 'firs', 'hird', 'his ', 'ird ', 'irst', 'ment', 'ne. ', 'nt. ', 'nt? ', 'ocum', 'ond ', 'one.', 'rst ', 'seco', 'the ', 'thir', 'this', 'umen']
0  aaa 1
0 aaa  1
0 aaaa 11
1  bbb 1
1 bbb  1
1 bbbb 21
2  doc 1
2  fir 1
2  is  1
2 