In [1]:
import sys  
sys.path.insert(0, '/scratch/szym/introns/noncanonical_introns')    # path to the working directory - change it !!!

from representation_wrapper import RepresentationWrapper
from Bio.SeqIO.FastaIO import SimpleFastaParser
import time
import pickle
import pandas
import matplotlib.pyplot as plt

# Reading the sequences and assigning labels

Below, the `types` variable holds one label per sequence (`0` = conventional intron, `1` = non-conventional intron). 

In [2]:
# Read every FASTA record into memory and derive one binary label per record
# from the last two characters of the record title.
started = time.time()
file = '/scratch/szym/introns/subset.fasta'
with open(file, "r") as handle:
    sequences = list(SimpleFastaParser(handle))

# Titles ending in "KX" mark conventional introns (label 0);
# every other title is treated as nonconventional (label 1).
types = [0 if record[0][-2:] == "KX" else 1 for record in sequences]

elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

Elapsed 0.06040048599243164 seconds


# Creating a `Representation Wrapper` class object

This object is named `representations`. At this point it only loads the data into the object; no representation has been created yet. 



In [3]:
# Wrap the loaded FASTA pairs and their labels; the call is timed so the
# reader can see the cost of construction.
started = time.time()
representations = RepresentationWrapper(
    sequences,
    "list_of_FASTA_pairs",
    "ACTG",
    '_',
    labels=types,
)
elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

Elapsed 0.0051233768463134766 seconds


# Checking if everything is OK. 

The `sanity_check()` method returns a `(status, message)` tuple: `(True, 'Everything OK')` if everything is fine, or `False` together with a description of the problem.

In [4]:
# Validate the loaded data; the returned (status, message) tuple is shown as the cell output.
representations.sanity_check()

(True, 'Everything OK')

# TfIdf representation
Creating a vector **TfIdf representation**. It is returned in the form of a SciPy sparse matrix with one row per input sequence. 


1. `ngram_length` is the length of the internally built n-grams. The default is 4; pass `ngram_length=` to change it.
1. `space_treatment` is one of:
    * `"include"` - the space character (shown as `_` in the examples) is treated as a regular alphabet character, e.g. the string "atcg_attcg" is decomposed into 7 n-grams of length n=4: atcg, tcg_, cg_a, g_at, _att, attc, ttcg
    * `"exclude"` (default) - all n-grams containing a space are dropped, e.g. the string "atcg_attcg" is decomposed into 3 n-grams of length n=4: atcg, attc, ttcg
    
In the example below, `ngram_length` is changed to 3 and spaces are excluded from the alphabet.

In [5]:
# TF-IDF over 3-grams, with n-grams containing a space left out; timed.
started = time.time()
x = representations.to_TfIdf(ngram_length=3, space_treatment="exclude")
elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

Elapsed 6.876007080078125 seconds


In [6]:
# Show the summary repr (shape, dtype, stored elements) of the matrix currently bound to x.
x

<10000x64 sparse matrix of type '<class 'numpy.float32'>'
	with 610071 stored elements in Compressed Sparse Row format>

## Another example of TfIdf representation

This time `ngram_length` is the default (=4) and spaces are included in the alphabet. 

In [7]:
# TF-IDF with the default n-gram length, keeping n-grams that contain a space; timed.
started = time.time()
x = representations.to_TfIdf(space_treatment="include")
elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

Elapsed 7.102266550064087 seconds


In [8]:
# Show the summary repr (shape, dtype, stored elements) of the matrix currently bound to x.
x

<10000x256 sparse matrix of type '<class 'numpy.float32'>'
	with 1939343 stored elements in Compressed Sparse Row format>

# Bag of words representation
Creating a vector **Bag of words representation**. It is returned in the form of a **SciPy integer-based** sparse matrix with one row per input sequence, counting the occurrences of each n-gram. 


1. `ngram_length` is the length of the internally built n-grams. The default is 4; pass `ngram_length=` to change it.
1. `space_treatment` is one of:
    * `"include"` - the space character (shown as `_` in the examples) is treated as a regular alphabet character, e.g. the string "atcg_attcg" is decomposed into 7 n-grams of length n=4: atcg, tcg_, cg_a, g_at, _att, attc, ttcg
    * `"exclude"` (default) - all n-grams containing a space are dropped, e.g. the string "atcg_attcg" is decomposed into 3 n-grams of length n=4: atcg, attc, ttcg
    
In the example below, `ngram_length` is the default (=4) and spaces are excluded from the alphabet.

In [9]:
# Bag-of-words counts with the default n-gram length, leaving out n-grams that contain a space; timed.
started = time.time()
x = representations.to_bag_of_words(space_treatment="exclude")
elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

Elapsed 7.034831762313843 seconds


In [10]:
# Show the summary repr (shape, dtype, stored elements) of the matrix currently bound to x.
x

<10000x256 sparse matrix of type '<class 'numpy.int16'>'
	with 1939343 stored elements in Compressed Sparse Row format>

## Another example of Bag of words representation

`ngram_length` is the default (=4) and spaces are included in the alphabet.

In [11]:
# Bag-of-words counts with the default n-gram length, keeping n-grams that contain a space; timed.
started = time.time()
x = representations.to_bag_of_words(space_treatment="include")
elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

Elapsed 7.129010200500488 seconds


In [12]:
# Show the summary repr (shape, dtype, stored elements) of the matrix currently bound to x.
x

<10000x256 sparse matrix of type '<class 'numpy.int16'>'
	with 1939343 stored elements in Compressed Sparse Row format>

# K-mers

Creating a k-mer representation and saving it to a file. A k-mer representation can only be written to a file.

In [13]:
# Write the k-mer representation to the given output path; the call's return value is shown as the cell output.
# NOTE(review): 6 is presumably the k-mer length (the file name says "6mer") and 3072 presumably a
# length limit (the file name says "l_3072") — confirm against RepresentationWrapper.to_kmer.
representations.to_kmer(6, "../DNABERT/subset.l_3072.6mer", 3072)

0


# Pandas representation

Finally, creating a representation as a `pandas` DataFrame. This is provided for ease of handling, saving, viewing, etc., **but it is not intended for Machine Learning purposes**. It is also rather time-consuming, so use it with care.

In [14]:
# Convert the loaded data to a pandas DataFrame; timed.
started = time.time()
x = representations.to_pandas_dataframe()
elapsed = time.time() - started
print("Elapsed", elapsed, "seconds")

  pandas_dataframe["label"] = self.labels
  pandas_dataframe["length"] = lengths


Elapsed 59.67077374458313 seconds


In [15]:
# Display the DataFrame currently bound to x; the notebook truncates the wide frame when rendering it.
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30260,30261,30262,30263,30264,30265,30266,30267,label,length
Elonga_22251:22919_KX,G,T,G,T,T,G,C,T,T,T,...,,,,,,,,,0,668
Elonga_22976:24001_KX,G,T,G,C,G,T,T,G,C,C,...,,,,,,,,,0,1025
Elonga_25370:26395_KX,G,T,G,C,G,T,T,G,C,C,...,,,,,,,,,0,1025
Elonga_26452:27102_KX,G,T,G,T,T,G,C,T,T,T,...,,,,,,,,,0,650
Elonga_25370:26395_KX,G,T,G,C,G,T,T,G,C,C,...,,,,,,,,,0,1025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
Elonga_21743:23524_XX,A,T,C,C,T,G,G,A,A,C,...,,,,,,,,,1,1781
