In [1]:
import os
import itertools
from random import *
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [3]:
# Names of the datasets to process; the predicted secondary structures are
# saved in the dataset files themselves.
datasetnames = [
    "ALKBH5_Baltz2012",
    "C17ORF85_Baltz2012",
    "C22ORF28_Baltz2012",
    "CAPRIN1_Baltz2012",
    "CLIPSEQ_AGO2",
    "CLIPSEQ_ELAVL1",
    "CLIPSEQ_SFRS1",
    "ICLIP_HNRNPC",
    "ICLIP_TDP43",
    "ICLIP_TIA1",
    "ICLIP_TIAL1",
    "PARCLIP_AGO1234",
    "PARCLIP_ELAVL1",
    "PARCLIP_ELAVL1A",
    "PARCLIP_EWSR1",
    "PARCLIP_FUS",
    "PARCLIP_HUR",
    "PARCLIP_IGF2BP123",
    "PARCLIP_MOV10_Sievers",
    "PARCLIP_PUM2",
    "PARCLIP_QKI",
    "PARCLIP_TAF15",
    "PTBv1",
    "ZC3H7B_Baltz2012",
]

In [4]:
# Width of the sliding window used to cut the structure string into k-mers.
kmer = 4
# File the trained tokenizer is written to.
save_name = "structBPE.json"

## Data pre-processing

In [5]:
def readFa(fa):
    """Yield the records of a FASTA-like file that also carries structure lines.

    Each record starts with a header line beginning with '>'. Within a record,
    lines that begin with '(' or '.' are concatenated into the structure
    string, and every other non-empty line is concatenated into the sequence.

    Blank lines are skipped. (Previously a blank line was indistinguishable
    from end-of-file and silently ended parsing, dropping all later records.)

    Args:
        fa: Path of the file to read.

    Yields:
        Tuples ``(seqName, seq, struct_seq)``, one per record whose header has
        a non-empty name. ``seqName`` is the header without the leading '>'.
    """
    seqName, seq, struct_seq = '', '', ''
    with open(fa, 'r') as FA:
        for raw_line in FA:
            line = raw_line.strip('\n')
            if not line:
                # An empty line carries no data and must not end the parse.
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if seqName:
                    yield (seqName, seq, struct_seq)
                seqName = line[1:]
                seq = ''
                struct_seq = ''
            elif line.startswith('(') or line.startswith('.'):
                struct_seq += line
            else:
                seq += line
    # Real end of file: flush the last record, if any.
    if seqName:
        yield (seqName, seq, struct_seq)

In [6]:
def struct_to_seq(struct):
    """Translate a dot-bracket structure string into per-position context letters.

    Every character of ``struct`` is mapped as follows:

    * '(' or ')'  -> 'S'
    * any other character while no bracket is open:
        - 'F' if there is no '(' before it or no ')' at/after it,
        - 'J' otherwise (it lies between two closed bracket groups)
    * any other character while at least one bracket is open:
        - 'M' if the most recent bracket seen was ')',
        - otherwise 'H' if the next bracket to the right is ')',
          'M' if it is '(' — and NOTHING is emitted if no bracket follows
          (only possible for an unbalanced input with an unclosed '('), so
          the result can then be shorter than the input.

    NOTE(review): the letters presumably stand for stem / free end / junction /
    hairpin / multi-or-internal loop; that naming is not stated in the code.

    Runs in a single pass using precomputed lookups instead of re-scanning the
    string for every unpaired position.

    Args:
        struct: Dot-bracket string.

    Returns:
        The string of context letters.

    Raises:
        IndexError: If a ')' appears while no '(' is open.
    """
    length = len(struct)

    # Position of the first '(' and last ')' (-1 when absent); these answer
    # "is there any '(' before i" / "any ')' at or after i" in O(1).
    first_open = struct.find('(')
    last_close = struct.rfind(')')

    # next_bracket[i] is the first bracket character at or after position i,
    # or '' when there is none.
    next_bracket = [''] * (length + 1)
    for pos in range(length - 1, -1, -1):
        char = struct[pos]
        next_bracket[pos] = char if char in ('(', ')') else next_bracket[pos + 1]

    labels = []
    depth = 0             # number of currently open '('
    closed_last = False   # True when the most recent bracket was ')'
    for pos in range(length):
        char = struct[pos]
        if char == '(':
            labels.append('S')
            depth += 1
            closed_last = False
        elif char == ')':
            if depth == 0:
                # Same exception type as popping from an empty stack.
                raise IndexError("unbalanced structure: ')' without matching '('")
            labels.append('S')
            depth -= 1
            closed_last = True
        elif depth == 0:
            no_open_before = first_open == -1 or first_open > pos
            no_close_after = last_close < pos
            labels.append('F' if (no_open_before or no_close_after) else 'J')
        elif closed_last:
            labels.append('M')
        else:
            upcoming = next_bracket[pos]
            if upcoming == ')':
                labels.append('H')
            elif upcoming == '(':
                labels.append('M')
            # No bracket ahead: nothing is emitted for this position.
    return ''.join(labels)

In [7]:
def struct_to_kmer(sentence, kmer):
    """Cut ``sentence`` into overlapping windows of width ``kmer``.

    The window slides one character at a time; every window is followed by a
    single space, so a non-empty result ends with a trailing space.

    Args:
        sentence: String to split.
        kmer: Window width.

    Returns:
        The space-terminated windows joined into one string; '' when
        ``sentence`` is shorter than ``kmer``.
    """
    window_count = len(sentence) - kmer + 1
    return ''.join(
        sentence[start:start + kmer] + ' ' for start in range(0, window_count)
    )

In [None]:
# Build the BPE training corpus: one space-separated k-mer "sentence" per
# positive example, derived from its secondary structure.
training_struct_positives = []
for datasetname in datasetnames:
    # Read the positive example
    # TODO: placeholder path — replace with the real location of this dataset's
    # positive-example file; it should depend on `datasetname`, otherwise the
    # same file is read once per dataset.
    fa = "Addresses where the projected secondary structure is stored"
    for seqName, seq, struct_seq in readFa(fa):
        # Skip sequences longer than 501 characters.
        if len(seq) > 501:
            continue
        # Cut the structure at the first 'N' of the sequence (presumably where
        # padding starts — confirm). str.find returns -1 when there is no 'N';
        # slicing with -1 would drop the last structure character, so only
        # truncate when an 'N' was actually found.
        index = seq.find("N")
        if index != -1:
            struct_seq = struct_seq[:index]
        struct_seq = struct_to_seq(struct_seq)
        training_struct_positives.append(struct_to_kmer(struct_seq, kmer))

## Training BPE

In [9]:
# The BPE vocabulary is learned from the positive training examples only,
# because the aim is to capture the structural features of RNAs that have
# binding properties.
train_data = training_struct_positives

# Untrained BPE model wrapped in a tokenizer that uses the Whitespace
# pre-tokenizer.
bpe_model = models.BPE()
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [10]:
# Train the BPE model on the k-mer sentences, reserving the padding and
# masking tokens in the vocabulary.
special_tokens = ["[PAD]", "[MASK]"]
trainer = trainers.BpeTrainer(special_tokens=special_tokens)
tokenizer.train_from_iterator(train_data, trainer=trainer)

# Write the trained tokenizer to disk.
tokenizer.save(save_name)