In [1]:
#we are using kmer tokenizer from https://github.com/MsAlEhR/KmerTokenizer

import time
import itertools
from transformers import PreTrainedTokenizer, AutoTokenizer
import json
import os

class KmerTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file=None, kmerlen=6, overlapping=True, maxlen=400, **kwargs):
        self.kmerlen = kmerlen
        self.overlapping = overlapping
        self.maxlen = maxlen
        
        # Initialize vocabulary
        self.VOCAB = [''.join(i) for i in itertools.product(*(['ATCG'] * int(self.kmerlen)))]
        self.VOCAB_SIZE = len(self.VOCAB) + 5
        
        self.tokendict = dict(zip(self.VOCAB, range(5, self.VOCAB_SIZE)))
        self.tokendict['[UNK]'] = 0
        self.tokendict['[SEP]'] = 1
        self.tokendict['[CLS]'] = 2
        self.tokendict['[MASK]'] = 3
        self.tokendict['[PAD]'] = 4
        
        super().__init__(**kwargs)

    def _tokenize(self, text):
        tokens = []
        stoprange = len(text) - (self.kmerlen - 1)
        if self.overlapping:
            for k in range(0, stoprange):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('ATCG'):
                    tokens.append(kmer)
        else:
            for k in range(0, stoprange, self.kmerlen):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('ATCG'):
                    tokens.append(kmer)
        return tokens

    def _convert_token_to_id(self, token):
        return self.tokendict.get(token, self.tokendict['[UNK]'])

    def _convert_id_to_token(self, index):
        inv_tokendict = {v: k for k, v in self.tokendict.items()}
        return inv_tokendict.get(index, '[UNK]')

    def convert_tokens_to_string(self, tokens):
        return ' '.join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']]
        return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']] + token_ids_1 + [self.tokendict['[SEP]']]

    def get_vocab(self):
        return self.tokendict

    def kmer_tokenize(self, seq_list):
        seq_ind_list = []
        for seq in seq_list:
            tokens = self._tokenize(seq)
            token_ids = [self._convert_token_to_id(token) for token in tokens]
            if len(token_ids) < self.maxlen:
                token_ids.extend([self.tokendict['[PAD]']] * (self.maxlen - len(token_ids)))
            else:
                token_ids = token_ids[:self.maxlen]
            seq_ind_list.append(token_ids)
        return seq_ind_list

    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)
        
        vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + 'vocab.json')
        
        with open(vocab_file, 'w') as f:
            json.dump(self.tokendict, f)
        
        return (vocab_file,)

    def save_pretrained(self, save_directory, **kwargs):
        special_tokens_map_file = os.path.join(save_directory, "special_tokens_map.json")
        with open(special_tokens_map_file, "w") as f:
            json.dump({
                "kmerlen": self.kmerlen,
                "overlapping": self.overlapping,
                "maxlen": self.maxlen
            }, f)
        vocab_files = self.save_vocabulary(save_directory)
        return (special_tokens_map_file,) + vocab_files

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # Load tokenizer using the parent class method
        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # Load special tokens map
        special_tokens_map_file = os.path.join(pretrained_model_name_or_path, "special_tokens_map.json")
        if os.path.isfile(special_tokens_map_file):
            with open(special_tokens_map_file, "r") as f:
                special_tokens_map = json.load(f)
            tokenizer.kmerlen = special_tokens_map.get("kmerlen", 6)
            tokenizer.overlapping = special_tokens_map.get("overlapping", True)
            tokenizer.maxlen = special_tokens_map.get("maxlen", 400)
        
        # Load vocabulary
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.isfile(vocab_file):
            with open(vocab_file, "r") as f:
                tokendict = json.load(f)
            tokenizer.tokendict = tokendict
        
        return tokenizer

In [2]:
tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=4096)
seq_list = ["ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC"]
tokenized_output = tokenizer.kmer_tokenize(seq_list)
tokenized_output

[[346,
  1370,
  1370,
  1370,
  1370,
  1370,
  1370,
  1371,
  1375,
  1391,
  1455,
  1711,
  2735,
  2735,
  2735,
  2735,
  2735,
  2735,
  2736,
  2740,
  2756,
  2820,
  3076,
  4100,
  4100,
  4100,
  4097,
  4086,
  4043,
  3872,
  3185,
  438,
  1740,
  2851,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,


In [6]:
#so it definitely pads it as well
seq_list = ['ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG']
tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=6) #will also cut off and appends with th eappend token
print(tokenizer.kmer_tokenize(seq_list))

[[438, 1739, 2848, 3185, 438, 1739]]


In [8]:
#check vocab
print(tokenizer.get_vocab())
#contains all the possible values already, which is quite good, let's make sure it's the right number of values
#0-4 are the unk sep cls mask and pad, then start from AAAAA at 5
#let's do the math to see how large it should be
print(4**6) #ah and the last one is 4100 which means 4101 and so if we subtract the 5 get 4096 elements, so it's quite good!!
#so this tokenizer will be consistent!

{'AAAAAA': 5, 'AAAAAT': 6, 'AAAAAC': 7, 'AAAAAG': 8, 'AAAATA': 9, 'AAAATT': 10, 'AAAATC': 11, 'AAAATG': 12, 'AAAACA': 13, 'AAAACT': 14, 'AAAACC': 15, 'AAAACG': 16, 'AAAAGA': 17, 'AAAAGT': 18, 'AAAAGC': 19, 'AAAAGG': 20, 'AAATAA': 21, 'AAATAT': 22, 'AAATAC': 23, 'AAATAG': 24, 'AAATTA': 25, 'AAATTT': 26, 'AAATTC': 27, 'AAATTG': 28, 'AAATCA': 29, 'AAATCT': 30, 'AAATCC': 31, 'AAATCG': 32, 'AAATGA': 33, 'AAATGT': 34, 'AAATGC': 35, 'AAATGG': 36, 'AAACAA': 37, 'AAACAT': 38, 'AAACAC': 39, 'AAACAG': 40, 'AAACTA': 41, 'AAACTT': 42, 'AAACTC': 43, 'AAACTG': 44, 'AAACCA': 45, 'AAACCT': 46, 'AAACCC': 47, 'AAACCG': 48, 'AAACGA': 49, 'AAACGT': 50, 'AAACGC': 51, 'AAACGG': 52, 'AAAGAA': 53, 'AAAGAT': 54, 'AAAGAC': 55, 'AAAGAG': 56, 'AAAGTA': 57, 'AAAGTT': 58, 'AAAGTC': 59, 'AAAGTG': 60, 'AAAGCA': 61, 'AAAGCT': 62, 'AAAGCC': 63, 'AAAGCG': 64, 'AAAGGA': 65, 'AAAGGT': 66, 'AAAGGC': 67, 'AAAGGG': 68, 'AATAAA': 69, 'AATAAT': 70, 'AATAAC': 71, 'AATAAG': 72, 'AATATA': 73, 'AATATT': 74, 'AATATC': 75, 'AATATG': 

In [12]:
#here's the real test tho, what if we have a large sequence how fast is it
tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=400000)
seq_list = [''.join(['ATCG' for i in range(100000)])] #length fo 400k
start = time.time()
tokenized_output = tokenizer.kmer_tokenize(seq_list)
print(time.time()-start) #0.3 seconds, that is quite fast, this is good enough for our purpose. But it does add roughly 3 hours per epoch... so idk

0.36013031005859375


In [13]:
len(tokenized_output[0])

400000

In [14]:
#we can also test it with 78910 instead?
VOCAB = [''.join(i) for i in itertools.product(*(['ATCG'] * int(4)))]
print(VOCAB)

['AAAA', 'AAAT', 'AAAC', 'AAAG', 'AATA', 'AATT', 'AATC', 'AATG', 'AACA', 'AACT', 'AACC', 'AACG', 'AAGA', 'AAGT', 'AAGC', 'AAGG', 'ATAA', 'ATAT', 'ATAC', 'ATAG', 'ATTA', 'ATTT', 'ATTC', 'ATTG', 'ATCA', 'ATCT', 'ATCC', 'ATCG', 'ATGA', 'ATGT', 'ATGC', 'ATGG', 'ACAA', 'ACAT', 'ACAC', 'ACAG', 'ACTA', 'ACTT', 'ACTC', 'ACTG', 'ACCA', 'ACCT', 'ACCC', 'ACCG', 'ACGA', 'ACGT', 'ACGC', 'ACGG', 'AGAA', 'AGAT', 'AGAC', 'AGAG', 'AGTA', 'AGTT', 'AGTC', 'AGTG', 'AGCA', 'AGCT', 'AGCC', 'AGCG', 'AGGA', 'AGGT', 'AGGC', 'AGGG', 'TAAA', 'TAAT', 'TAAC', 'TAAG', 'TATA', 'TATT', 'TATC', 'TATG', 'TACA', 'TACT', 'TACC', 'TACG', 'TAGA', 'TAGT', 'TAGC', 'TAGG', 'TTAA', 'TTAT', 'TTAC', 'TTAG', 'TTTA', 'TTTT', 'TTTC', 'TTTG', 'TTCA', 'TTCT', 'TTCC', 'TTCG', 'TTGA', 'TTGT', 'TTGC', 'TTGG', 'TCAA', 'TCAT', 'TCAC', 'TCAG', 'TCTA', 'TCTT', 'TCTC', 'TCTG', 'TCCA', 'TCCT', 'TCCC', 'TCCG', 'TCGA', 'TCGT', 'TCGC', 'TCGG', 'TGAA', 'TGAT', 'TGAC', 'TGAG', 'TGTA', 'TGTT', 'TGTC', 'TGTG', 'TGCA', 'TGCT', 'TGCC', 'TGCG', 'TGGA',

In [15]:
numbers = list(map(str, range(7, 11)))

# Generate all possible 4-mer combinations from the numbers
VOCAB = [''.join(i) for i in itertools.product(*([numbers] * 4))]

print(VOCAB)

['7777', '7778', '7779', '77710', '7787', '7788', '7789', '77810', '7797', '7798', '7799', '77910', '77107', '77108', '77109', '771010', '7877', '7878', '7879', '78710', '7887', '7888', '7889', '78810', '7897', '7898', '7899', '78910', '78107', '78108', '78109', '781010', '7977', '7978', '7979', '79710', '7987', '7988', '7989', '79810', '7997', '7998', '7999', '79910', '79107', '79108', '79109', '791010', '71077', '71078', '71079', '710710', '71087', '71088', '71089', '710810', '71097', '71098', '71099', '710910', '710107', '710108', '710109', '7101010', '8777', '8778', '8779', '87710', '8787', '8788', '8789', '87810', '8797', '8798', '8799', '87910', '87107', '87108', '87109', '871010', '8877', '8878', '8879', '88710', '8887', '8888', '8889', '88810', '8897', '8898', '8899', '88910', '88107', '88108', '88109', '881010', '8977', '8978', '8979', '89710', '8987', '8988', '8989', '89810', '8997', '8998', '8999', '89910', '89107', '89108', '89109', '891010', '81077', '81078', '81079', '810

In [23]:
tokendict = dict(zip(VOCAB, range(5, len(VOCAB))))
tokendict

{'7777': 5,
 '7778': 6,
 '7779': 7,
 '77710': 8,
 '7787': 9,
 '7788': 10,
 '7789': 11,
 '77810': 12,
 '7797': 13,
 '7798': 14,
 '7799': 15,
 '77910': 16,
 '77107': 17,
 '77108': 18,
 '77109': 19,
 '771010': 20,
 '7877': 21,
 '7878': 22,
 '7879': 23,
 '78710': 24,
 '7887': 25,
 '7888': 26,
 '7889': 27,
 '78810': 28,
 '7897': 29,
 '7898': 30,
 '7899': 31,
 '78910': 32,
 '78107': 33,
 '78108': 34,
 '78109': 35,
 '781010': 36,
 '7977': 37,
 '7978': 38,
 '7979': 39,
 '79710': 40,
 '7987': 41,
 '7988': 42,
 '7989': 43,
 '79810': 44,
 '7997': 45,
 '7998': 46,
 '7999': 47,
 '79910': 48,
 '79107': 49,
 '79108': 50,
 '79109': 51,
 '791010': 52,
 '71077': 53,
 '71078': 54,
 '71079': 55,
 '710710': 56,
 '71087': 57,
 '71088': 58,
 '71089': 59,
 '710810': 60,
 '71097': 61,
 '71098': 62,
 '71099': 63,
 '710910': 64,
 '710107': 65,
 '710108': 66,
 '710109': 67,
 '7101010': 68,
 '8777': 69,
 '8778': 70,
 '8779': 71,
 '87710': 72,
 '8787': 73,
 '8788': 74,
 '8789': 75,
 '87810': 76,
 '8797': 77,
 '8798

In [16]:
#let's see our inputs to see how quickly they can get tokenized
import numpy as np
split = 'train'
data_path=f'/data/leslie/sarthak/data/enformer/data/{split}_seq.npz'

seq_data = np.load(data_path)

In [18]:
seq = np.array(seq_data['sequence_array'])

In [19]:
print(seq.shape)

(34021, 131072)


In [20]:
seq[0]

array([10,  9,  7, ..., 10,  7, 10], dtype=int8)

# testing one hot

In [1]:
#can just pretokenize the whole test data!
import torch

In [2]:
#let's see our inputs to see how quickly they can get tokenized
import numpy as np
split = 'train'
data_path=f'/data/leslie/sarthak/data/enformer/data/{split}_seq.npz'
seq_data = np.load(data_path)
seq = np.array(seq_data['sequence_array'])

In [6]:
seq[0]

array([10,  9,  7, ..., 10,  7, 10], dtype=int8)

In [5]:
test = torch.nn.functional.one_hot(torch.LongTensor(seq[0]-7), num_classes=4).float()
print(test)
#this is extremely quick!

tensor([[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]])


In [9]:
#find a row that has 11
for i in range(len(seq)):
    if 11 in seq[i]:
        print(i)
        break

1383


In [12]:
sum(seq[1383] == 11) #there's 2

2

In [16]:
seq[1383]

array([ 7, 10, 10, ..., 10, 10,  9], dtype=int8)

In [14]:
test = torch.nn.functional.one_hot(torch.LongTensor(seq[1383]-7) % 4, num_classes=4).float() #divide it by 4 to get the remainder so we can deal with the 5

In [15]:
test

tensor([[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.]])

In [17]:
#let's now get an example sequence
test_seq = seq[:3]
print(test_seq.shape)

(3, 131072)


In [20]:
#now one hot encode it
test = torch.nn.functional.one_hot(torch.LongTensor(test_seq-7) % 4, num_classes=4).float().transpose(1,2)
print(test.shape)

torch.Size([3, 4, 131072])


In [21]:
#now we check to see which elements in test seq that is 11
if 11 in test_seq:
    print('yes') #in this case no

In [22]:
test_seq = seq[1383:1386]
11 in test_seq

True

In [24]:
test_seq.shape

(3, 131072)

In [23]:
#now one hot encode it
test = torch.nn.functional.one_hot(torch.LongTensor(test_seq-7) % 4, num_classes=4).float().transpose(1,2)
print(test.shape)

#now check in test seq where the 11 are
np.where(test_seq == 11)

torch.Size([3, 4, 131072])


(array([0, 0]), array([83403, 88171]))

In [26]:
test_seq[0,88171]

11

In [27]:
#and check that it should be what looks like an A in the test output
test[0,:,88171] #yessirree because 11-7 is 4 and 4%4 is 0 so it's the 0 index that gets it, so for those just set it manually to 0


tensor([1., 0., 0., 0.])

In [28]:
#so we set test[batch_dim, 0, index] = 0 for all the indices that are 11
indices = np.where(test_seq == 11)
for idx in range(len(indices[0])):
    test[indices[0][idx], 0, indices[1][idx]] = 0

[0 0]
[83403 88171]


In [31]:
indices = torch.where(torch.LongTensor(test_seq)==11)
print(indices)

(tensor([0, 0]), tensor([83403, 88171]))


In [33]:
#one final test with how this works
test_seq1 = seq[1383:1386]
test_seq2 = seq[1383]
test1 = torch.nn.functional.one_hot(torch.LongTensor(test_seq1-7) % 4, num_classes=4).float().transpose(1,2)
test2 = torch.nn.functional.one_hot(torch.LongTensor(test_seq2-7) % 4, num_classes=4).float().transpose(0,1)
print(test1.shape, test2.shape)

torch.Size([3, 4, 131072]) torch.Size([4, 131072])


In [34]:
torch.allclose(test1[0], test2) #ok does it exaclty as you'd expect. Documentation says the last dimension

True

# tokenizing the whole genome

In [None]:
#let's just do what we did before, but tokenize the whole genome!
import numpy as np
split = 'test'
data_path=f'/data/leslie/sarthak/data/enformer/data/{split}_seq.npz'
seq_data = np.load(data_path)
seq = np.array(seq_data['sequence_array'])
seq_rc = np.array(seq_data['sequence_array_rc'])
print(seq.shape, seq_rc.shape)

: 

In [5]:
seq[0,:20]

array([10,  7, 10, 10, 10, 10, 10, 10, 10,  7,  7,  9, 10, 10,  9,  8,  8,
        7, 10, 10], dtype=int8)

In [6]:
#so now we need to turn that into the proper tokenization
#first let's turn it to a string by subtracting 7 then turning the array into a vector of strings
seq_str = np.array([''.join(map(str, seq[i]-7)) for i in range(len(seq))])
print(seq_str.shape)

(1937,)


In [8]:
seq_str[0][:20] #10 is now 3
#this look accuurate, let's trust it

'30333333300233211033'

In [9]:
#first look at the character tokenizer
import sys
sys.path.append('/data/leslie/sarthak/hyena/hyena-dna')
from src.dataloaders.datasets.hg38_char_tokenizer import CharacterTokenizer

In [12]:
char_tokenizer = CharacterTokenizer('ACGTN', 1024)

In [13]:
char_tokenizer._vocab_int_to_str

{0: '[CLS]',
 1: '[SEP]',
 2: '[BOS]',
 3: '[MASK]',
 4: '[PAD]',
 5: '[RESERVED]',
 6: '[UNK]',
 7: 'A',
 8: 'C',
 9: 'G',
 10: 'T',
 11: 'N'}

In [14]:
#now we use the tokenizer to tokenize it
import time
import itertools
from transformers import PreTrainedTokenizer, AutoTokenizer
import json
import os

class KmerTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file=None, kmerlen=6, overlapping=True, maxlen=400, **kwargs):
        self.kmerlen = kmerlen
        self.overlapping = overlapping
        self.maxlen = maxlen
        
        # Initialize vocabulary
        self.VOCAB = [''.join(i) for i in itertools.product(*(['ATCG'] * int(self.kmerlen)))]
        self.VOCAB_SIZE = len(self.VOCAB) + 5
        
        self.tokendict = dict(zip(self.VOCAB, range(5, self.VOCAB_SIZE)))
        self.tokendict['[CLS]'] = 0
        self.tokendict['[SEP]'] = 1
        self.tokendict['[BOS]'] = 2
        self.tokendict['[MASK]'] = 3
        self.tokendict['[PAD]'] = 4
        self.tokendict['[EOS]'] = 5 #was reserved above
        self.tokendict['[UNK]'] = 6
        
        
        
        super().__init__(**kwargs)

    def _tokenize(self, text):
        tokens = []
        stoprange = len(text) - (self.kmerlen - 1)
        if self.overlapping:
            for k in range(0, stoprange):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('ATCG'):
                    tokens.append(kmer)
        else:
            for k in range(0, stoprange, self.kmerlen):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('ATCG'):
                    tokens.append(kmer)
        return tokens

    def _convert_token_to_id(self, token):
        return self.tokendict.get(token, self.tokendict['[UNK]'])

    def _convert_id_to_token(self, index):
        inv_tokendict = {v: k for k, v in self.tokendict.items()}
        return inv_tokendict.get(index, '[UNK]')

    def convert_tokens_to_string(self, tokens):
        return ' '.join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']]
        return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']] + token_ids_1 + [self.tokendict['[SEP]']]

    def get_vocab(self):
        return self.tokendict

    def kmer_tokenize(self, seq_list):
        seq_ind_list = []
        for seq in seq_list:
            tokens = self._tokenize(seq)
            token_ids = [self._convert_token_to_id(token) for token in tokens]
            if len(token_ids) < self.maxlen:
                token_ids.extend([self.tokendict['[PAD]']] * (self.maxlen - len(token_ids)))
            else:
                token_ids = token_ids[:self.maxlen]
            seq_ind_list.append(token_ids)
        return seq_ind_list

    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)
        
        vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + 'vocab.json')
        
        with open(vocab_file, 'w') as f:
            json.dump(self.tokendict, f)
        
        return (vocab_file,)

    def save_pretrained(self, save_directory, **kwargs):
        special_tokens_map_file = os.path.join(save_directory, "special_tokens_map.json")
        with open(special_tokens_map_file, "w") as f:
            json.dump({
                "kmerlen": self.kmerlen,
                "overlapping": self.overlapping,
                "maxlen": self.maxlen
            }, f)
        vocab_files = self.save_vocabulary(save_directory)
        return (special_tokens_map_file,) + vocab_files

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # Load tokenizer using the parent class method
        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # Load special tokens map
        special_tokens_map_file = os.path.join(pretrained_model_name_or_path, "special_tokens_map.json")
        if os.path.isfile(special_tokens_map_file):
            with open(special_tokens_map_file, "r") as f:
                special_tokens_map = json.load(f)
            tokenizer.kmerlen = special_tokens_map.get("kmerlen", 6)
            tokenizer.overlapping = special_tokens_map.get("overlapping", True)
            tokenizer.maxlen = special_tokens_map.get("maxlen", 400)
        
        # Load vocabulary
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.isfile(vocab_file):
            with open(vocab_file, "r") as f:
                tokendict = json.load(f)
            tokenizer.tokendict = tokendict
        
        return tokenizer

In [15]:
tokenizer = KmerTokenizer(kmerlen=8, overlapping=True, maxlen=4096)

In [18]:
print(tokenizer.VOCAB[:10], len(tokenizer.VOCAB))

['AAAAAAAA', 'AAAAAAAT', 'AAAAAAAC', 'AAAAAAAG', 'AAAAAATA', 'AAAAAATT', 'AAAAAATC', 'AAAAAATG', 'AAAAAACA', 'AAAAAACT'] 65536


In [20]:
#now we want to slightly modify it so not ACGT but 0123
VOCAB = [''.join(i) for i in itertools.product(*(['0123'] * int(8)))]
VOCAB

['00000000',
 '00000001',
 '00000002',
 '00000003',
 '00000010',
 '00000011',
 '00000012',
 '00000013',
 '00000020',
 '00000021',
 '00000022',
 '00000023',
 '00000030',
 '00000031',
 '00000032',
 '00000033',
 '00000100',
 '00000101',
 '00000102',
 '00000103',
 '00000110',
 '00000111',
 '00000112',
 '00000113',
 '00000120',
 '00000121',
 '00000122',
 '00000123',
 '00000130',
 '00000131',
 '00000132',
 '00000133',
 '00000200',
 '00000201',
 '00000202',
 '00000203',
 '00000210',
 '00000211',
 '00000212',
 '00000213',
 '00000220',
 '00000221',
 '00000222',
 '00000223',
 '00000230',
 '00000231',
 '00000232',
 '00000233',
 '00000300',
 '00000301',
 '00000302',
 '00000303',
 '00000310',
 '00000311',
 '00000312',
 '00000313',
 '00000320',
 '00000321',
 '00000322',
 '00000323',
 '00000330',
 '00000331',
 '00000332',
 '00000333',
 '00001000',
 '00001001',
 '00001002',
 '00001003',
 '00001010',
 '00001011',
 '00001012',
 '00001013',
 '00001020',
 '00001021',
 '00001022',
 '00001023',
 '00001030',

In [23]:
#now let's tokenize it!
testseq = seq_str[0]
print(testseq[:20])

30333333300233211033


In [28]:
#now tokenize it
class KmerTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file=None, kmerlen=6, overlapping=True, maxlen=400, **kwargs):
        self.kmerlen = kmerlen
        self.overlapping = overlapping
        self.maxlen = maxlen
        
        # Initialize vocabulary
        self.VOCAB = [''.join(i) for i in itertools.product(*(['0123'] * int(self.kmerlen)))]
        self.VOCAB_SIZE = len(self.VOCAB) + 5
        
        self.tokendict = dict(zip(self.VOCAB, range(5, self.VOCAB_SIZE)))
        self.tokendict['[CLS]'] = 0
        self.tokendict['[SEP]'] = 1
        self.tokendict['[BOS]'] = 2
        self.tokendict['[MASK]'] = 3
        self.tokendict['[PAD]'] = 4
        self.tokendict['[EOS]'] = 5 #was reserved above
        self.tokendict['[UNK]'] = 6
        
        
        
        super().__init__(**kwargs)

    def _tokenize(self, text):
        tokens = []
        stoprange = len(text) - (self.kmerlen - 1)
        if self.overlapping:
            for k in range(0, stoprange):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('0123'):
                    tokens.append(kmer)
        else:
            for k in range(0, stoprange, self.kmerlen):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('0123'):
                    tokens.append(kmer)
        return tokens

    def _convert_token_to_id(self, token):
        return self.tokendict.get(token, self.tokendict['[UNK]'])

    def _convert_id_to_token(self, index):
        inv_tokendict = {v: k for k, v in self.tokendict.items()}
        return inv_tokendict.get(index, '[UNK]')

    def convert_tokens_to_string(self, tokens):
        return ' '.join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']]
        return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']] + token_ids_1 + [self.tokendict['[SEP]']]

    def get_vocab(self):
        return self.tokendict

    def kmer_tokenize(self, seq_list):
        seq_ind_list = []
        for seq in seq_list:
            tokens = self._tokenize(seq)
            token_ids = [self._convert_token_to_id(token) for token in tokens]
            if len(token_ids) < self.maxlen:
                token_ids.extend([self.tokendict['[PAD]']] * (self.maxlen - len(token_ids)))
            else:
                token_ids = token_ids[:self.maxlen]
            seq_ind_list.append(token_ids)
        return seq_ind_list

    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)
        
        vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + 'vocab.json')
        
        with open(vocab_file, 'w') as f:
            json.dump(self.tokendict, f)
        
        return (vocab_file,)

    def save_pretrained(self, save_directory, **kwargs):
        special_tokens_map_file = os.path.join(save_directory, "special_tokens_map.json")
        with open(special_tokens_map_file, "w") as f:
            json.dump({
                "kmerlen": self.kmerlen,
                "overlapping": self.overlapping,
                "maxlen": self.maxlen
            }, f)
        vocab_files = self.save_vocabulary(save_directory)
        return (special_tokens_map_file,) + vocab_files

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # Load tokenizer using the parent class method
        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # Load special tokens map
        special_tokens_map_file = os.path.join(pretrained_model_name_or_path, "special_tokens_map.json")
        if os.path.isfile(special_tokens_map_file):
            with open(special_tokens_map_file, "r") as f:
                special_tokens_map = json.load(f)
            tokenizer.kmerlen = special_tokens_map.get("kmerlen", 6)
            tokenizer.overlapping = special_tokens_map.get("overlapping", True)
            tokenizer.maxlen = special_tokens_map.get("maxlen", 400)
        
        # Load vocabulary
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.isfile(vocab_file):
            with open(vocab_file, "r") as f:
                tokendict = json.load(f)
            tokenizer.tokendict = tokendict
        
        return tokenizer

In [29]:
tokenizer = KmerTokenizer(kmerlen=8, overlapping=True, maxlen=4096)
tokenizer.VOCAB

['00000000',
 '00000001',
 '00000002',
 '00000003',
 '00000010',
 '00000011',
 '00000012',
 '00000013',
 '00000020',
 '00000021',
 '00000022',
 '00000023',
 '00000030',
 '00000031',
 '00000032',
 '00000033',
 '00000100',
 '00000101',
 '00000102',
 '00000103',
 '00000110',
 '00000111',
 '00000112',
 '00000113',
 '00000120',
 '00000121',
 '00000122',
 '00000123',
 '00000130',
 '00000131',
 '00000132',
 '00000133',
 '00000200',
 '00000201',
 '00000202',
 '00000203',
 '00000210',
 '00000211',
 '00000212',
 '00000213',
 '00000220',
 '00000221',
 '00000222',
 '00000223',
 '00000230',
 '00000231',
 '00000232',
 '00000233',
 '00000300',
 '00000301',
 '00000302',
 '00000303',
 '00000310',
 '00000311',
 '00000312',
 '00000313',
 '00000320',
 '00000321',
 '00000322',
 '00000323',
 '00000330',
 '00000331',
 '00000332',
 '00000333',
 '00001000',
 '00001001',
 '00001002',
 '00001003',
 '00001010',
 '00001011',
 '00001012',
 '00001013',
 '00001020',
 '00001021',
 '00001022',
 '00001023',
 '00001030',

In [32]:
#now we can tokenize it
tokenized_output = tokenizer.kmer_tokenize([testseq]) #rather slow, but if we do it once, it should be fine!
print(len(tokenized_output), len(tokenized_output[0]), tokenized_output[:10])
# print(tokenized_output)
#ahh cut off at 4096

1 4096 [[53252, 16388, 65537, 65525, 65479, 65296, 64564, 61635, 49918, 3050, 12185, 48728, 63828, 58692, 38147, 21502, 20460, 16292, 65155, 63997, 59368, 40851, 32318, 63724, 58276, 36481, 14837, 59335, 40717, 31782, 61578, 49689, 2133, 8517, 34056, 5139, 20541, 16613, 901, 3592, 14354, 57401, 32982, 842, 3353, 13398, 53577, 17685, 5189, 20744, 17425, 4151, 16589, 805, 3208, 12820, 51266, 8441, 33752, 3923, 15679, 62701, 54181, 20103, 14863, 59437, 41125, 33415, 2575, 10287, 41134, 33450, 2713, 10840, 43346, 42297, 38102, 21322, 19739, 13408, 53620, 17860, 5889, 23544, 28625, 48951, 64718, 62252, 52388, 12931, 51709, 10216, 40852, 32324, 63745, 58357, 36805, 16135, 64528, 61492, 49348, 772, 3074, 12281, 49112, 65363, 64832, 62706, 54204, 20196, 15236, 60930, 47100, 57316, 32642, 65017, 63447, 57165, 32037, 62597, 53768, 18452, 8259, 33023, 1008, 4018, 16057, 64216, 60243, 44351, 46320, 54196, 20161, 15094, 60361, 44824, 48210, 61756, 50402, 4985, 19928, 14162, 56633, 29911, 54093, 197

In [33]:
tokenizer = KmerTokenizer(kmerlen=8, overlapping=True, maxlen=131072)

In [37]:
tokenized_output = tokenizer.kmer_tokenize([testseq]) #rather slow, but if we do it once, it should be fine!
print(len(tokenized_output), len(tokenized_output[0]), tokenized_output[0][:10], tokenized_output[0][-10:])
#last 7 are 4 as there is not obvious direct usage. It's fine since we don't predict on the edges, and ok to lose some information
#but we need some way to store the information in np.uint16, but all te numbers are already used
#wait these final 4 will only be at the end, no need to store it, just remove those last 7. Ideally we use that info still


1 131072 [53252, 16388, 65537, 65525, 65479, 65296, 64564, 61635, 49918, 3050] [7459, 29821, 53735, 4, 4, 4, 4, 4, 4, 4]


In [36]:
print(len(testseq))

131072


In [39]:
#let's alter the method to find one match there? Or we just remove it... hmmm
#let's load up hg38
hg38_loc = '/data/leslie/sarthak/data/chrombpnet_test/hg38.fa'
import pyfaidx
hg38 = pyfaidx.Fasta(hg38_loc)
hg38.keys()

odict_keys(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random', 'chr1_KI270708v1_random', 'chr1_KI270709v1_random', 'chr1_KI270710v1_random', 'chr1_KI270711v1_random', 'chr1_KI270712v1_random', 'chr1_KI270713v1_random', 'chr1_KI270714v1_random', 'chr2_KI270715v1_random', 'chr2_KI270716v1_random', 'chr3_GL000221v1_random', 'chr4_GL000008v2_random', 'chr5_GL000208v1_random', 'chr9_KI270717v1_random', 'chr9_KI270718v1_random', 'chr9_KI270719v1_random', 'chr9_KI270720v1_random', 'chr11_KI270721v1_random', 'chr14_GL000009v2_random', 'chr14_GL000225v1_random', 'chr14_KI270722v1_random', 'chr14_GL000194v1_random', 'chr14_KI270723v1_random', 'chr14_KI270724v1_random', 'chr14_KI270725v1_random', 'chr14_KI270726v1_random', 'chr15_KI270727v1_random', 'chr16_KI270728v1_random', 'chr17_GL0

In [40]:
#find where this test seq is in the hg38
testseq = seq_str[0]
for chrom in hg38.keys():
    if testseq in hg38[chrom]:
        print(chrom)
        break
#takes a very long time lol

KeyboardInterrupt: 

In [45]:
#instead we'll save out the tokenized sequence, remove the last 7, and add a special bos and eos token and then some padding
#actually let's make it better by just reducing the input size of the model by that. That makes way more sense!
# then no worry about it, but can just save as is
#make a new array to store the tokenized results
from tqdm import tqdm
seq_tokenized = np.zeros((len(seq_str), 131072-7), dtype=np.uint16)
for i,testseq in tqdm(enumerate(seq_str), total=len(seq_str)):
    tokenized_output = tokenizer.kmer_tokenize([testseq])
    seq_tokenized[i] = np.array(tokenized_output[0][:-7], dtype=np.uint16)
#4ish minutes isn't bad

  0%|          | 0/1937 [00:00<?, ?it/s]

100%|██████████| 1937/1937 [04:26<00:00,  7.27it/s]


In [46]:
#check the size in GB
seq_tokenized.nbytes/1e9 #this is perfect

0.50774581

In [48]:
#now we verify if it is correct
seq[800,:30]

array([ 7,  9,  9,  8,  8,  8,  7,  9,  7,  9,  7,  9,  8, 10, 10,  7,  7,
        8,  8,  9, 10,  9,  8,  8,  8,  8, 10, 10, 10,  8], dtype=int8)

In [49]:
#let's see if this makes sense
seq_tokenized[800,:30]

array([10583, 42317, 38183, 21645, 21031, 18574,  8748, 34980,  8833,
       35317, 10182, 40714, 31771, 61536, 49523,  1470,  5866, 23450,
       28250, 47452, 58724, 38276, 22018, 22521, 24534, 32586, 64796,
       62564, 53635, 17919], dtype=uint16)

In [52]:
print(tokenizer.convert_ids_to_tokens(seq_tokenized[800,:30]))

['02211102', '22111020', '21110202', '11102020', '11020202', '10202021', '02020213', '20202133', '02021330', '20213300', '02133001', '21330011', '13300112', '33001123', '30011232', '00112321', '01123211', '11232111', '12321111', '23211113', '32111133', '21111333', '11113331', '11133310', '11333101', '13331011', '33310113', '33101133', '31011332', '10113322']


In [53]:
#and now convert the rc
seq_rc = np.array(seq_data['sequence_array_rc'])
seq_str_rc = np.array([''.join(map(str, seq_rc[i]-7)) for i in range(len(seq_rc))])
print(seq_str_rc.shape)

(1937,)


In [54]:
seq_tokenized_rc = np.zeros((len(seq_str_rc), 131072-7), dtype=np.uint16)
for i,testseq in tqdm(enumerate(seq_str_rc), total=len(seq_str_rc)):
    tokenized_output = tokenizer.kmer_tokenize([testseq])
    seq_tokenized_rc[i] = np.array(tokenized_output[0][:-7], dtype=np.uint16)

100%|██████████| 1937/1937 [04:28<00:00,  7.22it/s]


In [55]:
#let's save it out
np.savez(f'/data/leslie/sarthak/data/enformer/data/{split}_seq_kmer_8.npz', sequence_array=seq_tokenized, sequence_array_rc=seq_tokenized_rc)

In [56]:
#and make it a function that takes in a split
def tokenize_kmer(split):
    data_path=f'/data/leslie/sarthak/data/enformer/data/{split}_seq.npz'
    seq_data = np.load(data_path)
    seq = np.array(seq_data['sequence_array'])
    seq_rc = np.array(seq_data['sequence_array_rc'])
    print('Loaded data, now converting to strings')
    seq_str = np.array([''.join(map(str, seq[i]-7)) for i in range(len(seq))])
    seq_str_rc = np.array([''.join(map(str, seq_rc[i]-7)) for i in range(len(seq_rc))])
    tokenizer = KmerTokenizer(kmerlen=8, overlapping=True, maxlen=131072)
    seq_tokenized = np.zeros((len(seq_str), 131072-7), dtype=np.uint16)
    for i,testseq in tqdm(enumerate(seq_str), total=len(seq_str)):
        tokenized_output = tokenizer.kmer_tokenize([testseq])
        seq_tokenized[i] = np.array(tokenized_output[0][:-7], dtype=np.uint16)
    seq_tokenized_rc = np.zeros((len(seq_str_rc), 131072-7), dtype=np.uint16)
    for i,testseq in tqdm(enumerate(seq_str_rc), total=len(seq_str_rc)):
        tokenized_output = tokenizer.kmer_tokenize([testseq])
        seq_tokenized_rc[i] = np.array(tokenized_output[0][:-7], dtype=np.uint16)
    np.savez(f'/data/leslie/sarthak/data/enformer/data/{split}_seq_kmer_8.npz', sequence_array=seq_tokenized, sequence_array_rc=seq_tokenized_rc)

In [57]:
#now let's do it for train and val too
split = 'val'
tokenize_kmer(split)

Loaded data, now converting to strings


100%|██████████| 2213/2213 [05:07<00:00,  7.21it/s]
100%|██████████| 2213/2213 [05:08<00:00,  7.17it/s]


In [58]:
split = 'train'
tokenize_kmer(split)
#ven turning to strings is quite slow!
#this took a while and a lot of memory

Loaded data, now converting to strings


# some modifications to make it work!

In [1]:
import itertools
from transformers import PreTrainedTokenizer
import json
import os
import numpy as np
from tqdm import tqdm

#now tokenize it
class KmerTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file=None, kmerlen=6, overlapping=True, maxlen=400, **kwargs):
        self.kmerlen = kmerlen
        self.overlapping = overlapping
        self.maxlen = maxlen
        
        # Initialize vocabulary
        self.VOCAB = [''.join(i) for i in itertools.product(*(['0123'] * int(self.kmerlen)))]
        self.VOCAB_SIZE = len(self.VOCAB) + 5
        
        self.tokendict = dict(zip(self.VOCAB, range(0, self.VOCAB_SIZE)))
        self.tokendict['[CLS]'] = 0
        self.tokendict['[SEP]'] = 1
        self.tokendict['[BOS]'] = 2
        self.tokendict['[MASK]'] = 3
        self.tokendict['[PAD]'] = 4
        self.tokendict['[EOS]'] = 5 #was reserved above
        self.tokendict['[UNK]'] = 6
        
        
        
        super().__init__(**kwargs)

    def _tokenize(self, text):
        tokens = []
        stoprange = len(text) - (self.kmerlen - 1)
        if self.overlapping:
            for k in range(0, stoprange):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('0123'):
                    tokens.append(kmer)
        else:
            for k in range(0, stoprange, self.kmerlen):
                kmer = text[k:k + self.kmerlen]
                if set(kmer).issubset('0123'):
                    tokens.append(kmer)
        return tokens

    def _convert_token_to_id(self, token):
        return self.tokendict.get(token, self.tokendict['[UNK]'])

    def _convert_id_to_token(self, index):
        inv_tokendict = {v: k for k, v in self.tokendict.items()}
        return inv_tokendict.get(index, '[UNK]')

    def convert_tokens_to_string(self, tokens):
        return ' '.join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']]
        return [self.tokendict['[CLS]']] + token_ids_0 + [self.tokendict['[SEP]']] + token_ids_1 + [self.tokendict['[SEP]']]

    def get_vocab(self):
        return self.tokendict

    def kmer_tokenize(self, seq_list):
        seq_ind_list = []
        for seq in seq_list:
            tokens = self._tokenize(seq)
            token_ids = [self._convert_token_to_id(token) for token in tokens]
            if len(token_ids) < self.maxlen:
                token_ids.extend([self.tokendict['[PAD]']] * (self.maxlen - len(token_ids)))
            else:
                token_ids = token_ids[:self.maxlen]
            seq_ind_list.append(token_ids)
        return seq_ind_list

    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory)
        
        vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + 'vocab.json')
        
        with open(vocab_file, 'w') as f:
            json.dump(self.tokendict, f)
        
        return (vocab_file,)

    def save_pretrained(self, save_directory, **kwargs):
        special_tokens_map_file = os.path.join(save_directory, "special_tokens_map.json")
        with open(special_tokens_map_file, "w") as f:
            json.dump({
                "kmerlen": self.kmerlen,
                "overlapping": self.overlapping,
                "maxlen": self.maxlen
            }, f)
        vocab_files = self.save_vocabulary(save_directory)
        return (special_tokens_map_file,) + vocab_files

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # Load tokenizer using the parent class method
        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # Load special tokens map
        special_tokens_map_file = os.path.join(pretrained_model_name_or_path, "special_tokens_map.json")
        if os.path.isfile(special_tokens_map_file):
            with open(special_tokens_map_file, "r") as f:
                special_tokens_map = json.load(f)
            tokenizer.kmerlen = special_tokens_map.get("kmerlen", 6)
            tokenizer.overlapping = special_tokens_map.get("overlapping", True)
            tokenizer.maxlen = special_tokens_map.get("maxlen", 400)
        
        # Load vocabulary
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.isfile(vocab_file):
            with open(vocab_file, "r") as f:
                tokendict = json.load(f)
            tokenizer.tokendict = tokendict
        
        return tokenizer

In [3]:
split = 'test'
data_path=f'/data/leslie/sarthak/data/enformer/data/{split}_seq.npz'
seq_data = np.load(data_path)
seq = np.array(seq_data['sequence_array'])
seq_rc = np.array(seq_data['sequence_array_rc'])
print('Loaded data, now converting to strings')
seq_str = np.array([''.join(map(str, seq[i]-7)) for i in range(len(seq))])
seq_str_rc = np.array([''.join(map(str, seq_rc[i]-7)) for i in range(len(seq_rc))])
tokenizer = KmerTokenizer(kmerlen=8, overlapping=True, maxlen=131072)

Loaded data, now converting to strings


In [4]:
#so previously we used the tokenizer kmer function, what if we use the other one
testseq = seq_str[0]
print(testseq[:20])

30333333300233211033


In [7]:
out1 = tokenizer._tokenize(testseq) #ok so this doesn't work as it just kmerizes it but needs ot be tokenized
out1

['30333333',
 '03333333',
 '33333330',
 '33333300',
 '33333002',
 '33330023',
 '33300233',
 '33002332',
 '30023321',
 '00233211',
 '02332110',
 '23321103',
 '33211033',
 '32110333',
 '21103332',
 '11033321',
 '10333213',
 '03332133',
 '33321332',
 '33213320',
 '32133203',
 '21332032',
 '13320321',
 '33203213',
 '32032133',
 '20321330',
 '03213300',
 '32133002',
 '21330020',
 '13300201',
 '33002011',
 '30020110',
 '00201100',
 '02011000',
 '20110003',
 '01100032',
 '11000320',
 '10003200',
 '00032000',
 '00320003',
 '03200031',
 '32000310',
 '20003101',
 '00031011',
 '00310110',
 '03101101',
 '31011010',
 '10110100',
 '01101000',
 '11010003',
 '10100030',
 '01000302',
 '10003020',
 '00030200',
 '00302003',
 '03020033',
 '30200331',
 '02003310',
 '20033103',
 '00331032',
 '03310322',
 '33103220',
 '31032200',
 '10322002',
 '03220022',
 '32200220',
 '22002200',
 '20022002',
 '00220022',
 '02200222',
 '22002221',
 '20022211',
 '00222110',
 '02221103',
 '22211031',
 '22110310',
 '21103101',

In [8]:
len(out1) #cuz exclude last 7

131065

In [9]:
#compare that with this
out2 = tokenizer.kmer_tokenize([testseq])
out2

[[53247,
  16383,
  65532,
  65520,
  65474,
  65291,
  64559,
  61630,
  49913,
  3045,
  12180,
  48723,
  63823,
  58687,
  38142,
  21497,
  20455,
  16287,
  65150,
  63992,
  59363,
  40846,
  32313,
  63719,
  58271,
  36476,
  14832,
  59330,
  40712,
  31777,
  61573,
  49684,
  2128,
  8512,
  34051,
  5134,
  20536,
  16608,
  896,
  3587,
  14349,
  57396,
  32977,
  837,
  3348,
  13393,
  53572,
  17680,
  5184,
  20739,
  17420,
  4146,
  16584,
  800,
  3203,
  12815,
  51261,
  8436,
  33747,
  3918,
  15674,
  62696,
  54176,
  20098,
  14858,
  59432,
  41120,
  33410,
  2570,
  10282,
  41129,
  33445,
  2708,
  10835,
  43341,
  42292,
  38097,
  21317,
  19734,
  13403,
  53615,
  17855,
  5884,
  23539,
  28620,
  48946,
  64713,
  62247,
  52383,
  12926,
  51704,
  10211,
  40847,
  32319,
  63740,
  58352,
  36800,
  16130,
  64523,
  61487,
  49343,
  767,
  3069,
  12276,
  49107,
  65358,
  64827,
  62701,
  54199,
  20191,
  15231,
  60925,
  47095,
  5731

In [2]:
#also let's look at self.tokendict
tokenizer = KmerTokenizer(kmerlen=8, overlapping=True, maxlen=131072)
tokenizer.tokendict

{'00000000': 0,
 '00000001': 1,
 '00000002': 2,
 '00000003': 3,
 '00000010': 4,
 '00000011': 5,
 '00000012': 6,
 '00000013': 7,
 '00000020': 8,
 '00000021': 9,
 '00000022': 10,
 '00000023': 11,
 '00000030': 12,
 '00000031': 13,
 '00000032': 14,
 '00000033': 15,
 '00000100': 16,
 '00000101': 17,
 '00000102': 18,
 '00000103': 19,
 '00000110': 20,
 '00000111': 21,
 '00000112': 22,
 '00000113': 23,
 '00000120': 24,
 '00000121': 25,
 '00000122': 26,
 '00000123': 27,
 '00000130': 28,
 '00000131': 29,
 '00000132': 30,
 '00000133': 31,
 '00000200': 32,
 '00000201': 33,
 '00000202': 34,
 '00000203': 35,
 '00000210': 36,
 '00000211': 37,
 '00000212': 38,
 '00000213': 39,
 '00000220': 40,
 '00000221': 41,
 '00000222': 42,
 '00000223': 43,
 '00000230': 44,
 '00000231': 45,
 '00000232': 46,
 '00000233': 47,
 '00000300': 48,
 '00000301': 49,
 '00000302': 50,
 '00000303': 51,
 '00000310': 52,
 '00000311': 53,
 '00000312': 54,
 '00000313': 55,
 '00000320': 56,
 '00000321': 57,
 '00000322': 58,
 '00000

In [3]:
keys_list = list(tokenizer.tokendict.keys())
print(keys_list[-10:])

['33333331', '33333332', '33333333', '[CLS]', '[SEP]', '[BOS]', '[MASK]', '[PAD]', '[EOS]', '[UNK]']


In [4]:
len(keys_list) #65536 is the number we actually use from the token dict
#but it's fine just tokenizes from that to here, but we don't care about it anyways!

65543

In [5]:
#creat a smaller length kmer tokenizer
tokenizer = KmerTokenizer(kmerlen=3, overlapping=True, maxlen=4096)
tokenizer.tokendict

{'000': 0,
 '001': 1,
 '002': 2,
 '003': 3,
 '010': 4,
 '011': 5,
 '012': 6,
 '013': 7,
 '020': 8,
 '021': 9,
 '022': 10,
 '023': 11,
 '030': 12,
 '031': 13,
 '032': 14,
 '033': 15,
 '100': 16,
 '101': 17,
 '102': 18,
 '103': 19,
 '110': 20,
 '111': 21,
 '112': 22,
 '113': 23,
 '120': 24,
 '121': 25,
 '122': 26,
 '123': 27,
 '130': 28,
 '131': 29,
 '132': 30,
 '133': 31,
 '200': 32,
 '201': 33,
 '202': 34,
 '203': 35,
 '210': 36,
 '211': 37,
 '212': 38,
 '213': 39,
 '220': 40,
 '221': 41,
 '222': 42,
 '223': 43,
 '230': 44,
 '231': 45,
 '232': 46,
 '233': 47,
 '300': 48,
 '301': 49,
 '302': 50,
 '303': 51,
 '310': 52,
 '311': 53,
 '312': 54,
 '313': 55,
 '320': 56,
 '321': 57,
 '322': 58,
 '323': 59,
 '330': 60,
 '331': 61,
 '332': 62,
 '333': 63,
 '[CLS]': 0,
 '[SEP]': 1,
 '[BOS]': 2,
 '[MASK]': 3,
 '[PAD]': 4,
 '[EOS]': 5,
 '[UNK]': 6}

In [None]:
#if we had these tokens would map to this, but we don't, so it's all good!

# testing dataset

In [1]:
import sys
sys.path.append('/data/leslie/sarthak/hyena/hyena-dna')
import src.dataloaders.datasets.enformer_dataset as enformer_dataset
dataset = enformer_dataset.EnformerDataset('test', 131_072, rc_aug = True, kmer_len = 8)

In [2]:
dataset.seq.shape #this shape looks right

(1937, 131065)

In [3]:
ex1 = dataset[0]
print(ex1[0].shape)
print(ex1[1].shape)
#padded because we input length as longer so it pads, but if we don't want to pad, just input the smaller 131065 length

torch.Size([131072])
torch.Size([896, 4675])
