# mamba requires making a RC dictionary or complement map

In [1]:
#it's quite simple for ACGT, but what about for kmers? let's start with an easy example

seq = 'ACGTACGTTTTTNATCTTCTTT' # short example sequence containing an ambiguous base (N)
def reverse_complement(seq):
    """Return the reverse complement of a nucleotide sequence as a string.

    Walks ``seq`` from its last element to its first and swaps each base for
    its partner (A<->T, C<->G); ``N`` maps to itself. ``seq`` may be a string
    or a tuple/list of single-character bases. Any base outside ``ACGTN``
    raises ``KeyError``.
    """
    pairing = dict(zip('ACGTN', 'TGCAN'))
    flipped = []
    for base in reversed(seq):
        flipped.append(pairing[base])
    return ''.join(flipped)

# Show the forward strand and, on the next line, its reverse complement.
print(seq, reverse_complement(seq), sep='\n')


ACGTACGTTTTTNATCTTCTTT
AAAGAAGATNAAAAACGTACGT


In [3]:
#so if every base is its own token, it's easy to find the RC
#but if each token is a k-mer, it's a bit harder
import itertools
import numpy as np
# numbers = (7, 8, 9, 10, 11)
numbers = tuple('ACGTN')
length = 6

# Lazily enumerate every ordered k-mer over the alphabet (the Cartesian
# product with repetition): len(numbers) ** length tuples of `length` symbols.
combinations = itertools.product(numbers, repeat=length)

# Pair each k-mer tuple with its position in that enumeration; the position
# is the k-mer's token id.
combination_dict = dict(zip(combinations, itertools.count()))
combination_dict.keys()

dict_keys([('A', 'A', 'A', 'A', 'A', 'A'), ('A', 'A', 'A', 'A', 'A', 'C'), ('A', 'A', 'A', 'A', 'A', 'G'), ('A', 'A', 'A', 'A', 'A', 'T'), ('A', 'A', 'A', 'A', 'A', 'N'), ('A', 'A', 'A', 'A', 'C', 'A'), ('A', 'A', 'A', 'A', 'C', 'C'), ('A', 'A', 'A', 'A', 'C', 'G'), ('A', 'A', 'A', 'A', 'C', 'T'), ('A', 'A', 'A', 'A', 'C', 'N'), ('A', 'A', 'A', 'A', 'G', 'A'), ('A', 'A', 'A', 'A', 'G', 'C'), ('A', 'A', 'A', 'A', 'G', 'G'), ('A', 'A', 'A', 'A', 'G', 'T'), ('A', 'A', 'A', 'A', 'G', 'N'), ('A', 'A', 'A', 'A', 'T', 'A'), ('A', 'A', 'A', 'A', 'T', 'C'), ('A', 'A', 'A', 'A', 'T', 'G'), ('A', 'A', 'A', 'A', 'T', 'T'), ('A', 'A', 'A', 'A', 'T', 'N'), ('A', 'A', 'A', 'A', 'N', 'A'), ('A', 'A', 'A', 'A', 'N', 'C'), ('A', 'A', 'A', 'A', 'N', 'G'), ('A', 'A', 'A', 'A', 'N', 'T'), ('A', 'A', 'A', 'A', 'N', 'N'), ('A', 'A', 'A', 'C', 'A', 'A'), ('A', 'A', 'A', 'C', 'A', 'C'), ('A', 'A', 'A', 'C', 'A', 'G'), ('A', 'A', 'A', 'C', 'A', 'T'), ('A', 'A', 'A', 'C', 'A', 'N'), ('A', 'A', 'A', 'C', 'C', 'A'

In [12]:
#now let's tokenize it
# Slide a window of `length` bases along `seq`, one position at a time, and
# record the token id that `combination_dict` assigns to each window.
tokenized = []
# range(len(seq) - length + 1) yields exactly one start index per full window.
# The slice form seq[:-length+1] turns into seq[:0] (empty) when length == 1,
# which would silently produce no tokens at all.
for idx in range(len(seq) - length + 1):
    shortseq = seq[idx:idx+length]
    print(shortseq)
    token = combination_dict[tuple(shortseq)]  # single lookup, reused below
    print(token)
    print('')
    tokenized.append(token)

ACGTAC
951

CGTACG
4757

GTACGT
8163

TACGTT
9568

ACGTTT
968

CGTTTT
4843

GTTTTT
8593

TTTTTN
11719

TTTTNA
11720

TTTNAT
11728

TTNATC
11766

TNATCT
11958

NATCTT
12918

ATCTTC
2091

TCTTCT
10458

CTTCTT
5418

TTCTTT
11468



In [13]:
# Tokenize the reverse-complemented sequence with the same sliding window so
# it can be compared with the forward tokenization.
tokenized_rc = []
rc_seq = reverse_complement(seq)
# One start index per full window; the slice form rc_seq[:-length+1] is empty
# when length == 1, so iterate over an explicit range instead.
for idx in range(len(rc_seq) - length + 1):
    shortseq = rc_seq[idx:idx+length]
    print(shortseq)
    token = combination_dict[tuple(shortseq)]  # single lookup, reused below
    print(token)
    print('')
    tokenized_rc.append(token)

AAAGAA
50

AAGAAG
252

AGAAGA
1260

GAAGAT
6303

AAGATN
269

AGATNA
1345

GATNAA
6725

ATNAAA
2375

TNAAAA
11875

NAAAAA
12500

AAAAAC
1

AAAACG
7

AAACGT
38

AACGTA
190

ACGTAC
951

CGTACG
4757

GTACGT
8163



In [14]:
#forward tokens on the first line, tokens of the reverse complement on the second
print(tokenized, tokenized_rc, sep='\n')

[951, 4757, 8163, 9568, 968, 4843, 8593, 11719, 11720, 11728, 11766, 11958, 12918, 2091, 10458, 5418, 11468]
[50, 252, 1260, 6303, 269, 1345, 6725, 2375, 11875, 12500, 1, 7, 38, 190, 951, 4757, 8163]


In [15]:
#now we need a way to reverse complement the tokenized sequence directly... this will be tough!
#I wrote it out by hand and I think we can RC each individual token and then reverse the whole token sequence, which means the map just sends each k-mer token to the token of its RC
#let's test with 2-mers

numbers = tuple('ACGTN')
length = 2

# Enumerate every ordered 2-mer over the alphabet (Cartesian product with
# repetition), i.e. len(numbers) ** length tuples.
combinations = itertools.product(numbers, repeat=length)

# Token id of a k-mer = its position in the enumeration above.
combination_dict = dict(zip(combinations, itertools.count()))
combination_dict.keys()

dict_keys([('A', 'A'), ('A', 'C'), ('A', 'G'), ('A', 'T'), ('A', 'N'), ('C', 'A'), ('C', 'C'), ('C', 'G'), ('C', 'T'), ('C', 'N'), ('G', 'A'), ('G', 'C'), ('G', 'G'), ('G', 'T'), ('G', 'N'), ('T', 'A'), ('T', 'C'), ('T', 'G'), ('T', 'T'), ('T', 'N'), ('N', 'A'), ('N', 'C'), ('N', 'G'), ('N', 'T'), ('N', 'N')])

In [16]:
seq = 'ACGTACNNNNACCTGTGAAACGTNCTNNNTTCCAACCCGTTT'
rc_seq = reverse_complement(seq)

# Tokenize the sequence and its reverse complement with the same sliding
# window. Both strands have the same length, so one index drives both.
tokenized = []
tokenized_rc = []
# range(len(seq) - length + 1) gives one start index per full window; the
# slice form seq[:-length+1] is empty when length == 1.
for idx in range(len(seq) - length + 1):
    shortseq = seq[idx:idx+length]
    tokenized.append(combination_dict[tuple(shortseq)])
    shortseq_rc = rc_seq[idx:idx+length]
    tokenized_rc.append(combination_dict[tuple(shortseq_rc)])

print(tokenized)
print(tokenized_rc)

[1, 7, 13, 15, 1, 9, 24, 24, 24, 20, 1, 6, 8, 17, 13, 17, 10, 0, 0, 1, 7, 13, 19, 21, 8, 19, 24, 24, 23, 18, 16, 6, 5, 0, 1, 6, 6, 7, 13, 18, 18]
[0, 0, 1, 7, 12, 12, 13, 18, 17, 12, 10, 0, 4, 24, 24, 20, 2, 14, 20, 1, 7, 13, 18, 18, 16, 5, 1, 5, 2, 12, 13, 19, 24, 24, 24, 22, 13, 15, 1, 7, 13]


In [20]:
# First attempt: pair every k-mer tuple with the tuple of its reverse complement.
complement_map = {
    kmer: tuple(reverse_complement(kmer))
    for kmer in combination_dict
}
print(complement_map)

{('A', 'A'): ('T', 'T'), ('A', 'C'): ('G', 'T'), ('A', 'G'): ('C', 'T'), ('A', 'T'): ('A', 'T'), ('A', 'N'): ('N', 'T'), ('C', 'A'): ('T', 'G'), ('C', 'C'): ('G', 'G'), ('C', 'G'): ('C', 'G'), ('C', 'T'): ('A', 'G'), ('C', 'N'): ('N', 'G'), ('G', 'A'): ('T', 'C'), ('G', 'C'): ('G', 'C'), ('G', 'G'): ('C', 'C'), ('G', 'T'): ('A', 'C'), ('G', 'N'): ('N', 'C'), ('T', 'A'): ('T', 'A'), ('T', 'C'): ('G', 'A'), ('T', 'G'): ('C', 'A'), ('T', 'T'): ('A', 'A'), ('T', 'N'): ('N', 'A'), ('N', 'A'): ('T', 'N'), ('N', 'C'): ('G', 'N'), ('N', 'G'): ('C', 'N'), ('N', 'T'): ('A', 'N'), ('N', 'N'): ('N', 'N')}


In [24]:
#no that's wrong: the map should be keyed by token ids (the values of
#combination_dict), not by the k-mer tuples themselves
# token id of a k-mer -> token id of that k-mer's reverse complement
complement_map = {
    token: combination_dict[tuple(reverse_complement(kmer))]
    for kmer, token in combination_dict.items()
}

In [25]:
complement_map
#pairs worked out by hand: 1 <-> 13, 0 <-> 18, and 15 -> 15 (its own RC)
#all of those line up here!!

{0: 18,
 1: 13,
 2: 8,
 3: 3,
 4: 23,
 5: 17,
 6: 12,
 7: 7,
 8: 2,
 9: 22,
 10: 16,
 11: 11,
 12: 6,
 13: 1,
 14: 21,
 15: 15,
 16: 10,
 17: 5,
 18: 0,
 19: 20,
 20: 19,
 21: 14,
 22: 9,
 23: 4,
 24: 24}

In [26]:
# Send every forward token to its RC token; reading that list back to front
# should reproduce the tokenization of the reverse-complemented sequence.
newrc = [complement_map[token] for token in tokenized]
print(newrc[::-1])
print(tokenized_rc)
#hey this works perfectly!! I assume this will translate to higher kmers as well

[0, 0, 1, 7, 12, 12, 13, 18, 17, 12, 10, 0, 4, 24, 24, 20, 2, 14, 20, 1, 7, 13, 18, 18, 16, 5, 1, 5, 2, 12, 13, 19, 24, 24, 24, 22, 13, 15, 1, 7, 13]
[0, 0, 1, 7, 12, 12, 13, 18, 17, 12, 10, 0, 4, 24, 24, 20, 2, 14, 20, 1, 7, 13, 18, 18, 16, 5, 1, 5, 2, 12, 13, 19, 24, 24, 24, 22, 13, 15, 1, 7, 13]


In [27]:
#let's test it with the numbers as well
import itertools
import numpy as np
numbers = tuple(range(7, 12))
length = 6

# Enumerate every ordered 6-mer over the integer base codes (Cartesian
# product with repetition), i.e. len(numbers) ** length tuples.
combinations = itertools.product(numbers, repeat=length)

# Token id of a k-mer = its position in the enumeration above.
combination_dict = dict(zip(combinations, itertools.count()))
combination_dict.keys()

dict_keys([(7, 7, 7, 7, 7, 7), (7, 7, 7, 7, 7, 8), (7, 7, 7, 7, 7, 9), (7, 7, 7, 7, 7, 10), (7, 7, 7, 7, 7, 11), (7, 7, 7, 7, 8, 7), (7, 7, 7, 7, 8, 8), (7, 7, 7, 7, 8, 9), (7, 7, 7, 7, 8, 10), (7, 7, 7, 7, 8, 11), (7, 7, 7, 7, 9, 7), (7, 7, 7, 7, 9, 8), (7, 7, 7, 7, 9, 9), (7, 7, 7, 7, 9, 10), (7, 7, 7, 7, 9, 11), (7, 7, 7, 7, 10, 7), (7, 7, 7, 7, 10, 8), (7, 7, 7, 7, 10, 9), (7, 7, 7, 7, 10, 10), (7, 7, 7, 7, 10, 11), (7, 7, 7, 7, 11, 7), (7, 7, 7, 7, 11, 8), (7, 7, 7, 7, 11, 9), (7, 7, 7, 7, 11, 10), (7, 7, 7, 7, 11, 11), (7, 7, 7, 8, 7, 7), (7, 7, 7, 8, 7, 8), (7, 7, 7, 8, 7, 9), (7, 7, 7, 8, 7, 10), (7, 7, 7, 8, 7, 11), (7, 7, 7, 8, 8, 7), (7, 7, 7, 8, 8, 8), (7, 7, 7, 8, 8, 9), (7, 7, 7, 8, 8, 10), (7, 7, 7, 8, 8, 11), (7, 7, 7, 8, 9, 7), (7, 7, 7, 8, 9, 8), (7, 7, 7, 8, 9, 9), (7, 7, 7, 8, 9, 10), (7, 7, 7, 8, 9, 11), (7, 7, 7, 8, 10, 7), (7, 7, 7, 8, 10, 8), (7, 7, 7, 8, 10, 9), (7, 7, 7, 8, 10, 10), (7, 7, 7, 8, 10, 11), (7, 7, 7, 8, 11, 7), (7, 7, 7, 8, 11, 8), (7, 7, 7, 8, 1

In [31]:
seq = [7,8,9,10,11,11,11,11,7,7,7,9,8,10,10,8,8]
# Integer-coded complement: 7 <-> 10, 8 <-> 9, and 11 maps to itself.
rc_map = {7:10, 8:9, 9:8, 10:7, 11:11}
rc_seq = [rc_map[base] for base in seq[::-1]]

# Tokenize the sequence and its reverse complement with the same sliding
# window. Both have the same length, so one index drives both.
tokenized = []
tokenized_rc = []
# range(len(seq) - length + 1) gives one start index per full window; the
# slice form seq[:-length+1] is empty when length == 1.
for idx in range(len(seq) - length + 1):
    shortseq = seq[idx:idx+length]
    tokenized.append(combination_dict[tuple(shortseq)])
    shortseq_rc = rc_seq[idx:idx+length]
    tokenized_rc.append(combination_dict[tuple(shortseq_rc)])

In [32]:
# forward sequence on the first line, its reverse complement on the second
print(seq, rc_seq, sep='\n')

[7, 8, 9, 10, 11, 11, 11, 11, 7, 7, 7, 9, 8, 10, 10, 8, 8]
[9, 9, 7, 7, 9, 8, 10, 10, 10, 11, 11, 11, 11, 7, 8, 9, 10]


In [33]:
# forward tokens on the first line, tokens of the reverse complement on the second
print(tokenized, tokenized_rc, sep='\n')

[974, 4874, 8749, 12495, 15600, 15500, 15002, 12511, 58, 293, 1466, 7331]
[7511, 6308, 293, 1468, 7344, 5474, 11749, 11874, 12495, 15601, 15507, 15038]


In [36]:
def rc_number(seq):
    """Reverse-complement a sequence of integer base codes.

    Codes pair up as 7<->10 and 8<->9, and 11 maps to itself (presumably
    7..11 stand for A, C, G, T, N -- confirm against the tokenizer). Returns a
    new list holding the partner of each code, ordered from the last element
    of ``seq`` to the first. Any other code raises ``KeyError``.
    """
    partner = {7: 10, 8: 9, 9: 8, 10: 7, 11: 11}
    flipped = []
    for code in reversed(seq):
        flipped.append(partner[code])
    return flipped

In [37]:
#now test it
# token id of a k-mer -> token id of that k-mer's reverse complement
complement_map = {
    token: combination_dict[tuple(rc_number(kmer))]
    for kmer, token in combination_dict.items()
}
# Map each forward token to its RC token; read back to front, the result
# should equal the tokenization of the reverse-complemented sequence.
newrc = [complement_map[token] for token in tokenized]
print(newrc[::-1])
print(tokenized_rc)

#hey man, seems we can define a complement map pretty easily...

[7511, 6308, 293, 1468, 7344, 5474, 11749, 11874, 12495, 15601, 15507, 15038]
[7511, 6308, 293, 1468, 7344, 5474, 11749, 11874, 12495, 15601, 15507, 15038]


In [39]:
#now for our 6mers, serialize the complement map to a json file
import json
# NOTE: JSON object keys are always strings, so the integer token ids used as
# keys here are written as '0', '1', ... and come back as str on load.
complement_map_json = json.dumps(complement_map)
with open('/data/leslie/sarthak/data/enformer/data/complement_map_6mer.json', 'w') as f:
    f.write(complement_map_json)

In [40]:
#now read it back in
# NOTE: the keys come back as strings ('0', '1', ...); the values are still ints.
with open('/data/leslie/sarthak/data/enformer/data/complement_map_6mer.json', 'r') as f:
    complement_map = json.loads(f.read())
print(complement_map)

{'0': 11718, '1': 8593, '2': 5468, '3': 2343, '4': 14843, '5': 11093, '6': 7968, '7': 4843, '8': 1718, '9': 14218, '10': 10468, '11': 7343, '12': 4218, '13': 1093, '14': 13593, '15': 9843, '16': 6718, '17': 3593, '18': 468, '19': 12968, '20': 12343, '21': 9218, '22': 6093, '23': 2968, '24': 15468, '25': 11593, '26': 8468, '27': 5343, '28': 2218, '29': 14718, '30': 10968, '31': 7843, '32': 4718, '33': 1593, '34': 14093, '35': 10343, '36': 7218, '37': 4093, '38': 968, '39': 13468, '40': 9718, '41': 6593, '42': 3468, '43': 343, '44': 12843, '45': 12218, '46': 9093, '47': 5968, '48': 2843, '49': 15343, '50': 11468, '51': 8343, '52': 5218, '53': 2093, '54': 14593, '55': 10843, '56': 7718, '57': 4593, '58': 1468, '59': 13968, '60': 10218, '61': 7093, '62': 3968, '63': 843, '64': 13343, '65': 9593, '66': 6468, '67': 3343, '68': 218, '69': 12718, '70': 12093, '71': 8968, '72': 5843, '73': 2718, '74': 15218, '75': 11343, '76': 8218, '77': 5093, '78': 1968, '79': 14468, '80': 10718, '81': 7593, 

In [43]:
complement_map['293'] #after the JSON round-trip the keys are strings; the values are still ints

293

In [46]:
#look up which k-mer has token id 293 in the combination dict
for kmer, token in combination_dict.items():
    if token == 293:
        print(kmer)
#the RC of 7,7,9,8,10,10 is again 7,7,9,8,10,10 (it is its own RC), which matches complement_map['293'] == 293, so we're good!!!

(7, 7, 9, 8, 10, 10)


# making one for 8mer

In [1]:
#let's test it with the numbers as well
import itertools
import numpy as np
numbers = tuple(range(7, 12))
length = 8

# Enumerate every ordered 8-mer over the integer base codes (Cartesian
# product with repetition), i.e. len(numbers) ** length tuples.
combinations = itertools.product(numbers, repeat=length)

# Token id of a k-mer = its position in the enumeration above.
combination_dict = dict(zip(combinations, itertools.count()))
print(len(combination_dict)) #should be 5 ** 8

390625


In [2]:
def rc_number(seq):
    """Return the reverse complement of a sequence of integer base codes.

    The codes pair up as 7<->10 and 8<->9, with 11 mapping to itself. The
    result is a new list: each element of ``seq`` replaced by its partner,
    in reversed order. A code outside 7..11 raises ``KeyError``.
    """
    rc_map = {7: 10, 8: 9, 9: 8, 10: 7, 11: 11}
    return list(map(rc_map.__getitem__, reversed(seq)))
#build the 8-mer complement map
# token id of a k-mer -> token id of that k-mer's reverse complement
complement_map = {
    token: combination_dict[tuple(rc_number(kmer))]
    for kmer, token in combination_dict.items()
}
# newrc = []
# for token in tokenized:
#     newrc.append(complement_map[token])
# print(newrc[::-1])
# print(tokenized_rc)

In [4]:
complement_map[1]  # RC token id of token 1; the in-memory map is keyed by int

214843

In [5]:
#serialize the 8-mer complement map to json
import json
# NOTE: JSON object keys are always strings, so the integer token ids used as
# keys here are written as '0', '1', ... and come back as str on load.
complement_map_json = json.dumps(complement_map)
with open('/data/leslie/sarthak/data/enformer/data/complement_map_8mer.json', 'w') as f:
    f.write(complement_map_json)

In [6]:
#now read it back
# NOTE: the keys come back as strings ('0', '1', ...); the values are still ints.
with open('/data/leslie/sarthak/data/enformer/data/complement_map_8mer.json', 'r') as f:
    complement_map = json.loads(f.read())

In [8]:
complement_map['1'] #keys are str after json.load (they were int before the dump), so index with str(token) or convert the keys back with int()

214843