Problem Statement.

The DNA sequence is composed of a series of nucleotides abbreviated as 'A', 'C', 'G', and 'T'.

    For example, "ACGAATTCCG" is a DNA sequence.

When studying DNA, it is useful to identify repeated sequences within the DNA.

Given a string s that represents a DNA sequence, return all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.

 

Example 1:

Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC","CCCCCAAAAA"]

Example 2:

Input: s = "AAAAAAAAAAAAA"
Output: ["AAAAAAAAAA"]

 

Constraints:

    1 <= s.length <= 10^5
    s[i] is either 'A', 'C', 'G', or 'T'.

# Bitmask - O((N - L) * L) runtime, O((N - L) * L) space

In [3]:
from typing import List

class Solution:
    """Repeated-DNA-sequence finder based on a rolling 2-bit-per-letter bitmask."""

    def findRepeatedDnaSequences(self, s: str) -> List[str]:
        """Return every 10-letter substring of ``s`` that occurs more than once.

        Each nucleotide is encoded in 2 bits, so a 10-letter window fits in a
        20-bit integer. Sliding the window by one letter is a shift, an OR and
        a mask, so no window is ever re-scanned.

        Args:
            s: DNA sequence made up of the letters 'A', 'C', 'G' and 'T'.

        Returns:
            The distinct repeated substrings, ordered by the position at which
            each one is first seen for the second time. Empty when ``s`` has
            10 letters or fewer (no two windows can exist).

        Raises:
            KeyError: if ``s`` is longer than 10 letters and contains a
                character other than 'A', 'C', 'G' or 'T'.
        """
        L, n = 10, len(s)
        if n <= L:
            return []

        to_int = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        # Direct indexing fails fast on an invalid letter instead of letting a
        # None leak into the bit arithmetic below.
        nums = [to_int[ch] for ch in s]

        # Keeps only the low 2 * L bits, i.e. drops the letter that just left
        # the window after the left shift.
        window_mask = (1 << 2 * L) - 1

        # Encode the first window.
        bitmask = 0
        for code in nums[:L]:
            bitmask = (bitmask << 2) | code

        seen = {bitmask}
        # A dict is used as an insertion-ordered set so the result order is
        # reproducible from run to run.
        output = {}
        for start in range(1, n - L + 1):
            bitmask = ((bitmask << 2) | nums[start + L - 1]) & window_mask

            if bitmask in seen:
                output[s[start:start + L]] = None
            seen.add(bitmask)

        return list(output)

# Rabin-Karp - Rolling Hash - O((N - L) * L) runtime, O((N - L) * L) space

In [1]:
from typing import List

class Solution:
    """Repeated-DNA-sequence finder based on a Rabin-Karp style rolling hash."""

    def findRepeatedDnaSequences(self, s: str) -> List[str]:
        """Return every 10-letter substring of ``s`` that occurs more than once.

        Each window is read as a 10-digit base-4 number. Sliding the window by
        one letter multiplies by the base, removes the contribution of the
        letter that left and adds the letter that entered. No modulus is
        applied, so two windows share a hash only when they are identical.

        Args:
            s: DNA sequence made up of the letters 'A', 'C', 'G' and 'T'.

        Returns:
            The distinct repeated substrings, ordered by the position at which
            each one is first seen for the second time. Empty when ``s`` has
            10 letters or fewer (no two windows can exist).

        Raises:
            KeyError: if ``s`` is longer than 10 letters and contains a
                character other than 'A', 'C', 'G' or 'T'.
        """
        L, n = 10, len(s)
        if n <= L:
            return []

        a = 4            # base: one digit per nucleotide
        aL = pow(a, L)   # weight of the outgoing digit after multiplying by a

        to_int = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        # Direct indexing fails fast on an invalid letter instead of letting a
        # None leak into the hash arithmetic below.
        nums = [to_int[ch] for ch in s]

        # Hash of the first window.
        h = 0
        for digit in nums[:L]:
            h = h * a + digit

        seen = {h}
        # A dict is used as an insertion-ordered set so the result order is
        # reproducible from run to run.
        output = {}
        for start in range(1, n - L + 1):
            # Shift left by one digit, drop the outgoing one, append the new one.
            h = h * a - nums[start - 1] * aL + nums[start + L - 1]

            if h in seen:
                output[s[start:start + L]] = None
            seen.add(h)

        return list(output)

In [4]:
# Run the solver on the Example 1 input from the problem statement; the call is
# left as a bare trailing expression so its return value is what gets shown.
instance = Solution()
instance.findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT")

['AAAAACCCCC', 'CCCCCAAAAA']