<a href="https://colab.research.google.com/github/Thrishankkuntimaddi/Data-Structures-and-Algorithms-Advanced/blob/main/8%20-%20String.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview of Pattern Searching

I/P : txt = "GEEKSFORGEEKS" ; pat = "EKS"

O/P : 2 10

I/P : txt = "AAAA" ; pat = "AAA"

O/P : 0 1

m -> pattern length

n -> Text length

1 <= m <= n

----------------

-> No Preprocessing

Naive : O((n-m+1) * m)

Naive when all characters of pattern are Distinct : O(n)

----------------

-> Preprocessing Pattern

Rabin-Karp : O((n-m+1)*m) in the worst case, but better than Naive on average

----------------

-> Preprocessing Text

Suffix Tree : O(m)

# Pattern Searching

I/P : text = "geeks for geeks" ; pat = "geeks"

O/P : 0 10

In [None]:
# Naive Solution

def find(txt, pat):
  """Print each index at which pat occurs in txt, one index per line.

  Every new search resumes one character after the previous hit, so
  overlapping occurrences are reported as well.
  """
  start = 0

  while True:
    hit = txt.find(pat, start)
    if hit < 0:
      return
    print(hit)
    start = hit + 1

# Demo: list every occurrence of the pattern in the sample sentence.
text, pat = "geeks for geeks", "geeks"
find(txt=text, pat=pat)

# Time Complexity : O(n×(n−m+1))
# Space Complexity : O(n+m)

0
10


In [None]:
# Super Naive

def naivepat(txt, pat):
  """Print, space-separated, every index at which pat occurs in txt.

  Each alignment of the pattern is compared character by character.
  An empty pattern prints nothing.
  """
  pat_len = len(pat)
  txt_len = len(txt)

  for start in range(txt_len - pat_len + 1):
    # the pat_len guard keeps the empty-pattern case silent
    if pat_len and all(pat[k] == txt[start + k] for k in range(pat_len)):
      print(start, end = " ")

# Demo: exercise naivepat, the function defined in this cell.
text = "geeks for geeks"
pat = "geeks"
naivepat(text, pat)

# Time Complexity : O((n−m+1) * m)
# Space Complexity : O(1)

0
10


In [None]:
# Improved Naive for Distinct in Pattern

def distpat(txt, pat):
  """Print, space-separated, every index at which pat occurs in txt.

  Precondition: all characters of pat are distinct. Under that condition a
  mismatch after j matched characters means no occurrence can start inside
  those j characters, so the window jumps ahead by j instead of by 1.

  An empty pattern prints nothing.
  """
  m, n = len(pat), len(txt)
  if m == 0:
    return

  i = 0
  while i <= n - m:
    # Count how many leading pattern characters match at alignment i.
    # A while loop is used so that j reaches m on a full match.
    j = 0
    while j < m and pat[j] == txt[i + j]:
      j += 1

    if j == m:
      print(i, end = ' ')

    # Skip the matched prefix, but always advance by at least one position.
    i += max(j, 1)

# Demo: exercise distpat, the function defined in this cell.
# NOTE(review): "geeks" repeats 'e', so it does not satisfy the
# distinct-character premise in this cell's title; consider a pattern
# such as "for" here.
text = "geeks for geeks"
pat = "geeks"
distpat(text, pat)

# Time Complexity : O(n−m+1)
# Space Complexity : O(1)

0
10


# Rabin Karp Algorithm

1) Like Naive Algorithm, slide the pattern one by one

2) Compare the hash value of the pattern with that of the current text window. Only if the hash values match, compare the individual characters

p : Hash Value of pattern
t : Hash Value of current window of text

### Rolling Hash

$t_{i+1} = t_i + txt[i + m] - txt[i]$

m = length of pattern

### Improved hash

h("abc") = 1 * d^2 + 2 * d^1 + 3 * d^0 = 1 * 5^2 + 2 * 5^1 + 3 * 5^0 = 38

h("dab") = 4 * d^2 + 1 * d^1 + 2 * d^0 = 4 * 5^2 + 1 * 5^1 + 2 * 5^0 = 107

### Rolling

$t_{i+1} = d \cdot (t_i - txt[i] \cdot d^{m-1}) + txt[i + m]$

m -> length of pattern

In [None]:
d = 256  # radix of the polynomial hash: each character code is one "digit"

def RKSearch(pat, txt, q):
    """Print, space-separated, every index at which pat occurs in txt.

    Rabin-Karp search: a rolling hash of each text window is compared with
    the pattern's hash, and characters are compared only when they agree.

    Args:
        pat: pattern to look for.
        txt: text to search in.
        q: modulus applied to all hash values.

    A pattern longer than the text cannot occur, so nothing is printed.
    """
    m, n = len(pat), len(txt)
    if m > n:
        return

    # h = d^(m-1) mod q, the weight of the window's leading character.
    h = 1
    for _ in range(m - 1):
        h = (h * d) % q

    # Hash of the pattern (p) and of the first text window (t).
    p, t = 0, 0
    for i in range(m):
        p = (p * d + ord(pat[i])) % q
        t = (t * d + ord(txt[i])) % q

    for i in range(n - m + 1):
        # Equal hashes only suggest a match; confirm character by character.
        if p == t and all(txt[i + j] == pat[j] for j in range(m)):
            print(i, end=" ")

        if i < n - m:
            # Drop txt[i], shift by one digit, append txt[i + m]. Python's %
            # is already non-negative for a positive q, so no sign fix-up.
            t = (d * (t - ord(txt[i]) * h) + ord(txt[i + m])) % q

# Demo: search the sample sentence using 101 as the hash modulus.
text, pat, q = "geeks for geeks", "geeks", 101
RKSearch(pat=pat, txt=text, q=q)

# Time Complexity : O((n-m+1) * m)

0 10 

# KMP Algorithm (Constructing Longest Prefix Suffix Array)

Proper prefixes of 'abc' : " ", "a", "ab"

Proper suffixes of 'abc' : " ", "c", "bc"

I/P : str = "ababc"

O/P : lps[] = [0, 0, 1, 2, 0]

I/P : str = "aaaa"

O/P : lps[] = [0, 1, 2, 3]

I/P : str = "abacabad"

O/P : lps[] = [0, 0, 1, 0, 1, 2, 3, 0]


In [None]:
def longProperPrefixSuffix(str1, n):
  """Return the length of the longest proper prefix of str1[:n] that is also a suffix of it.

  Candidate lengths are tried from n - 1 downwards, so the first one that
  matches is the longest.
  """
  for size in range(n - 1, -1, -1):
    tail = n - size  # index at which the candidate suffix starts
    if all(str1[k] == str1[tail + k] for k in range(size)):
      return size
  return 0

def fillLps(str1, lps):
  """Fill lps in place so that lps[i] is the longest proper prefix-suffix length of str1[:i + 1].

  Each entry is computed independently by longProperPrefixSuffix.
  """
  lps[0] = 0
  for end in range(2, len(str1) + 1):
    lps[end - 1] = longProperPrefixSuffix(str1, end)

# Demo: build and show the LPS table of "ababc".
str1 = "ababc"
lps = list(range(len(str1)))  # placeholder values; fillLps overwrites every slot
fillLps(str1=str1, lps=lps)
print(lps)

# Time Complexity : O(n^2)
# Space Complexity : O(n)

[0, 0, 1, 2, 0]


In [None]:
# Efficient Solution

#     If len = lps[i-1] and str[len] and str[i] are the same, then lps[i] = len + 1
#     If str[i] and str[len] are not the same
#          a) if len == 0, then lps[i] = 0
#          b) else, we recursively apply lps[]
#                         len = lps[len-1]
#                   then compare str[i] with str[len]

In [2]:
def fillLps(str1, lps):
    """Fill lps in place with the longest-proper-prefix-suffix table of str1.

    After the call, lps[i] is the length of the longest proper prefix of
    str1[:i + 1] that is also a suffix of it.

    Args:
        str1: the string to analyse.
        lps: pre-allocated list with at least len(str1) slots.

    An empty str1 leaves lps untouched.
    """
    n = len(str1)
    if n == 0:
        return  # nothing to fill; also avoids indexing an empty lps

    lps[0] = 0
    i = 1
    length = 0  # prefix-suffix length established for position i - 1

    while i < n:
        if str1[i] == str1[length]:
            # The current prefix-suffix extends by one character.
            length += 1
            lps[i] = length
            i += 1
        elif length == 0:
            lps[i] = 0
            i += 1
        else:
            # Fall back to the next shorter prefix-suffix and retry the same i.
            length = lps[length - 1]

# Demo: build and show the LPS table of "ababc".
str1 = "ababc"
lps = [0 for _ in str1]
fillLps(str1=str1, lps=lps)
print(lps)

# Time Complexity : O(n)
# Space Complexity : O(n)

[0, 0, 1, 2, 0]


# KMP String Matching

I/P : txt = "abcdefg" ; pat = "bcd"

O/P : 1

I/P : txt = "aaaaab" ; pat = "aaaa"

O/P : 0 1

In [3]:
# Naive Solution

def naiveStringMatching(txt, pat):
    """Print, space-separated, every index at which pat occurs in txt.

    Every alignment of pat against txt is checked character by character.
    """
    last_start = len(txt) - len(pat)

    for start in range(last_start + 1):
        for offset in range(len(pat)):
            if txt[start + offset] != pat[offset]:
                break
        else:
            # no mismatch: the whole pattern matched at this alignment
            print(start, end=" ")

# Demo: run both sample searches; the bare print() ends the first output line.
txt, pat = "abcdefg", "bcd"
naiveStringMatching(txt, pat)
print()

txt, pat = "aaaaab", "aaaa"
naiveStringMatching(txt, pat)

# Time Complexity : O((n-m+1)*m)
# Space Complexity : O(1)

1 
0 1 

In [4]:
# Rabin Karp Algorithm

d = 256  # radix of the polynomial hash: each character code is one "digit"

def rabinKarp(txt, pat, q):
    """Return the list of indices at which pat occurs in txt (Rabin-Karp).

    Args:
        txt: text to search in.
        pat: pattern to look for.
        q: modulus applied to all hash values.

    Returns:
        Start indices of all occurrences in increasing order; an empty list
        when there are none, including when pat is longer than txt.
    """
    n = len(txt)
    m = len(pat)
    results = []
    if m > n:
        return results

    # h = d^(m-1) mod q, the weight of the window's leading character.
    h = 1
    for _ in range(m - 1):
        h = (h * d) % q

    # Hash of the pattern and of the first text window.
    p_hash = 0
    t_hash = 0
    for i in range(m):
        p_hash = (d * p_hash + ord(pat[i])) % q
        t_hash = (d * t_hash + ord(txt[i])) % q

    for i in range(n - m + 1):
        # Equal hashes only suggest a match; confirm character by character.
        if p_hash == t_hash and all(txt[i + j] == pat[j] for j in range(m)):
            results.append(i)

        if i < n - m:
            # Drop txt[i], shift by one digit, append txt[i + m]. Python's %
            # is already non-negative for a positive q, so no sign fix-up.
            t_hash = (d * (t_hash - ord(txt[i]) * h) + ord(txt[i + m])) % q

    return results

# Demo: run both sample searches with the same hash modulus.
q = 101
for txt, pat in (("abcdefg", "bcd"), ("aaaaab", "aaaa")):
    result = rabinKarp(txt, pat, q)
    print(result)

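# Time Complexity : O(n + m) on average ; O((n-m+1) * m) in the worst case (every window's hash equals the pattern's)
# Space Complexity : O(1) apart from the returned list of matches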

[1]
[0, 1]


In [5]:
# KMP : Knuth-Morris-Pratt

def fillLPS(pat, lps):
    """Fill lps in place with the longest-proper-prefix-suffix table of pat.

    lps[k] becomes the length of the longest proper prefix of pat[:k + 1]
    that is also a suffix of it. lps must already have len(pat) slots.
    """
    size = len(pat)
    border = 0  # prefix-suffix length carried over from the previous position
    lps[0] = 0
    pos = 1

    while pos < size:
        if pat[pos] == pat[border]:
            border += 1
        elif border:
            # retry the same position with the next shorter prefix-suffix
            border = lps[border - 1]
            continue
        # reached on a match, or on a mismatch with nothing left to fall back to
        lps[pos] = border
        pos += 1

def KMP(pat, txt):
    """Print "Pattern found at index k" for every occurrence of pat in txt.

    The LPS table from fillLPS tells how far the pattern can stay aligned
    after a mismatch, so the position in txt never moves backwards.
    """
    text_len = len(txt)
    pat_len = len(pat)

    lps = [0] * pat_len
    fillLPS(pat, lps)

    t = 0  # current position in txt
    p = 0  # current position in pat

    while t < text_len:
        if pat[p] == txt[t]:
            t += 1
            p += 1
            if p == pat_len:
                print("Pattern found at index", t - p)
                # keep the longest prefix-suffix so overlapping matches are found
                p = lps[p - 1]
        elif p != 0:
            # mismatch after some matches: reuse the matched prefix-suffix
            p = lps[p - 1]
        else:
            t += 1

# Demo: the text contains overlapping occurrences of the pattern.
txt, pat = "aaaaabaaaaac", "aaaa"
KMP(pat=pat, txt=txt)

# Time Complexity : O(n + m)

Pattern found at index 0
Pattern found at index 1
Pattern found at index 6
Pattern found at index 7


# Anagram Search

I/P : txt = "geeksforgeeks" ; pat = "forg"

O/P : Yes

I/P : txt = "geeksforgeeks" ; pat = "rseek"

O/P : No

In [7]:
# Naive Solution

CHAR = 256

def areAnagram(pat, txt, i):
  """Return True if the window of txt starting at i, len(pat) long, is a rearrangement of pat.

  Character codes index a CHAR-sized table, so every character involved
  must have a code below CHAR.
  """
  balance = [0] * CHAR

  for offset, ch in enumerate(pat):
    balance[ord(ch)] += 1
    balance[ord(txt[i + offset])] -= 1

  # every entry is zero exactly when both sides use the same characters
  return not any(balance)

def ispresent(txt, pat):
  """Return True if some len(pat)-long window of txt is an anagram of pat.

  Every window start is checked from scratch with areAnagram.
  """
  window_starts = range(len(txt) - len(pat) + 1)
  return any(areAnagram(pat, txt, start) for start in window_starts)

# Demo: does any window of the text rearrange into the pattern?
txt, pat = "geeksforgeeks", "forg"

ispresent(txt=txt, pat=pat)

# Time Complexity : O((n-m+1)*m)
# Space Complexity : O(1)

True

In [9]:
# Efficient Solution

CHAR = 256  # size of the per-character count tables (codes 0..CHAR-1)

def areAnagram(CT, CP):
  """Return True when the two count tables agree on all CHAR entries."""
  return all(CT[i] == CP[i] for i in range(CHAR))

def isPresent(txt, pat):
  """Return True if some len(pat)-long window of txt is an anagram of pat.

  A count table for the current window (CT) is slid across txt and compared
  with the pattern's table (CP), so each step costs O(CHAR) instead of
  recounting the window.

  Characters must have codes below CHAR. A pattern longer than the text
  yields False.
  """
  n = len(txt)
  m = len(pat)
  if m > n:
    return False  # no window of txt is long enough

  CT = [0] * CHAR
  CP = [0] * CHAR

  # Counts for the pattern and for the first window txt[0:m].
  for i in range(m):
    CT[ord(txt[i])] += 1
    CP[ord(pat[i])] += 1

  for i in range(m, n):
    if areAnagram(CT, CP):
      return True

    # Slide the window: take in txt[i], drop txt[i - m].
    CT[ord(txt[i])] += 1
    CT[ord(txt[i - m])] -= 1

  # Each window is compared before sliding, so the window produced by the
  # final slide (or the only window when n == m) still needs its check.
  return areAnagram(CT, CP)

# Demo: does any window of the text rearrange into the pattern?
txt, pat = "geeksforgeeks", "forg"

isPresent(txt=txt, pat=pat)

# Time Complexity : O(n * CHAR)
# Space Complexity : O(CHAR)

True

# Lexicographic rank of a string

    String   Rank

    ABC       1
    ACB       2
    BAC       3
    BCA       4
    CAB       5
    CBA       6

I/P : str = "ABC"

O/P : 1

I/P : str = "BAC"

O/P : 3


### Naive Solution

Pen & Paper Method : O(n * n!)

In [11]:
# Efficient Solution

CHAR = 256

def fact(n):
    """Return n! for an integer n; any n below 2 gives 1."""
    product = 1
    for factor in range(n, 1, -1):
        product *= factor
    return product

def lesRank(str1):
  """Return the 1-based lexicographic rank of str1 among the permutations of its characters.

  For each position, the number of still-unused characters smaller than the
  current one is multiplied by the number of arrangements of the remaining
  positions.

  NOTE(review): there is no correction for repeated characters, so this
  presumably expects all characters of str1 to be distinct - confirm.
  """
  size = len(str1)
  perms = fact(size)
  cumulative = [0] * CHAR

  for ch in str1:
    cumulative[ord(ch)] += 1

  # prefix sums: cumulative[c] = number of unused characters with code <= c
  for code in range(1, CHAR):
    cumulative[code] += cumulative[code - 1]

  rank = 1
  for pos in range(size - 1):
    perms //= size - pos  # arrangements of the positions after pos
    code = ord(str1[pos])

    rank += cumulative[code - 1] * perms

    # the current character is now used: remove it from every count that included it
    for higher in range(code, CHAR):
      cumulative[higher] -= 1

  return rank

# Demo: rank of "BAC" among the permutations of its letters.
str1 = "BAC"
lesRank(str1=str1)

# Time Complexity : O(n * CHAR) = O(n)

3

# Longest SubString with Distinct Characters

I/P : str = "abcdabc"

O/P : 4

I/P : str = "aaa"

O/P : 1

In [12]:
# Naive Solution

def areDistinct(str1, i, j):
  """Return True if the characters str1[i] .. str1[j] (inclusive) are all different.

  A set of seen characters is used instead of a fixed 256-slot table, so
  characters with any code point are handled.
  """
  seen = set()

  for k in range(i, j + 1):
    ch = str1[k]
    if ch in seen:
      return False

    seen.add(ch)

  return True

def longestDistinct(str1):
  """Return the length of the longest substring of str1 whose characters are all distinct.

  Brute force: every (start, stop) pair is tested with areDistinct.
  """
  size = len(str1)
  best = 0

  for start in range(size):
    for stop in range(start, size):
      if not areDistinct(str1, start, stop):
        continue
      span = stop - start + 1
      if span > best:
        best = span

  return best

# Demo: the longest run of distinct characters in the sample string.
str1 = "abcdabc"
longestDistinct(str1=str1)

# Time Complexity : O(n^3)

4

In [14]:
# Better Solution

def longestDistinct(str1):
  """Return the length of the longest substring of str1 whose characters are all distinct.

  For every start index the substring is extended until a character
  repeats. A set of seen characters is used instead of a fixed 256-slot
  table, so characters with any code point are handled.
  """
  n = len(str1)
  res = 0

  for i in range(n):
    seen = set()

    for j in range(i, n):
      if str1[j] in seen:
        break  # extending further from i can only keep the duplicate

      seen.add(str1[j])
      res = max(res, j - i + 1)

  return res

# Demo input, bound to str1 - the name the call below actually reads.
str1 = "abcdabc"
longestDistinct(str1)

# Time Complexity : O(n^2)

4

In [15]:
# Efficient Solution

def longestDistinct(str1):
  """Return the length of the longest substring of str1 whose characters are all distinct.

  Single pass: for each end index j, i is the earliest start for which
  str1[i .. j] has no repeated character. The last position of every
  character is kept in a dict instead of a fixed 256-slot table, so
  characters with any code point are handled.
  """
  res = 0
  prev = {}  # character -> index of its most recent occurrence
  i = 0

  for j, ch in enumerate(str1):
    # If ch was seen inside the current window, start just after that occurrence.
    i = max(i, prev.get(ch, -1) + 1)
    res = max(res, j - i + 1)
    prev[ch] = j

  return res

# Demo input, bound to str1 - the name the call below actually reads.
str1 = "abcdabc"
longestDistinct(str1)

# Time Complexity : O(n)

4