Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 226 additions & 74 deletions strings/rabin_karp.py
Original file line number Diff line number Diff line change
@@ -1,91 +1,243 @@
"""
Rabin-Karp String Matching Algorithm

The Rabin-Karp algorithm uses hashing to find patterns in text.
It employs a rolling hash technique for efficient pattern searching.

Time Complexity:
    - Average case: O(n + m) where n is text length, m is pattern length
    - Worst case: O(nm) when many spurious hits occur

Space Complexity: O(1) for single pattern, O(k) for k patterns

Applications:
    - Plagiarism detection
    - DNA sequence matching
    - Multiple pattern searching
    - Finding duplicate content
"""

# Size of the alphabet; rabin_karp() uses it as the base of its polynomial hash
alphabet_size = 256
# Modulus applied to every hash value computed by rabin_karp()
modulus = 1000003


def rabin_karp(pattern: str, text: str) -> bool:
    """
    The Rabin-Karp Algorithm for finding a pattern within a piece of text
    with complexity O(nm), most efficient when it is used with multiple patterns
    as it is able to check if any of a set of patterns match a section of text in
    O(1) given the precomputed hashes.

    This is the simple version, which assumes only one pattern is being searched
    for, but it is not hard to modify.

    1) Calculate pattern hash

    2) Step through the text one character at a time passing a window with the
       same length as the pattern, calculating the hash of the text within the
       window and comparing it with the hash of the pattern. Equality of the
       actual characters is only tested if the hashes match.

    Args:
        pattern: The pattern to search for
        text: The text to search in

    Returns:
        True if ``pattern`` occurs in ``text`` (an empty pattern always does),
        otherwise False

    Examples:
        >>> rabin_karp("abc1abc12", "alskfjaldsabc1abc1abc12k23adsfabcabc")
        True
        >>> rabin_karp("abc1abc12", "alskfjaldsk23adsfabcabc")
        False
        >>> rabin_karp("", "abc")
        True
    """
    p_len = len(pattern)
    t_len = len(text)
    if p_len > t_len:
        return False

    # Hash the pattern and the first window of the text in lockstep;
    # zip() stops after p_len characters because p_len <= t_len here.
    p_hash = 0
    text_hash = 0
    for pattern_char, text_char in zip(pattern, text):
        p_hash = (ord(pattern_char) + p_hash * alphabet_size) % modulus
        text_hash = (ord(text_char) + text_hash * alphabet_size) % modulus

    # Weight of the window's leading character: alphabet_size ** (p_len - 1).
    # max() keeps the exponent valid for the empty pattern.
    modulus_power = pow(alphabet_size, max(p_len - 1, 0), modulus)

    last_start = t_len - p_len
    for i in range(last_start + 1):
        # Compare the characters only on a hash hit, to rule out collisions.
        if text_hash == p_hash and text[i : i + p_len] == pattern:
            return True
        if i == last_start:
            continue
        # Calculate the https://en.wikipedia.org/wiki/Rolling_hash
        text_hash = (
            (text_hash - ord(text[i]) * modulus_power) * alphabet_size
            + ord(text[i + p_len])
        ) % modulus
    return False


def rabin_karp_search(
    text: str, pattern: str, base: int = 256, modulus: int = 101
) -> list[int]:
    """
    Search for a pattern in text using Rabin-Karp algorithm.

    Args:
        text: The text to search in
        pattern: The pattern to search for
        base: The base for hash calculation (default: 256 for ASCII)
        modulus: The modulus for hash calculation (must be non-zero; a prime
            number is recommended, and a larger one means fewer collisions)

    Returns:
        List of starting indices where pattern is found, in ascending order.
        Overlapping occurrences are all reported. An empty pattern or an
        empty text yields an empty list.

    Examples:
        >>> rabin_karp_search("hello world hello", "hello")
        [0, 12]
        >>> rabin_karp_search("aaaa", "aa")
        [0, 1, 2]
        >>> rabin_karp_search("abc", "xyz")
        []
        >>> rabin_karp_search("", "a")
        []
        >>> rabin_karp_search("a", "")
        []
        >>> rabin_karp_search("abcdefg", "cde")
        [2]
        >>> rabin_karp_search("ABABDABACDABABCABAB", "ABABCABAB")
        [10]
        >>> rabin_karp_search("test test test", "test")
        [0, 5, 10]
    """
    if not pattern or not text or len(pattern) > len(text):
        return []

    n = len(text)
    m = len(pattern)
    matches: list[int] = []

    # Weight of the window's leading character: base ** (m - 1) mod modulus.
    h = pow(base, m - 1, modulus)

    # Hash the pattern and the first window of the text in lockstep;
    # zip() stops after m characters because m <= n here.
    pattern_hash = 0
    text_hash = 0
    for pattern_char, text_char in zip(pattern, text):
        pattern_hash = (base * pattern_hash + ord(pattern_char)) % modulus
        text_hash = (base * text_hash + ord(text_char)) % modulus

    last_start = n - m
    for i in range(last_start + 1):
        # Compare the characters only on a hash hit, to rule out collisions.
        if pattern_hash == text_hash and text.startswith(pattern, i):
            matches.append(i)

        if i < last_start:
            # Roll the window: drop text[i], append text[i + m]. Python's %
            # already returns the canonical residue, so no sign fix-up is
            # needed afterwards.
            text_hash = (
                base * (text_hash - ord(text[i]) * h) + ord(text[i + m])
            ) % modulus

    return matches


def test_rabin_karp() -> None:
def rabin_karp_multiple(
text: str, patterns: list[str], base: int = 256, modulus: int = 101
) -> dict[str, list[int]]:
"""
>>> test_rabin_karp()
Success.
Search for multiple patterns in text using Rabin-Karp algorithm.

This is more efficient than running single pattern search multiple times
because we only scan the text once.

Args:
text: The text to search in
patterns: List of patterns to search for
base: The base for hash calculation
modulus: The modulus for hash calculation

Returns:
Dictionary mapping each pattern to list of indices where found

Examples:
>>> result = rabin_karp_multiple("hello world hello", ["hello", "world"])
>>> result == {"hello": [0, 12], "world": [6]}
True
>>> result = rabin_karp_multiple("aaaa", ["aa", "aaa"])
>>> result == {"aa": [0, 1, 2], "aaa": [0, 1]}
True
>>> result = rabin_karp_multiple("test", ["abc", "xyz"])
>>> result == {"abc": [], "xyz": []}
True
>>> result = rabin_karp_multiple("", ["a", "b"])
>>> result == {"a": [], "b": []}
True
>>> result = rabin_karp_multiple("abcdef", ["ab", "cd", "ef"])
>>> result == {"ab": [0], "cd": [2], "ef": [4]}
True
"""
# Test 1)
pattern = "abc1abc12"
text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
text2 = "alskfjaldsk23adsfabcabc"
assert rabin_karp(pattern, text1)
assert not rabin_karp(pattern, text2)

# Test 2)
pattern = "ABABX"
text = "ABABZABABYABABX"
assert rabin_karp(pattern, text)

# Test 3)
pattern = "AAAB"
text = "ABAAAAAB"
assert rabin_karp(pattern, text)

# Test 4)
pattern = "abcdabcy"
text = "abcxabcdabxabcdabcdabcy"
assert rabin_karp(pattern, text)

# Test 5)
pattern = "Lü"
text = "Lüsai"
assert rabin_karp(pattern, text)
pattern = "Lue"
assert not rabin_karp(pattern, text)
print("Success.")
if not text or not patterns:
return {pattern: [] for pattern in patterns}

# Group patterns by length for efficient processing
patterns_by_length: dict[int, list[str]] = {}
for pattern in patterns:
if pattern: # Skip empty patterns
length = len(pattern)
if length not in patterns_by_length:
patterns_by_length[length] = []
patterns_by_length[length].append(pattern)

results: dict[str, list[int]] = {pattern: [] for pattern in patterns}

# Process each group of patterns with same length
for pattern_length, pattern_group in patterns_by_length.items():
if pattern_length > len(text):
continue

# Calculate pattern hashes
pattern_hashes = {}
for pattern in pattern_group:
pattern_hash = 0
for char in pattern:
pattern_hash = (base * pattern_hash + ord(char)) % modulus
pattern_hashes[pattern] = pattern_hash

# Calculate hash for first window
text_hash = 0
h = 1
for _ in range(pattern_length - 1):
h = (h * base) % modulus

for i in range(pattern_length):
text_hash = (base * text_hash + ord(text[i])) % modulus

# Slide the window over text
for i in range(len(text) - pattern_length + 1):
# Check if current hash matches any pattern hash
for pattern, pattern_hash in pattern_hashes.items():
# Verify to avoid spurious hits
if (
text_hash == pattern_hash
and text[i : i + pattern_length] == pattern
):
results[pattern].append(i)

# Calculate hash for next window
if i < len(text) - pattern_length:
text_hash = (
base * (text_hash - ord(text[i]) * h)
+ ord(text[i + pattern_length])
) % modulus

if text_hash < 0:
text_hash += modulus

return results


def rabin_karp_search_optimized(
    text: str, pattern: str, base: int = 256, modulus: int = 1_000_000_007
) -> list[int]:
    """
    Run ``rabin_karp_search`` with a much larger default modulus.

    The search itself is delegated unchanged; only the default modulus
    differs. A large prime modulus (10^9 + 7) makes hash collisions far less
    likely, so fewer windows need a character-by-character comparison.

    Args:
        text: The text to search in
        pattern: The pattern to search for
        base: The base for hash calculation
        modulus: Large prime modulus (default: 10^9 + 7)

    Returns:
        List of starting indices where pattern is found

    Examples:
        >>> rabin_karp_search_optimized("hello world", "world")
        [6]
        >>> rabin_karp_search_optimized("aaabaaaa", "aaaa")
        [4]
        >>> rabin_karp_search_optimized("abc", "d")
        []
    """
    return rabin_karp_search(
        text=text, pattern=pattern, base=base, modulus=modulus
    )


if __name__ == "__main__":
    # Legacy self-test for rabin_karp(); prints "Success." when it passes.
    test_rabin_karp()
    import doctest

    # Keep the outcome so the final message reflects what actually happened.
    doctest_results = doctest.testmod()

    # Performance demonstration
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" * 100
    pattern = "consectetur"

    print("Rabin-Karp String Matching Algorithm Demo")
    print("=" * 50)

    # Single pattern search
    matches = rabin_karp_search(text, pattern)
    print(f"\nSearching for '{pattern}' in text ({len(text)} chars)")
    print(f"Found {len(matches)} matches at indices: {matches[:5]}...")

    # Multiple pattern search
    patterns = ["Lorem", "ipsum", "consectetur", "adipiscing"]
    results = rabin_karp_multiple(text, patterns)
    print(f"\nSearching for {len(patterns)} patterns:")
    for p, indices in results.items():
        print(f"  '{p}': {len(indices)} matches")

    # Only claim success when no doctest failed.
    if doctest_results.failed:
        print(
            f"\n✗ {doctest_results.failed} of "
            f"{doctest_results.attempted} doctests failed."
        )
    else:
        print("\n✓ All tests passed!")