From 7ff7662e3866daa33f3924269db71bce6fe80a99 Mon Sep 17 00:00:00 2001 From: Yanick Witschi Date: Fri, 29 Sep 2023 15:19:44 +0200 Subject: [PATCH] Version 2 (#1) * Added an Utf8Alphabet implementation * Completely revamped implementation. Dropped hierarchical logic to reduce storage requirements * Added tests for the Levenshtein implementation * Fixed Utf8Alphabet * Added support for Damerau-Levenshtein * Fixed algorithm * Removed Damerau-Levenshtein attempt * CS --- README.md | 23 ++-- ecs.php | 9 +- src/Alphabet/InMemoryAlphabet.php | 18 +-- src/Alphabet/Utf8Alphabet.php | 21 ++++ src/Config.php | 8 +- src/DataStore/DataStoreInterface.php | 16 +++ src/DataStore/InMemoryDataStore.php | 30 +++++ src/DataStore/NullDataStore.php | 16 +++ src/Levenshtein.php | 2 +- src/StateSet/CostAnnotatedStateSet.php | 34 ++--- src/StateSet/InMemoryStateSet.php | 70 ++--------- src/StateSet/StateSetInterface.php | 18 +-- src/StateSetIndex.php | 168 ++++++++++++++----------- tests/Alphabet/Utf8AlphabetTest.php | 22 ++++ tests/LevenshteinTest.php | 18 +++ tests/StateSetIndexTest.php | 27 +++- 16 files changed, 309 insertions(+), 191 deletions(-) create mode 100644 src/Alphabet/Utf8Alphabet.php create mode 100644 src/DataStore/DataStoreInterface.php create mode 100644 src/DataStore/InMemoryDataStore.php create mode 100644 src/DataStore/NullDataStore.php create mode 100644 tests/Alphabet/Utf8AlphabetTest.php create mode 100644 tests/LevenshteinTest.php diff --git a/README.md b/README.md index fdff120..8dba035 100644 --- a/README.md +++ b/README.md @@ -21,14 +21,16 @@ composer require toflar/state-set-index ```php namespace App; -use Toflar\StateSetIndex\Alphabet\InMemoryAlphabet; +use Toflar\StateSetIndex\Alphabet\Utf8Alphabet; +use Toflar\StateSetIndex\DataStore\InMemoryDataStore; use Toflar\StateSetIndex\StateSet\InMemoryStateSet; use Toflar\StateSetIndex\StateSetIndex; $stateSetIndex = new StateSetIndex( new Config(6, 4), - new InMemoryAlphabet(), - new InMemoryStateSet() + new
Utf8Alphabet(), + new InMemoryStateSet(), + new InMemoryDataStore() ); $stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']); @@ -44,15 +46,20 @@ you want to index and or search. ## Customization This library ships with the algorithm readily prepared for you to use. The main customization areas will be -the alphabet (both the way it maps characters to labels) as well as the state set storage, if you want to make the index +the alphabet (both the way it maps characters to labels) and the state set storage, if you want to make the index persistent. Hence, there are two interfaces that allow you to implement your own logic: * The `AlphabetInterface` is very straight-forward. It only consists of a `map(string $char, int $alphabetSize)` method which the library needs to map characters to an internal label. Whether you load/store the alphabet in some - database is up to you. The library ships with an `InMemoryAlphabet` for reference and simple use cases. -* The `StateSetInterface` is more complex but is essentially responsible to load and store information about the - state set of your index. Again, whether you load/store the state set in some - database is up to you. The library ships with an `InMemoryStateSet` for reference and simple use cases. + database is up to you. The library ships with an `InMemoryAlphabet` for reference and simple use cases. You don't + even need to store the alphabet as we already have one with the UTF-8 codepoints; that's what `Utf8Alphabet` is + for. In case you don't want to customize the labels, use `Utf8Alphabet`. +* The `StateSetInterface` is responsible for loading and storing information about the state set of your index. Again, + how you load/store the state set in some database is up to you. The library ships with an `InMemoryStateSet` + for reference and simple use cases and tests. +* The `DataStoreInterface` is responsible for storing the string you index alongside its assigned state.
Sometimes + you want to completely customize storage in which case you can use the `NullDataStore` and only use the + assignments you get as a return value from calling `$stateSetIndex->index()`. You can not only ask for the final matching results using `$stateSetIndex->findMatchingStates('Mustre', 2)` which is already filtered using a multibyte implementation of the Levenshtein algorithm, but you can also access intermediary diff --git a/ecs.php b/ecs.php index 68658ce..b38c36e 100644 --- a/ecs.php +++ b/ecs.php @@ -2,8 +2,9 @@ declare(strict_types=1); -use PhpCsFixer\Fixer\Import\NoUnusedImportsFixer; -use PhpCsFixer\Fixer\Phpdoc\NoSuperfluousPhpdocTagsFixer; +use PhpCsFixer\Fixer\ClassNotation\OrderedClassElementsFixer; +use PhpCsFixer\Fixer\FunctionNotation\NativeFunctionInvocationFixer; +use PhpCsFixer\Fixer\Operator\NotOperatorWithSuccessorSpaceFixer; use Symplify\EasyCodingStandard\Config\ECSConfig; use Symplify\EasyCodingStandard\ValueObject\Set\SetList; @@ -23,5 +24,7 @@ ]); // Always move private elements to the bottom - $ecsConfig->rule(\PhpCsFixer\Fixer\ClassNotation\OrderedClassElementsFixer::class); + $ecsConfig->ruleWithConfiguration(OrderedClassElementsFixer::class, ['sort_algorithm' => 'alpha']); + $ecsConfig->rule(NativeFunctionInvocationFixer::class); + $ecsConfig->skip([NotOperatorWithSuccessorSpaceFixer::class]); }; diff --git a/src/Alphabet/InMemoryAlphabet.php b/src/Alphabet/InMemoryAlphabet.php index 9e63c31..9ada81f 100644 --- a/src/Alphabet/InMemoryAlphabet.php +++ b/src/Alphabet/InMemoryAlphabet.php @@ -14,19 +14,21 @@ public function __construct( ) { } - public function all(): array + public function add(string $char, int $label): self { - return $this->alphabet; + $this->alphabet[$char] = $label; + + return $this; } - public function has(string $char): bool + public function all(): array { - return isset($this->alphabet[$char]); + return $this->alphabet; } public function count(): int { - return count($this->alphabet); + return 
\count($this->alphabet); } public function get(string $char): ?int @@ -34,11 +36,9 @@ public function get(string $char): ?int return $this->alphabet[$char] ?? null; } - public function add(string $char, int $label): self + public function has(string $char): bool { - $this->alphabet[$char] = $label; - - return $this; + return isset($this->alphabet[$char]); } public function map(string $char, int $alphabetSize): int diff --git a/src/Alphabet/Utf8Alphabet.php b/src/Alphabet/Utf8Alphabet.php new file mode 100644 index 0000000..a005e22 --- /dev/null +++ b/src/Alphabet/Utf8Alphabet.php @@ -0,0 +1,21 @@ +> + */ + private array $cache = []; + + public function map(string $char, int $alphabetSize): int + { + if (!isset($this->cache[$alphabetSize][$char])) { + // +1 in order to never assign 0 + $this->cache[$alphabetSize][$char] = (mb_ord($char, 'UTF-8') % $alphabetSize) + 1; + } + + return $this->cache[$alphabetSize][$char]; + } +} diff --git a/src/Config.php b/src/Config.php index f84f65e..e870674 100644 --- a/src/Config.php +++ b/src/Config.php @@ -10,13 +10,13 @@ public function __construct( ) { } - public function getIndexLength(): int + public function getAlphabetSize(): int { - return $this->indexLength; + return $this->alphabetSize; } - public function getAlphabetSize(): int + public function getIndexLength(): int { - return $this->alphabetSize; + return $this->indexLength; } } diff --git a/src/DataStore/DataStoreInterface.php b/src/DataStore/DataStoreInterface.php new file mode 100644 index 0000000..33be25d --- /dev/null +++ b/src/DataStore/DataStoreInterface.php @@ -0,0 +1,16 @@ +> + */ + public function getForStates(array $states = []): array; +} diff --git a/src/DataStore/InMemoryDataStore.php b/src/DataStore/InMemoryDataStore.php new file mode 100644 index 0000000..8454c04 --- /dev/null +++ b/src/DataStore/InMemoryDataStore.php @@ -0,0 +1,30 @@ +> + */ + private array $data = []; + + public function add(int $state, string $string): void + { + 
$this->data[$state][] = $string; + } + + public function all(): array + { + return $this->data; + } + + public function getForStates(array $states = []): array + { + if ([] === $states) { + return $this->data; + } + + return array_intersect_key($this->data, array_flip($states)); + } +} diff --git a/src/DataStore/NullDataStore.php b/src/DataStore/NullDataStore.php new file mode 100644 index 0000000..677546b --- /dev/null +++ b/src/DataStore/NullDataStore.php @@ -0,0 +1,16 @@ + - */ - public function all(): array - { - return $this->set; - } - - public function states(): array - { - return array_values(array_keys($this->set)); - } - /** * Adds a state with a cost to this set. * If this sets already contains the given state with a higher cost, replaces it. */ public function add(int $state, int $cost): void { - if (! isset($this->set[$state])) { + if (!isset($this->set[$state])) { $this->set[$state] = $cost; return; } @@ -43,6 +28,16 @@ public function add(int $state, int $cost): void } } + /** + * Key: State + * Value: Cost + * @return array + */ + public function all(): array + { + return $this->set; + } + public function mergeWith(CostAnnotatedStateSet $stateSet): self { $clone = clone $this; @@ -53,4 +48,11 @@ public function mergeWith(CostAnnotatedStateSet $stateSet): self return $clone; } + + public function states(): array + { + $states = array_values(array_keys($this->set)); + sort($states); + return $states; + } } diff --git a/src/StateSet/InMemoryStateSet.php b/src/StateSet/InMemoryStateSet.php index 6ff2b28..8b9d6ec 100644 --- a/src/StateSet/InMemoryStateSet.php +++ b/src/StateSet/InMemoryStateSet.php @@ -5,39 +5,16 @@ class InMemoryStateSet implements StateSetInterface { /** - * Key: State - * Value: array - * - * @var array> + * @param array $states array */ - private array $states = []; - - /** - * @var array> - */ - private array $children = []; - - /** - * Key: State - * Value: Mapped char - * @var array - */ - private array $mappedChars = []; - - /** 
- * Key: State - * Value: Matching strings - * @var array> - */ - private array $acceptedStrings = []; + public function __construct( + private array $states = [] + ) { + } - public function add(int $state, int $parentState, int $mappedChar): self + public function add(int $state): void { - $this->states[$state] = [$parentState, $mappedChar]; - $this->mappedChars[$state] = $mappedChar; - $this->children[$parentState][$state] = true; - - return $this; + $this->states[$state] = true; } public function all(): array @@ -45,37 +22,8 @@ public function all(): array return $this->states; } - public function getChildrenOfState(int $state): array - { - if (! isset($this->children[$state])) { - return []; - } - - return array_keys($this->children[$state]); - } - - public function getCharForState(int $state): int - { - if (! isset($this->mappedChars[$state])) { - throw new \LogicException('No mapped char for state. Check your alphabet!'); - } - - return $this->mappedChars[$state]; - } - - public function acceptString(int $state, string $string): self - { - $this->acceptedStrings[$state][] = $string; - - return $this; - } - - public function getAcceptedStrings(array $matchingStates = []): array + public function has(int $state): bool { - if ([] === $matchingStates) { - return $this->acceptedStrings; - } - - return array_intersect_key($this->acceptedStrings, array_flip($matchingStates)); + return isset($this->states[$state]); } } diff --git a/src/StateSet/StateSetInterface.php b/src/StateSet/StateSetInterface.php index 7697293..7627351 100644 --- a/src/StateSet/StateSetInterface.php +++ b/src/StateSet/StateSetInterface.php @@ -4,22 +4,12 @@ interface StateSetInterface { - public function add(int $state, int $parentState, int $mappedChar): self; - - public function getChildrenOfState(int $state): array; - - public function getCharForState(int $state): int; + public function add(int $state): void; /** - * Accept a string with a given state. 
+ * @return array */ - public function acceptString(int $state, string $string): self; + public function all(): array; - /** - * Returns the matching strings per state. Key is the state and the value is an array of matching strings - * for that state. If no argument is passed, the entire accepted strings dataset is returned. - * - * @return array> - */ - public function getAcceptedStrings(array $matchingStates = []): array; + public function has(int $state): bool; } diff --git a/src/StateSetIndex.php b/src/StateSetIndex.php index bef8f61..4c267b9 100644 --- a/src/StateSetIndex.php +++ b/src/StateSetIndex.php @@ -3,6 +3,7 @@ namespace Toflar\StateSetIndex; use Toflar\StateSetIndex\Alphabet\AlphabetInterface; +use Toflar\StateSetIndex\DataStore\DataStoreInterface; use Toflar\StateSetIndex\StateSet\CostAnnotatedStateSet; use Toflar\StateSetIndex\StateSet\StateSetInterface; @@ -13,58 +14,19 @@ class StateSetIndex */ private array $indexCache = []; + /** + * @var array + */ + private array $matchingStatesCache = []; + public function __construct( private Config $config, private AlphabetInterface $alphabet, - private StateSetInterface $stateSet + public StateSetInterface $stateSet, + private DataStoreInterface $dataStore, ) { } - public function getConfig(): Config - { - return $this->config; - } - - public function getAlphabet(): AlphabetInterface - { - return $this->alphabet; - } - - public function getStateSet(): StateSetInterface - { - return $this->stateSet; - } - - /** - * Indexes an array of strings and returns an array where all strings have their state assigned. 
- * - * @return array - */ - public function index(array $strings): array - { - $assigned = []; - - foreach ($strings as $string) { - if (isset($this->indexCache[$string])) { - $assigned[$string] = $this->indexCache[$string]; - continue; - } - - $state = 0; - $this->loopOverEveryCharacter($string, function (int $mappedChar) use (&$state) { - $newState = (int) ($state * $this->config->getAlphabetSize() + $mappedChar); - - $this->stateSet->add($newState, $state, $mappedChar); - $state = $newState; - }); - - $assigned[$string] = $this->indexCache[$string] = $state; - $this->stateSet->acceptString($state, $string); - } - - return $assigned; - } - /** * Returns the matching strings. * @@ -73,11 +35,16 @@ public function index(array $strings): array public function find(string $string, int $editDistance): array { $acceptedStringsPerState = $this->findAcceptedStrings($string, $editDistance); - + $stringLength = mb_strlen($string); $filtered = []; foreach ($acceptedStringsPerState as $acceptedStrings) { foreach ($acceptedStrings as $acceptedString) { + // Early aborts (cheaper) for cases we know are absolutely never going to match + if (abs($stringLength - mb_strlen($acceptedString)) > $editDistance) { + continue; + } + if (Levenshtein::distance($string, $acceptedString) <= $editDistance) { $filtered[] = $acceptedString; } @@ -95,9 +62,7 @@ public function find(string $string, int $editDistance): array */ public function findAcceptedStrings(string $string, int $editDistance): array { - $states = $this->findMatchingStates($string, $editDistance); - - return $this->stateSet->getAcceptedStrings($states); + return $this->dataStore->getForStates($this->findMatchingStates($string, $editDistance)); } /** @@ -105,11 +70,19 @@ public function findAcceptedStrings(string $string, int $editDistance): array * * @return array */ - public function findMatchingStates(string $string, int $editDistance) + public function findMatchingStates(string $string, int $editDistance): array { + 
$cacheKey = $string . ';' . $editDistance; + + // Seen this already, skip + if (isset($this->matchingStatesCache[$cacheKey])) { + return $this->matchingStatesCache[$cacheKey]; + } + + // Initial states $states = $this->getReachableStates(0, $editDistance); - $this->loopOverEveryCharacter($string, function (int $mappedChar) use (&$states, $editDistance) { + $this->loopOverEveryCharacter($string, function (int $mappedChar, $char) use (&$states, $editDistance) { $nextStates = new CostAnnotatedStateSet(); foreach ($states->all() as $state => $cost) { @@ -120,14 +93,18 @@ public function findMatchingStates(string $string, int $editDistance) $newStates->add($state, $cost + 1); } - foreach ($this->stateSet->getChildrenOfState($state) as $childState) { - $childChar = $this->stateSet->getCharForState($childState); - if ($childChar === $mappedChar) { - // Match - $newStates->add($childState, $cost); - } elseif ($cost + 1 <= $editDistance) { - // Substitution - $newStates->add($childState, $cost + 1); + // Match & Substitution + for ($i = 1; $i <= $this->config->getAlphabetSize(); $i++) { + $newState = (int) ($state * $this->config->getAlphabetSize() + $i); + + if ($this->stateSet->has($newState)) { + if ($i === $this->getAlphabet()->map($char, $this->config->getAlphabetSize())) { + // Match + $newStates->add($newState, $cost); + } elseif ($cost + 1 <= $editDistance) { + // Substitution + $newStates->add($newState, $cost + 1); + } } } @@ -144,21 +121,53 @@ public function findMatchingStates(string $string, int $editDistance) $states = $nextStates; }); - return $states->states(); + return $this->matchingStatesCache[$cacheKey] = $states->states(); + } + + public function getAlphabet(): AlphabetInterface + { + return $this->alphabet; + } + + public function getConfig(): Config + { + return $this->config; + } + + public function getStateSet(): StateSetInterface + { + return $this->stateSet; } /** - * @param \Closure(int) $closure + * Indexes an array of strings and returns an 
array where all strings have their state assigned. + * + * @return array */ - private function loopOverEveryCharacter(string $string, \Closure $closure): void + public function index(array $strings): array { - $indexedSubstringLength = min($this->config->getIndexLength(), mb_strlen($string)); - $indexedSubstring = mb_substr($string, 0, $indexedSubstringLength); + $assigned = []; - foreach (mb_str_split($indexedSubstring) as $char) { - $mappedChar = $this->alphabet->map($char, $this->config->getAlphabetSize()); - $closure($mappedChar); + foreach ($strings as $string) { + // Seen this already, skip + if (isset($this->indexCache[$string])) { + $assigned[$string] = $this->indexCache[$string]; + continue; + } + + $state = 0; + $this->loopOverEveryCharacter($string, function (int $mappedChar) use (&$state) { + $newState = (int) ($state * $this->config->getAlphabetSize() + $mappedChar); + + $this->stateSet->add($newState); + $state = $newState; + }); + + $assigned[$string] = $this->indexCache[$string] = $state; + $this->dataStore->add($state, $string); } + + return $assigned; } private function getReachableStates(int $startState, int $editDistance, int $currentDistance = 0): CostAnnotatedStateSet @@ -172,10 +181,29 @@ private function getReachableStates(int $startState, int $editDistance, int $cur // A state is always able to reach itself $reachable->add($startState, $currentDistance); - foreach ($this->stateSet->getChildrenOfState($startState) as $child) { - $reachable = $reachable->mergeWith($this->getReachableStates($child, $editDistance, $currentDistance + 1)); + for ($i = 0; $i <= $editDistance; $i++) { + for ($c = 0; $c < $this->config->getAlphabetSize(); $c++) { + $state = $startState + $c * $i; + if ($this->stateSet->has($state)) { + $reachable->add($startState, $currentDistance); + } + } } return $reachable; } + + /** + * @param \Closure(int) $closure + */ + private function loopOverEveryCharacter(string $string, \Closure $closure): void + { + 
$indexedSubstringLength = min($this->config->getIndexLength(), mb_strlen($string)); + $indexedSubstring = mb_substr($string, 0, $indexedSubstringLength); + + foreach (mb_str_split($indexedSubstring) as $char) { + $mappedChar = $this->alphabet->map($char, $this->config->getAlphabetSize()); + $closure($mappedChar, $char); + } + } } diff --git a/tests/Alphabet/Utf8AlphabetTest.php b/tests/Alphabet/Utf8AlphabetTest.php new file mode 100644 index 0000000..776c74e --- /dev/null +++ b/tests/Alphabet/Utf8AlphabetTest.php @@ -0,0 +1,22 @@ +assertSame(98, $alphabet->map('a', 100)); // a is #97 + $this->assertSame(8, $alphabet->map('a', 10)); + $this->assertSame(8, $alphabet->map('a', 10)); // Testing repetitive calls + $this->assertSame(5, $alphabet->map('@', 10)); + $this->assertSame(4, $alphabet->map('!', 10)); + $this->assertSame(4, $alphabet->map('é', 10)); + $this->assertSame(10, $alphabet->map('愛', 10)); + $this->assertSame(1, $alphabet->map('(', 10)); // Must be 1, not 0 + } +} \ No newline at end of file diff --git a/tests/LevenshteinTest.php b/tests/LevenshteinTest.php new file mode 100644 index 0000000..718cbe5 --- /dev/null +++ b/tests/LevenshteinTest.php @@ -0,0 +1,18 @@ +assertSame(1, Levenshtein::distance('hello', 'helo')); + $this->assertSame(2, Levenshtein::distance('hello', 'heo')); + $this->assertSame(1, Levenshtein::distance('héllo', 'hello')); + $this->assertSame(2, Levenshtein::distance('garçonnière', 'garconniere')); + $this->assertSame(1, Levenshtein::distance('garçonnière', 'garçonniere')); + } +} \ No newline at end of file diff --git a/tests/StateSetIndexTest.php b/tests/StateSetIndexTest.php index 15b7332..1b887c1 100644 --- a/tests/StateSetIndexTest.php +++ b/tests/StateSetIndexTest.php @@ -4,7 +4,9 @@ use PHPUnit\Framework\TestCase; use Toflar\StateSetIndex\Alphabet\InMemoryAlphabet; +use Toflar\StateSetIndex\Alphabet\Utf8Alphabet; use Toflar\StateSetIndex\Config; +use Toflar\StateSetIndex\DataStore\InMemoryDataStore; use 
Toflar\StateSetIndex\StateSet\InMemoryStateSet; use Toflar\StateSetIndex\StateSetIndex; @@ -12,8 +14,6 @@ class StateSetIndexTest extends TestCase { public function testResultsMatchResearchPaper(): void { - $stringSet = ['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']; - $stateSetIndex = new StateSetIndex( new Config(6, 4), new InMemoryAlphabet([ @@ -29,13 +29,30 @@ public function testResultsMatchResearchPaper(): void 'm' => 2, 'a' => 3, ]), - new InMemoryStateSet() + new InMemoryStateSet(), + new InMemoryDataStore() ); - $stateSetIndex->index($stringSet); + $stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']); - $this->assertSame([467, 104, 419, 1677, 1811], $stateSetIndex->findMatchingStates('Mustre', 2)); + $this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustre', 2)); $this->assertSame([1811 => ['Mueller'], 1677 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2)); $this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2)); + + // Should consider transposition (Damerau-Levenshtein) as distance of 2 + $this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustremann', 2)); + $this->assertSame(['Mustermann'], $stateSetIndex->find('Mustremann', 2)); + $this->assertSame([419], $stateSetIndex->findMatchingStates('Mustremann', 1)); + $this->assertSame([], $stateSetIndex->find('Mustremann', 1)); + } + + public function testWithUtf8Alphabet(): void + { + $stateSetIndex = new StateSetIndex(new Config(6, 4), new Utf8Alphabet(), new InMemoryStateSet(), new InMemoryDataStore()); + $stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']); + + $this->assertSame([177, 710, 2710, 2843], $stateSetIndex->findMatchingStates('Mustre', 2)); + $this->assertSame([2710 => ['Mueller'], 2843 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2)); + $this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 
2)); } } \ No newline at end of file