Skip to content

Commit

Permalink
Version 2 (#1)
Browse files Browse the repository at this point in the history
* Added an Utf8Alphabet implementation

* Completely revamped implementation. Dropped hierarchical logic to reduce storage requirements

* Added tests for the Levenshtein implementation

* Fixed Utf8Alphabet

* Added support for Damerau-Levenshtein

* Fixed algorithm

* Removed Damerau-Levenshtein attempt

* CS
  • Loading branch information
Toflar committed Sep 29, 2023
1 parent a4277c0 commit 7ff7662
Show file tree
Hide file tree
Showing 16 changed files with 309 additions and 191 deletions.
23 changes: 15 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,16 @@ composer require toflar/state-set-index
```php
namespace App;

use Toflar\StateSetIndex\Alphabet\InMemoryAlphabet;
use Toflar\StateSetIndex\Alphabet\Utf8Alphabet;
use Toflar\StateSetIndex\DataStore\InMemoryDataStore;
use Toflar\StateSetIndex\StateSet\InMemoryStateSet;
use Toflar\StateSetIndex\StateSetIndex;

$stateSetIndex = new StateSetIndex(
new Config(6, 4),
new InMemoryAlphabet(),
new InMemoryStateSet()
new Utf8Alphabet(),
new InMemoryStateSet(),
new InMemoryDataStore()
);

$stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);
Expand All @@ -44,15 +46,20 @@ you want to index and or search.
## Customization

This library ships with the algorithm readily prepared for you to use. The main customization areas will be
the alphabet (both the way it maps characters to labels) as well as the state set storage, if you want to make the index
the alphabet (both the way it maps characters to labels) and the state set storage, if you want to make the index
persistent. Hence, there are three interfaces that allow you to implement your own logic:

* The `AlphabetInterface` is very straight-forward. It only consists of a `map(string $char, int $alphabetSize)` method
which the library needs to map characters to an internal label. Whether you load/store the alphabet in some
database is up to you. The library ships with an `InMemoryAlphabet` for reference and simple use cases.
* The `StateSetInterface` is more complex but is essentially responsible to load and store information about the
state set of your index. Again, whether you load/store the state set in some
database is up to you. The library ships with an `InMemoryStateSet` for reference and simple use cases.
database is up to you. The library ships with an `InMemoryAlphabet` for reference and simple use cases. You don't
even need to store the alphabet, because the UTF-8 code points already provide one; that's what `Utf8Alphabet` is
for. In case you don't want to customize the labels, use `Utf8Alphabet`.
* The `StateSetInterface` is responsible for loading and storing information about the state set of your index. Again,
how you load/store the state set in some database is up to you. The library ships with an `InMemoryStateSet`
for reference and simple use cases and tests.
* The `DataStoreInterface` is responsible for storing the string you index alongside its assigned state. Sometimes
you want to completely customize storage in which case you can use the `NullDataStore` and only use the
assignments you get as a return value from calling `$stateSetIndex->index()`.

You can not only ask for the final matching results using `$stateSetIndex->findMatchingStates('Mustre', 2)` which is
already filtered using a multibyte implementation of the Levenshtein algorithm, but you can also access intermediary
Expand Down
9 changes: 6 additions & 3 deletions ecs.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

declare(strict_types=1);

use PhpCsFixer\Fixer\Import\NoUnusedImportsFixer;
use PhpCsFixer\Fixer\Phpdoc\NoSuperfluousPhpdocTagsFixer;
use PhpCsFixer\Fixer\ClassNotation\OrderedClassElementsFixer;
use PhpCsFixer\Fixer\FunctionNotation\NativeFunctionInvocationFixer;
use PhpCsFixer\Fixer\Operator\NotOperatorWithSuccessorSpaceFixer;
use Symplify\EasyCodingStandard\Config\ECSConfig;
use Symplify\EasyCodingStandard\ValueObject\Set\SetList;

Expand All @@ -23,5 +24,7 @@
]);

// Always move private elements to the bottom
$ecsConfig->rule(\PhpCsFixer\Fixer\ClassNotation\OrderedClassElementsFixer::class);
$ecsConfig->ruleWithConfiguration(OrderedClassElementsFixer::class, ['sort_algorithm' => 'alpha']);
$ecsConfig->rule(NativeFunctionInvocationFixer::class);
$ecsConfig->skip([NotOperatorWithSuccessorSpaceFixer::class]);
};
18 changes: 9 additions & 9 deletions src/Alphabet/InMemoryAlphabet.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,31 @@ public function __construct(
) {
}

public function all(): array
public function add(string $char, int $label): self
{
return $this->alphabet;
$this->alphabet[$char] = $label;

return $this;
}

public function has(string $char): bool
public function all(): array
{
return isset($this->alphabet[$char]);
return $this->alphabet;
}

public function count(): int
{
return count($this->alphabet);
return \count($this->alphabet);
}

public function get(string $char): ?int
{
return $this->alphabet[$char] ?? null;
}

public function add(string $char, int $label): self
public function has(string $char): bool
{
$this->alphabet[$char] = $label;

return $this;
return isset($this->alphabet[$char]);
}

public function map(string $char, int $alphabetSize): int
Expand Down
21 changes: 21 additions & 0 deletions src/Alphabet/Utf8Alphabet.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

namespace Toflar\StateSetIndex\Alphabet;

/**
 * Alphabet that derives labels directly from UTF-8 code points, so no
 * alphabet has to be stored anywhere.
 */
class Utf8Alphabet implements AlphabetInterface
{
    /**
     * Memoized labels, keyed first by alphabet size and then by character.
     *
     * @var array<int, array<string, int>>
     */
    private array $cache = [];

    /**
     * Maps a character to its label for the given alphabet size.
     *
     * The label is the character's UTF-8 code point modulo $alphabetSize,
     * shifted by one so that 0 is never assigned. For a positive
     * $alphabetSize the labels therefore range from 1 to $alphabetSize.
     */
    public function map(string $char, int $alphabetSize): int
    {
        // Compute the label only on the first lookup of this size/char pair.
        return $this->cache[$alphabetSize][$char] ??= (mb_ord($char, 'UTF-8') % $alphabetSize) + 1;
    }
}
8 changes: 4 additions & 4 deletions src/Config.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ public function __construct(
) {
}

public function getIndexLength(): int
public function getAlphabetSize(): int
{
return $this->indexLength;
return $this->alphabetSize;
}

public function getAlphabetSize(): int
public function getIndexLength(): int
{
return $this->alphabetSize;
return $this->indexLength;
}
}
16 changes: 16 additions & 0 deletions src/DataStore/DataStoreInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace Toflar\StateSetIndex\DataStore;

/**
 * Storage for the strings that were indexed, kept alongside the state each
 * string was assigned to.
 */
interface DataStoreInterface
{
/**
 * Records the given string under the given state.
 */
public function add(int $state, string $string): void;

/**
* Returns the matching strings per state. Key is the state and the value is an array of matching strings
* for that state.
*
* What an empty $states array means is left to the implementation: InMemoryDataStore returns its whole
* dataset in that case, whereas NullDataStore always returns an empty array.
*
* @param array $states States to look up (presumably a list of int states - TODO confirm against callers)
*
* @return array<int,array<string>>
*/
public function getForStates(array $states = []): array;
}
30 changes: 30 additions & 0 deletions src/DataStore/InMemoryDataStore.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php

namespace Toflar\StateSetIndex\DataStore;

/**
 * Data store that keeps every indexed string in a plain PHP array for the
 * lifetime of the object.
 */
class InMemoryDataStore implements DataStoreInterface
{
    /**
     * Indexed strings grouped by the state they were assigned to.
     *
     * @var array<int, array<string>>
     */
    private array $data = [];

    /**
     * Appends a string to the list kept for the given state.
     */
    public function add(int $state, string $string): void
    {
        $this->data[$state] ??= [];
        $this->data[$state][] = $string;
    }

    /**
     * Returns everything stored so far, keyed by state.
     *
     * @return array<int, array<string>>
     */
    public function all(): array
    {
        return $this->data;
    }

    /**
     * Returns the stored strings for the requested states only.
     *
     * An empty $states array means "no filter": the whole dataset is returned.
     *
     * @return array<int, array<string>>
     */
    public function getForStates(array $states = []): array
    {
        // Flipping turns the requested states into keys so they can be
        // intersected with the state-keyed dataset.
        return [] === $states
            ? $this->data
            : array_intersect_key($this->data, array_flip($states));
    }
}
16 changes: 16 additions & 0 deletions src/DataStore/NullDataStore.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace Toflar\StateSetIndex\DataStore;

/**
 * Data store that stores nothing. Intended for setups that customize storage
 * completely and only work with the assignments returned when indexing.
 */
class NullDataStore implements DataStoreInterface
{
    /**
     * Deliberately discards the given state/string pair.
     */
    public function add(int $state, string $string): void
    {
        // Nothing is stored by design.
    }

    /**
     * Always yields an empty result because nothing is ever stored.
     *
     * @return array<int, array<string>>
     */
    public function getForStates(array $states = []): array
    {
        return [];
    }
}
2 changes: 1 addition & 1 deletion src/Levenshtein.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class Levenshtein
{
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1)
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1): int
{
$string1 = mb_convert_encoding($string1, 'ASCII', 'utf8');
$string2 = mb_convert_encoding($string2, 'ASCII', 'utf8');
Expand Down
34 changes: 18 additions & 16 deletions src/StateSet/CostAnnotatedStateSet.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,13 @@ class CostAnnotatedStateSet
*/
private array $set = [];

/**
* Key: State
* Value: Cost
* @return array<int, int>
*/
public function all(): array
{
return $this->set;
}

public function states(): array
{
return array_values(array_keys($this->set));
}

/**
* Adds a state with a cost to this set.
* If this sets already contains the given state with a higher cost, replaces it.
*/
public function add(int $state, int $cost): void
{
if (! isset($this->set[$state])) {
if (!isset($this->set[$state])) {
$this->set[$state] = $cost;
return;
}
Expand All @@ -43,6 +28,16 @@ public function add(int $state, int $cost): void
}
}

/**
* Key: State
* Value: Cost
* @return array<int, int>
*/
public function all(): array
{
return $this->set;
}

public function mergeWith(CostAnnotatedStateSet $stateSet): self
{
$clone = clone $this;
Expand All @@ -53,4 +48,11 @@ public function mergeWith(CostAnnotatedStateSet $stateSet): self

return $clone;
}

/**
 * Lists the states contained in this set in ascending order.
 *
 * @return array<int>
 */
public function states(): array
{
    // array_keys() already returns a zero-based list, so no re-indexing is needed.
    $sorted = array_keys($this->set);
    sort($sorted);

    return $sorted;
}
}
70 changes: 9 additions & 61 deletions src/StateSet/InMemoryStateSet.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,77 +5,25 @@
class InMemoryStateSet implements StateSetInterface
{
/**
* Key: State
* Value: array<parent,mappedChar>
*
* @var array<int, array<int,int>>
* @param array $states array<int, bool>
*/
private array $states = [];

/**
* @var array<int, array<int>>
*/
private array $children = [];

/**
* Key: State
* Value: Mapped char
* @var array<int, int>
*/
private array $mappedChars = [];

/**
* Key: State
* Value: Matching strings
* @var array<int, array<string>>
*/
private array $acceptedStrings = [];
public function __construct(
private array $states = []
) {
}

public function add(int $state, int $parentState, int $mappedChar): self
public function add(int $state): void
{
$this->states[$state] = [$parentState, $mappedChar];
$this->mappedChars[$state] = $mappedChar;
$this->children[$parentState][$state] = true;

return $this;
$this->states[$state] = true;
}

public function all(): array
{
return $this->states;
}

public function getChildrenOfState(int $state): array
{
if (! isset($this->children[$state])) {
return [];
}

return array_keys($this->children[$state]);
}

public function getCharForState(int $state): int
{
if (! isset($this->mappedChars[$state])) {
throw new \LogicException('No mapped char for state. Check your alphabet!');
}

return $this->mappedChars[$state];
}

public function acceptString(int $state, string $string): self
{
$this->acceptedStrings[$state][] = $string;

return $this;
}

public function getAcceptedStrings(array $matchingStates = []): array
public function has(int $state): bool
{
if ([] === $matchingStates) {
return $this->acceptedStrings;
}

return array_intersect_key($this->acceptedStrings, array_flip($matchingStates));
return isset($this->states[$state]);
}
}
18 changes: 4 additions & 14 deletions src/StateSet/StateSetInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,12 @@

interface StateSetInterface
{
public function add(int $state, int $parentState, int $mappedChar): self;

public function getChildrenOfState(int $state): array;

public function getCharForState(int $state): int;
public function add(int $state): void;

/**
* Accept a string with a given state.
* @return array<int>
*/
public function acceptString(int $state, string $string): self;
public function all(): array;

/**
* Returns the matching strings per state. Key is the state and the value is an array of matching strings
* for that state. If no argument is passed, the entire accepted strings dataset is returned.
*
* @return array<int,array<string>>
*/
public function getAcceptedStrings(array $matchingStates = []): array;
public function has(int $state): bool;
}

0 comments on commit 7ff7662

Please sign in to comment.