Skip to content

Commit

Permalink
feat(extraction): add extractors
Browse files Browse the repository at this point in the history
Signed-off-by: SphericalKat <amolele@gmail.com>
  • Loading branch information
SphericalKat committed Mar 28, 2021
1 parent e6783f3 commit 29e345e
Show file tree
Hide file tree
Showing 9 changed files with 184 additions and 19 deletions.
6 changes: 3 additions & 3 deletions coverage_badge.svg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
39 changes: 30 additions & 9 deletions example/fuzzywuzzy.dart
@@ -1,22 +1,43 @@
import 'package:fuzzywuzzy/algorithms/weighted_ratio.dart';
import 'package:fuzzywuzzy/extractor.dart';
import 'package:fuzzywuzzy/fuzzywuzzy.dart';

/// Runs each fuzzy-matching helper on sample input and prints the result.
///
/// NOTE(review): this listing appears to come from a rendered diff, so most
/// calls show up twice (once double-quoted, once single-quoted) — presumably
/// the pre- and post-change lines of the commit; confirm against the
/// repository before running it as-is.
void main() {
// Simple ratio
print(ratio("mysmilarstring", "myawfullysimilarstirng"));
print(ratio("mysmilarstring", "mysimilarstring"));
print(ratio('mysmilarstring', 'myawfullysimilarstirng'));
print(ratio('mysmilarstring', 'mysimilarstring'));

// Partial ratio
print(partialRatio("similar", "somewhresimlrbetweenthisstring"));
print(partialRatio('similar', 'somewhresimlrbetweenthisstring'));

// Token sort ratio (partial and simple variants)
print(tokenSortPartialRatio("order words out of", "words out of order"));
print(tokenSortRatio("order words out of", "words out of order"));
print(tokenSortPartialRatio('order words out of', 'words out of order'));
print(tokenSortRatio('order words out of', 'words out of order'));

// Token set ratio (simple and partial variants)
print(tokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"));
print(tokenSetPartialRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"));
print(tokenSetRatio('fuzzy was a bear', 'fuzzy fuzzy fuzzy bear'));
print(tokenSetPartialRatio('fuzzy was a bear', 'fuzzy fuzzy fuzzy bear'));

// Weighted ratio
print(weightedRatio("The quick brown fox jimps ofver the small lazy dog",
"the quick brown fox jumps over the small lazy dog"));
print(weightedRatio('The quick brown fox jimps ofver the small lazy dog',
'the quick brown fox jumps over the small lazy dog'));

// Extract the top 4 choices for the misspelled query, passing a cutoff of 50
// and relying on the default ratio algorithm.
print(
extractTop(
query: 'goolge',
choices: [
'google',
'bing',
'facebook',
'linkedin',
'twitter',
'googleplus',
'bingnews',
'plexoogl'
],
limit: 4,
cutoff: 50
),
);
}
2 changes: 2 additions & 0 deletions lib/algorithms/weighted_ratio.dart
Expand Up @@ -8,6 +8,8 @@ class WeightedRatio implements Applicable {
static const PARTIAL_SCALE = 0.90;
static const TRY_PARTIALS = true;

const WeightedRatio();

@override
int apply(String s1, String s2) {
var len1 = s1.length;
Expand Down
75 changes: 73 additions & 2 deletions lib/extractor.dart
@@ -1,5 +1,76 @@
import 'package:fuzzywuzzy/applicable.dart';
import 'package:fuzzywuzzy/model/extracted_result.dart';
import 'package:collection/collection.dart';

/// Extracts the choices that best match a query string from a list.
///
/// Every choice is scored with a caller-supplied [Applicable]; choices whose
/// score is below the cutoff given to the constructor are discarded.
class Extractor {
  /// Minimum score (inclusive) a choice needs in order to be reported.
  final int _cutoff;

  /// Creates an extractor that drops choices scoring below the given cutoff.
  ///
  /// The cutoff defaults to 0.
  Extractor([this._cutoff = 0]);

  /// Scores every entry of [choices] against [query] using [func].
  ///
  /// Returns one [ExtractedResult] per choice whose score is at least the
  /// cutoff, in the same order as [choices]. Each result records the position
  /// of its choice within [choices].
  List<ExtractedResult> extractWithoutOrder(
      String query, List<String> choices, Applicable func) {
    var yields = <ExtractedResult>[];
    var index = 0;

    for (var s in choices) {
      var score = func.apply(query, s);

      if (score >= _cutoff) {
        yields.add(ExtractedResult(s, score, index));
      }
      // Advance even for rejected choices so indices refer to [choices].
      index++;
    }

    return yields;
  }

  /// Returns the single highest-scoring choice that meets the cutoff.
  ///
  /// When several choices share the highest score, the one that appears last
  /// in [choices] is returned.
  ///
  /// Throws a [StateError] if no choice meets the cutoff, which includes the
  /// case where [choices] is empty.
  ExtractedResult extractOne(
      String query, List<String> choices, Applicable func) {
    var extracted = extractWithoutOrder(query, choices, func);

    return extracted.reduce(
        (value, element) => value.score > element.score ? value : element);
  }

  /// Returns every choice that meets the cutoff, ordered from the highest
  /// score to the lowest.
  List<ExtractedResult> extractSorted(
      String query, List<String> choices, Applicable func) {
    var best = extractWithoutOrder(query, choices, func)..sort();
    return best.reversed.toList();
  }

  /// Returns up to [limit] of the highest-scoring choices that meet the
  /// cutoff, ordered from the highest score to the lowest.
  ///
  /// Returns an empty list when [limit] is zero or negative.
  List<ExtractedResult> extractTop(
      String query, List<String> choices, Applicable func, int limit) {
    var best = extractWithoutOrder(query, choices, func);
    var results = _findTopKHeap(best, limit);
    // The helper yields ascending order; callers expect best-first.
    return results.reversed.toList();
  }

  /// Selects the [k] largest entries of [arr], returned in ascending order.
  ///
  /// Keeps a priority queue of at most [k] entries whose first element is the
  /// smallest entry retained so far, so that entry is the one replaced when a
  /// larger candidate arrives.
  List<ExtractedResult> _findTopKHeap(List<ExtractedResult> arr, int k) {
    // Nothing can be selected; this also prevents `pq.first` below from
    // being read on an empty queue, which would throw.
    if (k <= 0) {
      return [];
    }

    var pq = PriorityQueue<ExtractedResult>();
    for (var x in arr) {
      if (pq.length < k) {
        pq.add(x);
      } else if (x.compareTo(pq.first) > 0) {
        pq.removeFirst();
        pq.add(x);
      }
    }

    // Drain smallest-first; the queue never holds more than k entries.
    var res = <ExtractedResult>[];
    while (pq.isNotEmpty) {
      res.add(pq.removeFirst());
    }
    return res;
  }
}
50 changes: 48 additions & 2 deletions lib/fuzzywuzzy.dart
@@ -1,40 +1,86 @@
import 'package:fuzzywuzzy/applicable.dart';
import 'package:fuzzywuzzy/extractor.dart';
import 'package:fuzzywuzzy/model/extracted_result.dart';

import 'algorithms/token_set.dart';
import 'algorithms/token_sort.dart';
import 'algorithms/weighted_ratio.dart';
import 'ratios/partial_ratio.dart';
import 'ratios/simple_ratio.dart';

/// Returns the Levenshtein simple ratio of [s1] and [s2].
///
/// The result is a measure of how similar the two strings are, as computed
/// by [SimpleRatio].
int ratio(String s1, String s2) => SimpleRatio().apply(s1, s2);

/// Returns the partial ratio of [s1] and [s2], as computed by [PartialRatio].
///
/// Intended for strings of noticeably different lengths, where inconsistent
/// substrings make a plain ratio misleading; it relies on the "best partial"
/// heuristic.
int partialRatio(String s1, String s2) => PartialRatio().apply(s1, s2);

/// Returns the token-sort ratio of [s1] and [s2].
///
/// The alphanumeric tokens of each string are sorted and re-joined, and the
/// resulting strings are compared with [SimpleRatio].
int tokenSortRatio(String s1, String s2) {
  final scorer = SimpleRatio();
  return TokenSort().apply(s1, s2, scorer);
}

/// Returns the partial token-sort ratio of [s1] and [s2].
///
/// The alphanumeric tokens of each string are sorted and re-joined, and the
/// resulting strings are compared with [PartialRatio].
int tokenSortPartialRatio(String s1, String s2) {
  final scorer = PartialRatio();
  return TokenSort().apply(s1, s2, scorer);
}

/// Returns the token-set ratio of [s1] and [s2].
///
/// Both strings are split into tokens; the intersection and the remainders
/// of the two token sets are used to build comparison strings, which are then
/// scored with [SimpleRatio]. Useful when words appear redundantly.
int tokenSetRatio(String s1, String s2) {
  final scorer = SimpleRatio();
  return TokenSet().apply(s1, s2, scorer);
}

/// Returns the partial token-set ratio of [s1] and [s2].
///
/// Both strings are split into tokens; the intersection and the remainders
/// of the two token sets are used to build comparison strings, which are then
/// scored with [PartialRatio]. Useful when words appear redundantly.
int tokenSetPartialRatio(String s1, String s2) {
  final scorer = PartialRatio();
  return TokenSet().apply(s1, s2, scorer);
}

/// Returns a weighted ratio between [s1] and [s2], using the best option from
/// the above fuzzy matching algorithms.
///
/// Both strings are lower-cased before they are handed to [WeightedRatio].
///
/// Example:
/// ```dart
/// weightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog") // 97
/// ```
int weightedRatio(String s1, String s2) {
  final first = s1.toLowerCase();
  final second = s2.toLowerCase();
  return WeightedRatio().apply(first, second);
}

/// Returns the top [limit] entries of [choices] that are most similar to
/// [query], as a sorted list of [ExtractedResult].
///
/// Items scoring below [cutoff] (default 0) are rejected. Scores are computed
/// with [ratio], which defaults to [WeightedRatio].
List<ExtractedResult> extractTop(
        {required String query,
        required List<String> choices,
        required int limit,
        int cutoff = 0,
        Applicable ratio = const WeightedRatio()}) =>
    Extractor(cutoff).extractTop(query, choices, ratio, limit);

/// Returns a sorted list of [ExtractedResult] for the entries of [choices]
/// matched against [query].
///
/// [cutoff] (default 0) is forwarded to the [Extractor] that performs the
/// extraction. Scores are computed with [ratio], which defaults to
/// [WeightedRatio].
List<ExtractedResult> extractSorted(
        {required String query,
        required List<String> choices,
        int cutoff = 0,
        Applicable ratio = const WeightedRatio()}) =>
    Extractor(cutoff).extractSorted(query, choices, ratio);
19 changes: 19 additions & 0 deletions lib/model/extracted_result.dart
@@ -0,0 +1,19 @@
/// One scored match produced when extracting results from a list.
///
/// Instances order themselves by [score] alone.
class ExtractedResult implements Comparable<ExtractedResult> {
  /// The matched text.
  final String string;

  /// The score assigned to [string].
  final int score;

  /// The index associated with [string].
  final int index;

  /// Builds a result from its [string], [score] and [index].
  ExtractedResult(this.string, this.score, this.index);

  /// Compares this result to [other] by [score] only.
  @override
  int compareTo(ExtractedResult other) => score.compareTo(other.score);

  /// Renders the three fields in a single parenthesised line.
  @override
  String toString() => '(string $string, score: $score, index: $index)';
}
7 changes: 5 additions & 2 deletions lib/ratios/partial_ratio.dart
Expand Up @@ -17,8 +17,7 @@ class PartialRatio implements Applicable {
longer = s1;
}

var matchingBlocks =
DiffUtils.getMatchingBlocks(shorter, longer);
var matchingBlocks = DiffUtils.getMatchingBlocks(shorter, longer);

var scores = <double>[];

Expand All @@ -34,6 +33,10 @@ class PartialRatio implements Applicable {

var ratio = DiffUtils.getRatio(shorter, longSubstr);

if (ratio.isNaN) {
continue;
}

if (ratio > 0.995) {
return 100;
} else {
Expand Down
2 changes: 1 addition & 1 deletion pubspec.lock
Expand Up @@ -58,7 +58,7 @@ packages:
source: hosted
version: "1.1.0"
collection:
dependency: transitive
dependency: "direct main"
description:
name: collection
url: "https://pub.dartlang.org"
Expand Down
3 changes: 3 additions & 0 deletions pubspec.yaml
Expand Up @@ -7,6 +7,9 @@ repository: "https://github.com/sphericalkat/dart-fuzzywuzzy"
environment:
sdk: ">=2.12.0 <3.0.0"

dependencies:
collection: ^1.15.0

dev_dependencies:
test:
test_coverage: ^0.4.2
Expand Down

0 comments on commit 29e345e

Please sign in to comment.