From f52609d0c7579ad3d055e4b19342c71bbbf0d813 Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Wed, 27 May 2020 14:35:44 +0200 Subject: [PATCH 1/9] Added jaro_winkler first version --- strings/jaro_winkler.py | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 strings/jaro_winkler.py diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py new file mode 100644 index 000000000000..1136c6ca9f48 --- /dev/null +++ b/strings/jaro_winkler.py @@ -0,0 +1,42 @@ +""" https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance """ +import math + + +def jaro_winkler(first_string: str, second_string: str) -> float: + """ + >>> jaro_winkler("something", "gmosehitt") + 52 + """ + max_distance = math.floor(max(len(first_string), len(second_string)) / 2) - 1 + match_count = 0 + + # matching characters + for i, c1 in enumerate(first_string): + for j, c2 in enumerate(second_string): + if c1 == c2 and abs(i - j) < max_distance: + match_count += 1 + + # transposition + not_match = 0 + for c1, c2 in zip(first_string, second_string): + if c1 != c2: + not_match += 1 + + if not match_count: + jaro = 0 + else: + jaro = 1/3 * (match_count/len(first_string) + match_count/len(second_string) + (match_count - not_match/2)/match_count) + + # common prefix + prefix_len = 0 + for c1, c2 in zip(first_string[:4], second_string[:4]): + if c1 == c2: + prefix_len += 1 + else: + break + + return jaro + 0.1 * prefix_len * (1 - jaro) + + +if __name__ == '__main__': + print(jaro_winkler("martha", "marhta")) \ No newline at end of file From bc32a9f8d28ad4226a2204ee73b3e293ac048c46 Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Wed, 27 May 2020 14:52:25 +0200 Subject: [PATCH 2/9] Added doctests --- strings/jaro_winkler.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index 1136c6ca9f48..27fc91cb83ae 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -4,13 +4,31 @@ def jaro_winkler(first_string: str, second_string: str) -> float: """ - >>> jaro_winkler("something", "gmosehitt") - 52 + Jaro–Winkler distance is a string metric measuring an edit distance between two sequences. + Output value is between 0.0 and 1.0. + + >>> jaro_winkler("martha", "marhta") + 0.9611111111111111 + + >>> jaro_winkler("CRATE", "TRACE") + 0.6222222222222222 + + >>> jaro_winkler("test", "dbdbdbdb") + 0.0 + + >>> jaro_winkler("test", "test") + 1.0 + + >>> jaro_winkler("hello world", "HeLLo W0rlD") + 0.5303030303030303 + + >>> jaro_winkler("test", "") + 0.0 """ max_distance = math.floor(max(len(first_string), len(second_string)) / 2) - 1 - match_count = 0 # matching characters + match_count = 0 for i, c1 in enumerate(first_string): for j, c2 in enumerate(second_string): if c1 == c2 and abs(i - j) < max_distance: @@ -25,9 +43,12 @@ def jaro_winkler(first_string: str, second_string: str) -> float: if not match_count: jaro = 0 else: - jaro = 1/3 * (match_count/len(first_string) + match_count/len(second_string) + (match_count - not_match/2)/match_count) + jaro = 1/3 * ( + match_count/len(first_string) + + match_count/len(second_string) + + (match_count - not_match/2)/match_count) - # common prefix + # common prefix up to 4 characters prefix_len = 0 for c1, c2 in zip(first_string[:4], second_string[:4]): if c1 == c2: @@ -39,4 +60,4 @@ def jaro_winkler(first_string: str, second_string: str) -> float: if __name__ == '__main__': - print(jaro_winkler("martha", "marhta")) \ No newline at end of file + print(jaro_winkler("hello", "world")) From a07914f3fa1297f75db9307cb1ef7e65c9a265e4 Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Wed, 27 May 2020 14:58:31 +0200 Subject: [PATCH 3/9] Fix flake warnings --- strings/jaro_winkler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index 27fc91cb83ae..52fa8b903157 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -43,10 +43,10 @@ def jaro_winkler(first_string: str, second_string: str) -> float: if not match_count: jaro = 0 else: - jaro = 1/3 * ( - match_count/len(first_string) + - match_count/len(second_string) + - (match_count - not_match/2)/match_count) + jaro = 1 / 3 * ( + match_count / len(first_string) + + match_count / len(second_string) + + (match_count - not_match / 2) / match_count) # common prefix up to 4 characters prefix_len = 0 From f5d1500ebe437c8b44e6d4e2ca9c4ba83e79b4b3 Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Wed, 27 May 2020 15:10:43 +0200 Subject: [PATCH 4/9] Refactor --- strings/jaro_winkler.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index 52fa8b903157..f633b3b4031e 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -35,13 +35,12 @@ def jaro_winkler(first_string: str, second_string: str) -> float: match_count += 1 # transposition - not_match = 0 - for c1, c2 in zip(first_string, second_string): - if c1 != c2: - not_match += 1 + not_match = len( + [(c1, c2) for c1, c2 in zip(first_string, second_string) if c1 != c2] + ) if not match_count: - jaro = 0 + jaro = 0.0 else: jaro = 1 / 3 * ( match_count / len(first_string) From c39c9de8a6b2c4355517d62cfca04f5a14ed3a11 Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Thu, 28 May 2020 00:29:44 +0200 Subject: [PATCH 5/9] Fixes bug in jaro winkler implementation --- strings/jaro_winkler.py | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index f633b3b4031e..fe9e805586ea 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -2,7 +2,7 @@ import math -def jaro_winkler(first_string: str, second_string: str) -> float: +def jaro_winkler(str1: str, str2: str) -> float: """ Jaro–Winkler distance is a string metric measuring an edit distance between two sequences. Output value is between 0.0 and 1.0. @@ -11,7 +11,7 @@ def jaro_winkler(first_string: str, second_string: str) -> float: 0.9611111111111111 >>> jaro_winkler("CRATE", "TRACE") - 0.6222222222222222 + 0.7333333333333334 >>> jaro_winkler("test", "dbdbdbdb") 0.0 @@ -20,36 +20,50 @@ def jaro_winkler(first_string: str, second_string: str) -> float: 1.0 >>> jaro_winkler("hello world", "HeLLo W0rlD") - 0.5303030303030303 + 0.6363636363636364 >>> jaro_winkler("test", "") 0.0 + + >>> jaro_winkler("hello", "world") + 0.4666666666666666 + + >>> jaro_winkler("hell**o", "*world") + 0.4365079365079365 """ - max_distance = math.floor(max(len(first_string), len(second_string)) / 2) - 1 + + def get_matched_characters(_str1, _str2): + matched = [] + limit = math.floor(min(len(_str1), len(_str2)) / 2) + for i, l in enumerate(_str1): + left, right = int(max(0, i - limit)), int(min(i + limit + 1, len(_str2))) + if l in _str2[left:right]: + matched.append(l) + _str2 = _str2[0:_str2.index(l)] + " " + _str2[_str2.index(l) + 1:] + + return ''.join(matched) # matching characters - match_count = 0 - for i, c1 in enumerate(first_string): - for j, c2 in enumerate(second_string): - if c1 == c2 and abs(i - j) < max_distance: - match_count += 1 + matching_1 = get_matched_characters(str1, str2) + matching_2 = get_matched_characters(str2, str1) + match_count = len(matching_1) # transposition - not_match = len( - [(c1, c2) for c1, c2 in zip(first_string, second_string) if c1 != c2] - ) + transpositions = math.floor(len( + [(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2] + )) / 2 if not match_count: jaro = 0.0 else: jaro = 1 / 3 * ( - match_count / len(first_string) - + match_count / len(second_string) - + (match_count - not_match / 2) / match_count) + match_count / len(str1) + + match_count / len(str2) + + (match_count - transpositions) / match_count) # common prefix up to 4 characters prefix_len = 0 - for c1, c2 in zip(first_string[:4], second_string[:4]): + for c1, c2 in zip(str1[:4], str2[:4]): if c1 == c2: prefix_len += 1 else: From fb505bcc5052e65edc7e7f947c565d25730cd27d Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Thu, 28 May 2020 08:40:40 +0200 Subject: [PATCH 6/9] Commit suggestions --- strings/jaro_winkler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index fe9e805586ea..34d74f06bb99 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -36,10 +36,11 @@ def get_matched_characters(_str1, _str2): matched = [] limit = math.floor(min(len(_str1), len(_str2)) / 2) for i, l in enumerate(_str1): - left, right = int(max(0, i - limit)), int(min(i + limit + 1, len(_str2))) + left = int(max(0, i - limit)) + right = int(min(i + limit + 1, len(_str2))) if l in _str2[left:right]: matched.append(l) - _str2 = _str2[0:_str2.index(l)] + " " + _str2[_str2.index(l) + 1:] + _str2 = f"{_str2[0:_str2.index(l)]} {_str2[_str2.index(l) + 1:]}" return ''.join(matched) From 23b0fac9eeecd1f09ae68596b814449c2c315093 Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Thu, 28 May 2020 08:45:53 +0200 Subject: [PATCH 7/9] Missing comming suggestions --- strings/jaro_winkler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index 34d74f06bb99..7a4571947c05 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -34,7 +34,7 @@ def jaro_winkler(str1: str, str2: str) -> float: def get_matched_characters(_str1, _str2): matched = [] - limit = math.floor(min(len(_str1), len(_str2)) / 2) + limit = min(len(_str1), len(_str2)) // 2 for i, l in enumerate(_str1): left = int(max(0, i - limit)) right = int(min(i + limit + 1, len(_str2))) From d1f65b4c82c6556f3d37db5b3469e59a68a675be Mon Sep 17 00:00:00 2001 From: mateuszz0000 Date: Thu, 28 May 2020 08:56:48 +0200 Subject: [PATCH 8/9] Remove unused math module --- strings/jaro_winkler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index 7a4571947c05..9ce3a6660944 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -1,5 +1,4 @@ """ https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance """ -import math def jaro_winkler(str1: str, str2: str) -> float: @@ -50,9 +49,9 @@ def get_matched_characters(_str1, _str2): match_count = len(matching_1) # transposition - transpositions = math.floor(len( + transpositions = len( [(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2] - )) / 2 + ) // 2 if not match_count: jaro = 0.0 From 8d8d68cfcdb3afe70cdcefba4b9c26214cfbd52d Mon Sep 17 00:00:00 2001 From: John Law Date: Sat, 30 May 2020 16:26:54 +0200 Subject: [PATCH 9/9] Import doctest --- strings/jaro_winkler.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/strings/jaro_winkler.py b/strings/jaro_winkler.py index 9ce3a6660944..73827c2330c0 100644 --- a/strings/jaro_winkler.py +++ b/strings/jaro_winkler.py @@ -1,4 +1,4 @@ -""" https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance """ +"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance""" def jaro_winkler(str1: str, str2: str) -> float: @@ -8,30 +8,23 @@ def jaro_winkler(str1: str, str2: str) -> float: >>> jaro_winkler("martha", "marhta") 0.9611111111111111 - >>> jaro_winkler("CRATE", "TRACE") 0.7333333333333334 - >>> jaro_winkler("test", "dbdbdbdb") 0.0 - >>> jaro_winkler("test", "test") 1.0 - >>> jaro_winkler("hello world", "HeLLo W0rlD") 0.6363636363636364 - >>> jaro_winkler("test", "") 0.0 - >>> jaro_winkler("hello", "world") 0.4666666666666666 - >>> jaro_winkler("hell**o", "*world") 0.4365079365079365 """ - def get_matched_characters(_str1, _str2): + def get_matched_characters(_str1: str, _str2: str) -> str: matched = [] limit = min(len(_str1), len(_str2)) // 2 for i, l in enumerate(_str1): @@ -73,4 +66,6 @@ def get_matched_characters(_str1, _str2): if __name__ == '__main__': + import doctest + doctest.testmod() print(jaro_winkler("hello", "world"))