# NLTK and jarowinkler

In [1]:
from nltk.metrics import binary_distance, edit_distance, jaccard_distance, masi_distance
from jarowinkler import jaro_similarity, jarowinkler_similarity
import re

In [2]:
def normalize_string(s):
    """Return a lowercase, single-spaced, ASCII-alphanumeric form of ``s``.

    Every character that is not an ASCII letter, a digit or whitespace
    (for example ``_`` or ``/``) is turned into a space.  Runs of
    whitespace are then collapsed to one space and the ends are trimmed.
    """
    lowered = s.lower()
    # Punctuation becomes a separator instead of vanishing, so
    # "head/neck" yields two words rather than "headneck".
    spaced = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', spaced).strip()

In [3]:
# Example pair of terms to compare: same concept, different spelling/separators
human_word, mouse_word = "Head_and_Neck_Part", "head/neck"

# Normalize both terms the same way so the metrics below compare like with like
nhw, nmw = map(normalize_string, (human_word, mouse_word))
nhw, nmw

('head and neck part', 'head neck')

In [4]:
# Binary distance: an all-or-nothing comparison of the two normalized strings.
# The result is 0.0 when they are equal and 1.0 when they differ.
exact_match_distance = binary_distance(nhw, nmw)
print(exact_match_distance)

1.0


In [5]:
# Edit (Levenshtein) distance: the number of single-character changes
# needed to turn one normalized string into the other.
levenshtein_steps = edit_distance(nhw, nmw)
print(levenshtein_steps)

9


In [6]:
# Jaccard distance, computed on the *sets of words* of each normalized string
# (not on characters): the share of the combined vocabulary that the two
# strings do not have in common.
human_tokens = set(nhw.split())
mouse_tokens = set(nmw.split())
print(jaccard_distance(human_tokens, mouse_tokens))

0.5


In [7]:
# Jaro similarity between the normalized strings.
# Unlike the distances above this is a similarity: higher means more alike.
jaro_score = jaro_similarity(nhw, nmw)
print(jaro_score)

0.8333333333333334


In [8]:
# Jaro-Winkler similarity: the Jaro score with extra credit for a shared
# prefix (both normalized strings here start with "head").
jaro_winkler_score = jarowinkler_similarity(nhw, nmw)
print(jaro_winkler_score)

0.9


In [9]:
# MASI distance on the word sets of each normalized string: a Jaccard-style
# set distance that also takes into account how the two sets overlap
# (e.g. one being a subset of the other).
human_tokens = set(nhw.split())
mouse_tokens = set(nmw.split())
print(masi_distance(human_tokens, mouse_tokens))

0.665
