fuzzywuzzy
=======

fuzzywuzzy is a fuzzy string matching library, excellent for grouping related elements out of piles of things.


### Let's generate some strings

In [1]:
from pwgen import pwgen

# Generate the base names for the demo. The captured output shows five
# random strings of ten letters each (mixed case, no digits), so the
# positional arguments are presumably (length, count) -- confirm against
# the pwgen documentation.
# NOTE(review): no seed is set, so every run produces different strings.
strs = pwgen(10, 5, capitalize=False, no_numerals=True)
print("Strings:", strs)

Strings: ['TNsiqHcCfs', 'hxBFrIoOlD', 'aUUuLDxmEV', 'pOIAJKMuWE', 'VVBwTXpzFM']


#### Turn them into two lists of similar strings (say, translation files with an implicit mapping) and shuffle them for good measure

In [2]:
from random import shuffle
# Derive one "French" and one "English" file name from every base string.
# The shared base name is the implicit mapping between the two lists.
fr = [base + "_fr.pdf" for base in strs]
en = [base + "_en.pdf" for base in strs]

# Shuffle each list independently so that position no longer reveals
# which French file belongs to which English file.
shuffle(fr)
shuffle(en)

print('English:', en)
print('French:', fr)

English: ['TNsiqHcCfs_en.pdf', 'VVBwTXpzFM_en.pdf', 'hxBFrIoOlD_en.pdf', 'aUUuLDxmEV_en.pdf', 'pOIAJKMuWE_en.pdf']
French: ['TNsiqHcCfs_fr.pdf', 'aUUuLDxmEV_fr.pdf', 'pOIAJKMuWE_fr.pdf', 'VVBwTXpzFM_fr.pdf', 'hxBFrIoOlD_fr.pdf']


#### We could do string manipulation and string comparison to extract the right parts, but that's a lot of work; let fuzziness do the job for us

In [3]:
from fuzzywuzzy import fuzz
# Compare the lists position by position. Because both lists were
# shuffled independently, most pairs are unrelated and only a pair that
# happens to line up shares its base name.
for f, e in zip(fr, en):
    score = fuzz.ratio(f, e)
    print("%s, %s ratio %d" % (f, e, score))

TNsiqHcCfs_fr.pdf, TNsiqHcCfs_en.pdf ratio 88
aUUuLDxmEV_fr.pdf, VVBwTXpzFM_en.pdf ratio 35
pOIAJKMuWE_fr.pdf, hxBFrIoOlD_en.pdf ratio 35
VVBwTXpzFM_fr.pdf, aUUuLDxmEV_en.pdf ratio 35
hxBFrIoOlD_fr.pdf, pOIAJKMuWE_en.pdf ratio 35




#### Higher ratios give us meaningful comparisons; lower ones, not really.

Let's try a routine that picks the item with the highest ratio.

In [47]:
def get_matching(cmp, in_list):
    """Find the entry of ``in_list`` that is most similar to ``cmp``.

    Every candidate is scored with ``fuzz.ratio(cmp, candidate)`` and the
    highest-scoring one is kept.

    Args:
        cmp: The reference string to compare against.
        in_list: Iterable of candidate strings.

    Returns:
        A ``(item, ratio)`` tuple for the best candidate. If ``in_list``
        is empty, or no candidate scores above 0, the result is
        ``(None, 0)``.
    """
    best = (None, 0)
    for candidate in in_list:
        score = fuzz.ratio(cmp, candidate)
        # Strict comparison: on a tie the earliest candidate is kept, and
        # a score of 0 never replaces the (None, 0) default.
        if score > best[1]:
            best = (candidate, score)
    return best

# For every English file name, report the French file name that
# get_matching scores highest, together with that score.
for en_term in en:
    matching = get_matching(en_term, fr)
    best_fr, best_ratio = matching
    print("%s %s matching ratio %d" % (en_term, best_fr, best_ratio))

dCJTyXiEQA_en.pdf dCJTyXiEQA_fr.pdf matching ratio 88
sjSPylPhsn_en.pdf sjSPylPhsn_fr.pdf matching ratio 88
EbMLTThIEj_en.pdf EbMLTThIEj_fr.pdf matching ratio 88
BErDwPNGmr_en.pdf BErDwPNGmr_fr.pdf matching ratio 88
gmwbXDucaZ_en.pdf gmwbXDucaZ_fr.pdf matching ratio 88


### sorting based on fuzziness ratio

In [48]:
# Use the first English file name as the reference term.
term = en[0]
print("sort term:", term)


def fuzziness_cmp(x):
    """Return the fuzz.ratio score of ``x`` against the reference ``term``."""
    return fuzz.ratio(term, x)


# Order the French names from most to least similar to the reference,
# then show each name next to its score.
ranked_fr = sorted(fr, key=fuzziness_cmp, reverse=True)
print("Sorted by funzziness:", [(k, fuzziness_cmp(k)) for k in ranked_fr])

sort term: dCJTyXiEQA_en.pdf
Sorted by funzziness: [('dCJTyXiEQA_fr.pdf', 88), ('EbMLTThIEj_fr.pdf', 41), ('BErDwPNGmr_fr.pdf', 35), ('gmwbXDucaZ_fr.pdf', 35), ('sjSPylPhsn_fr.pdf', 35)]


#### Let's add some fuzziness to the input data and see how that goes

`random_modify` will randomly replace up to 2 of the first 10 characters with random lower-case letters

In [49]:
from random import choice, randrange
import string

def random_modify(in_str, prefix_len=10):
    """Return a copy of ``in_str`` with a few leading characters randomized.

    Between 0 and 2 positions are drawn at random from the first
    ``prefix_len`` characters, and each drawn position is overwritten with
    a random lower-case ASCII letter. The same position may be drawn
    twice, and a replacement may equal the character it replaces, so the
    number of visible changes can be smaller than the number of draws.

    The string is edited per character rather than per UTF-8 byte, so
    non-ASCII input cannot be corrupted. The editable span is clamped to
    the length of the string, so inputs shorter than ``prefix_len``
    (including the empty string) are handled without an IndexError.

    Args:
        in_str: The string to perturb.
        prefix_len: Number of leading characters eligible for replacement.
            Defaults to 10, the length of the base names in this notebook.

    Returns:
        A new string of the same length as ``in_str``.
    """
    chars = list(in_str)
    span = min(prefix_len, len(chars))
    if span <= 0:
        # Nothing is eligible for replacement.
        return in_str

    # Draw the number of edits first, then the positions, then the
    # replacement letters, so the order of random calls stays fixed.
    positions = [randrange(0, span) for _ in range(randrange(3))]
    for position in positions:
        chars[position] = choice(string.ascii_lowercase)
    return "".join(chars)

# Show the English names, perturb each one, and show them again.
# Note: `en` is rebound to the perturbed list, so re-running this cell
# perturbs the already-perturbed names once more.
print("before & after")
print(en)
en = list(map(random_modify, en))
print(en)

before & after
['dCJTyXiEQA_en.pdf', 'sjSPylPhsn_en.pdf', 'EbMLTThIEj_en.pdf', 'BErDwPNGmr_en.pdf', 'gmwbXDucaZ_en.pdf']
['dCJqyXiEQA_en.pdf', 'sjSPylPvsn_en.pdf', 'gkMLTThIEj_en.pdf', 'dErDwPNkmr_en.pdf', 'gmwbXDvcaZ_en.pdf']


### Let's compare again

In [50]:
# Repeat the best-match report, now with the perturbed English names
# compared against the French names.
for en_term in en:
    matching = get_matching(en_term, fr)
    best_fr, best_ratio = matching
    print("%s %s matching ratio %d" % (en_term, best_fr, best_ratio))

dCJqyXiEQA_en.pdf dCJTyXiEQA_fr.pdf matching ratio 82
sjSPylPvsn_en.pdf sjSPylPhsn_fr.pdf matching ratio 82
gkMLTThIEj_en.pdf EbMLTThIEj_fr.pdf matching ratio 76
dErDwPNkmr_en.pdf BErDwPNGmr_fr.pdf matching ratio 76
gmwbXDvcaZ_en.pdf gmwbXDucaZ_fr.pdf matching ratio 82


### Now let's do the same thing to the French names and compare fuzziness

In [51]:
# Show the French names, perturb each one, and show them again.
# Note: `fr` is rebound to the perturbed list, so re-running this cell
# perturbs the already-perturbed names once more.
print("before & after")
print(fr)
fr = list(map(random_modify, fr))
print(fr)

before & after
['EbMLTThIEj_fr.pdf', 'BErDwPNGmr_fr.pdf', 'dCJTyXiEQA_fr.pdf', 'gmwbXDucaZ_fr.pdf', 'sjSPylPhsn_fr.pdf']
['EbMLTThIEj_fr.pdf', 'mErDwPNGmr_fr.pdf', 'dCJTfXiEQA_fr.pdf', 'gmwmXDujaZ_fr.pdf', 'siSPylPwsn_fr.pdf']


### And compare again

In [52]:
# Repeat the best-match report once more, using the current contents of
# both `en` and `fr`.
for en_term in en:
    matching = get_matching(en_term, fr)
    best_fr, best_ratio = matching
    print("%s %s matching ratio %d" % (en_term, best_fr, best_ratio))

dCJqyXiEQA_en.pdf dCJTfXiEQA_fr.pdf matching ratio 76
sjSPylPvsn_en.pdf siSPylPwsn_fr.pdf matching ratio 76
gkMLTThIEj_en.pdf EbMLTThIEj_fr.pdf matching ratio 76
dErDwPNkmr_en.pdf mErDwPNGmr_fr.pdf matching ratio 76
gmwbXDvcaZ_en.pdf gmwmXDujaZ_fr.pdf matching ratio 71
