In [1]:
!python --version
__author__ = "bsangramsing@gmail.com"

Python 3.7.3


# Spell Correction

In [2]:
import sys
# !{sys.executable} -m pip install fuzzywuzzy
# alternative for 4-10x faster computation: 
!{sys.executable} -m pip install fuzzywuzzy[speedup]



In [3]:
from fuzzywuzzy import fuzz

In [4]:
fuzz.ratio("Electronic City Phase One", "Electronic City Phase One, Aurangabad")

81

In [5]:
fuzz.partial_ratio("Electronic City Phase One", "Electronic City Phase One, Aurangabad")

100

In [6]:
fuzz.ratio('Narendra Modi', 'Narendra D. Modi')

90

In [7]:
fuzz.partial_ratio('Narendra Modi', 'Narendra D. Modi')

77

In [8]:
fuzz.token_sort_ratio('Narendra Modi', 'Narendra D. Modi')

93

In [9]:
fuzz.token_set_ratio('Narendra Modi', 'Narendra D. Modi')

100

In [10]:
from fuzzywuzzy import process

In [11]:
query = 'Maharashtra'
choices = ['Maharashtra', 'Marathi', 'Maharashtra Govt.']
# Get a list of matches ordered by score, default limit to 5
print(process.extract(query, choices))

# If we want only the top one
process.extractOne(query, choices)

[('Maharashtra', 100), ('Maharashtra Govt.', 95), ('Marathi', 56)]


('Maharashtra', 100)

In [12]:
# Exact-match query against a list that contains the same string twice.
query = 'Aurangabad'
# NOTE(review): both choices are identical; presumably this is meant to show
# that duplicate choices are not collapsed (the recorded output lists both
# entries with score 100). Confirm that a second, different spelling was not
# intended here.
choices = ['Aurangabad', 'Aurangabad']
# Ranked list of (choice, score) pairs.
print(process.extract(query, choices))
# Single best (choice, score) pair.
process.extractOne(query, choices)

[('Aurangabad', 100), ('Aurangabad', 100)]


('Aurangabad', 100)

In [13]:
# Let's take an example of a common search typo in online shopping:
query = 'chili'
choices = ['chilli', 'chilled', 'chilling']
print(process.extract(query, choices))
process.extractOne(query, choices)

[('chilli', 91), ('chilling', 77), ('chilled', 67)]


('chilli', 91)

## Jellyfish

In [14]:
import sys
!{sys.executable} -m pip install jellyfish



In [26]:
import jellyfish
# Each example is a pair of strings that the distance functions below compare.
correct_example = 'Narendra Modi', 'Narendra Modi'            # identical strings
damodardas_example = 'Narendra Modi', 'Narendra D. Modi'      # extra middle initial
modi_typo_example = 'Narendra Modi', 'Narendar Modi'          # two letters swapped
maharashtra_typo_example = 'Maharashtra', 'Marathi'           # two different words

# Shared list of pairs used as the default input of the helper functions.
examples = [
    correct_example,
    damodardas_example,
    modi_typo_example,
    maharashtra_typo_example,
]

In [27]:
def calculate_distance(function, examples=examples):
    """Print the score that `function` assigns to every entry of `examples`.

    Args:
        function: Callable invoked as ``function(*pair)`` for each entry.
        examples: Iterable of argument tuples. Defaults to the notebook-level
            ``examples`` list as bound when this cell is executed.

    Returns:
        None. One ``pair: score`` line is printed per entry.
    """
    for pair in examples:
        score = function(*pair)
        print(f'{pair}: {score}')

In [28]:
calculate_distance(jellyfish.levenshtein_distance)

('Narendra Modi', 'Narendra Modi'): 0
('Narendra Modi', 'Narendra D. Modi'): 3
('Narendra Modi', 'Narendar Modi'): 2
('Maharashtra', 'Marathi'): 6


In [22]:
calculate_distance(jellyfish.damerau_levenshtein_distance)

('Narendra Modi', 'Narendra Modi'): 0
('Narendra Modi', 'Narendra D. Modi'): 3
('Narendra Modi', 'Narendar Modi'): 1
('Maharashtra', 'Marathi'): 6


In [23]:
calculate_distance(jellyfish.hamming_distance)

('Narendra Modi', 'Narendra Modi'): 0
('Narendra Modi', 'Narendra D. Modi'): 7
('Narendra Modi', 'Narendar Modi'): 2
('Maharashtra', 'Marathi'): 8


In [24]:
# jellyfish renamed `jaro_distance` to `jaro_similarity` (the value is a
# similarity: the identical pair scores 1.0) and current releases no longer
# provide the old name. Prefer the new name; fall back to the old one so the
# cell still runs on older installs.
calculate_distance(getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance)

('Narendra Modi', 'Narendra Modi'): 1.0
('Narendra Modi', 'Narendra D. Modi'): 0.9375
('Narendra Modi', 'Narendar Modi'): 0.9743589743589745
('Maharashtra', 'Marathi'): 0.7453102453102454


In [25]:
# jellyfish renamed `jaro_winkler` to `jaro_winkler_similarity` and current
# releases no longer provide the old name. Prefer the new name; fall back to
# the old one so the cell still runs on older installs.
calculate_distance(getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler)

('Narendra Modi', 'Narendra Modi'): 1.0
('Narendra Modi', 'Narendra D. Modi'): 0.9625
('Narendra Modi', 'Narendar Modi'): 0.9846153846153847
('Maharashtra', 'Marathi'): 0.7962481962481963


### Phonetic Word Similarity

#### What is a phonetic encoding?

In [29]:
jellyfish.soundex('Jellyfish')

'J412'

In [30]:
jellyfish.nysiis('Jellyfish')

'JALYF'

In [31]:
jellyfish.metaphone('Jellyfish')

'JLFX'

In [32]:
jellyfish.match_rating_codex('Jellyfish')

'JLLFSH'

#### Metaphone + Levenshtein

In [33]:
jellyfish.levenshtein_distance(jellyfish.metaphone('write'), jellyfish.metaphone('right'))

0

In [34]:
# Adding a few examples to show how cool this is.
# The list is extended in place so helpers that captured `examples` as a
# default argument see the new pairs. The membership check keeps the cell
# idempotent: re-running it does not append the same pairs again.
examples.extend(
    pair
    for pair in [('write', 'right'), ('Mangalore', 'Bangalore'), ('Delhi', 'Dilli')]
    if pair not in examples
)

In [35]:
def calculate_phonetic_distance(phonetic_func, distance_func, examples=examples):
    """Print a table comparing the phonetic codes of each pair in `examples`.

    Args:
        phonetic_func: Callable mapping a string to its phonetic code.
        distance_func: Callable taking the two codes and returning the value
            shown in the last column.
        examples: Sequence of pairs; element 0 and element 1 of each entry are
            encoded and compared. Defaults to the notebook-level ``examples``
            list as bound when this cell is executed.

    Returns:
        None. A header line and one row per entry are printed.
    """
    print("Word\t\tSound\t\tWord\t\t\tSound\t\tPhonetic Distance")
    for pair in examples:
        left = pair[0]
        right = pair[1]
        # Encode both sides first, then measure how far apart the codes are.
        left_code = phonetic_func(left)
        right_code = phonetic_func(right)
        gap = distance_func(left_code, right_code)
        print(f'{left:<10}\t{left_code:<10}\t{right:<20}\t{right_code:<10}\t{gap:<10}')
        
calculate_phonetic_distance(phonetic_func=jellyfish.metaphone, distance_func=jellyfish.levenshtein_distance)        

Word		Sound		Word			Sound		Phonetic Distance
Narendra Modi	NRNTR MT  	Narendra Modi       	NRNTR MT  	0         
Narendra Modi	NRNTR MT  	Narendra D. Modi    	NRNTR T MT	2         
Narendra Modi	NRNTR MT  	Narendar Modi       	NRNTR MT  	0         
Maharashtra	MHRXTR    	Marathi             	MR0       	4         
write     	RT        	right               	RT        	0         
Mangalore 	MNKLR     	Bangalore           	BNKLR     	1         
Delhi     	TLH       	Dilli               	TL        	1         


#### American Soundex

In [36]:
calculate_phonetic_distance(phonetic_func=jellyfish.soundex, distance_func=jellyfish.levenshtein_distance)        

Word		Sound		Word			Sound		Phonetic Distance
Narendra Modi	N653      	Narendra Modi       	N653      	0         
Narendra Modi	N653      	Narendra D. Modi    	N653      	0         
Narendra Modi	N653      	Narendar Modi       	N653      	0         
Maharashtra	M623      	Marathi             	M630      	2         
write     	W630      	right               	R230      	2         
Mangalore 	M524      	Bangalore           	B524      	1         
Delhi     	D400      	Dilli               	D400      	0         


## Updating the Original Corpus with FlashText

In [37]:
import sys
!{sys.executable} -m pip install flashtext

Collecting flashtext
  Downloading https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/sangram/Library/Caches/pip/wheels/37/db/d7/fe74f7cb8e5c3afed90fe6f4967c933a6f13d81ab6b3d3128c
Successfully built flashtext
Installing collected packages: flashtext
Successfully installed flashtext-2.7


In [38]:
from flashtext.keyword import KeywordProcessor
keyword_processor = KeywordProcessor()
# add_keyword is called with two separate string arguments (not a tuple):
# the term to look for in the text, then the name reported when it is found.
keyword_processor.add_keyword('Delhi', 'NCR')
keyword_processor.add_keyword('Bombay', 'Mumbai')
# The result contains the second argument of each matching add_keyword call,
# not the text that actually appeared in the sentence.
keywords_found = keyword_processor.extract_keywords('I love the food in Delhi and the people in Bombay')
keywords_found
# ['NCR', 'Mumbai']

['NCR', 'Mumbai']

How about we replace them now?

In [39]:
from flashtext.keyword import KeywordProcessor
# Fresh processor with the same two mappings, so this cell runs on its own.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Delhi', 'NCR')
keyword_processor.add_keyword('Bombay', 'Mumbai')
# replace_keywords returns the sentence with each registered term swapped for
# the name it was registered with; the rest of the text is unchanged in the
# recorded output.
replaced_sentence = keyword_processor.replace_keywords('I love the food in Delhi and the people in Bombay')
replaced_sentence
# 'I love the food in NCR and the people in Mumbai'

'I love the food in NCR and the people in Mumbai'