In [1]:
!python --version
__author__ = "nirant.bits@gmail.com"

Python 3.6.6 :: Anaconda, Inc.


# Spell Correction

In [2]:
import sys
# !{sys.executable} -m pip install fuzzywuzzy
# alternative for 4-10x faster computation: 
!{sys.executable} -m pip install fuzzywuzzy[speedup]

Collecting fuzzywuzzy[speedup]
  Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.17.0


In [3]:
from fuzzywuzzy import fuzz

In [4]:
fuzz.ratio("Electronic City Phase One", "Electronic City Phase One, Bangalore")

82

In [5]:
fuzz.partial_ratio("Electronic City Phase One", "Electronic City Phase One, Bangalore")

100

In [6]:
fuzz.ratio('Narendra Modi', 'Narendra D. Modi')

90

In [7]:
fuzz.partial_ratio('Narendra Modi', 'Narendra D. Modi')

77

In [8]:
fuzz.token_sort_ratio('Narendra Modi', 'Narendra D. Modi')

93

In [9]:
fuzz.token_set_ratio('Narendra Modi', 'Narendra D. Modi')

100

In [10]:
from fuzzywuzzy import process

In [11]:
query = 'Gujrat'
choices = ['Gujarat', 'Gujjar', 'Gujarat Govt.']
# Get a list of matches ordered by score, default limit to 5
print(process.extract(query, choices))

# If we want only the top one
process.extractOne(query, choices)

[('Gujarat', 92), ('Gujarat Govt.', 75), ('Gujjar', 67)]


('Gujarat', 92)

In [12]:
query = 'Banglore'
choices = ['Bangalore', 'Bengaluru']
print(process.extract(query, choices))
process.extractOne(query, choices)

[('Bangalore', 94), ('Bengaluru', 59)]


('Bangalore', 94)

In [13]:
# Let's take an example of a common search typo in online shopping:
query = 'chili'
choices = ['chilli', 'chilled', 'chilling']
print(process.extract(query, choices))
process.extractOne(query, choices)

[('chilli', 91), ('chilling', 77), ('chilled', 67)]


('chilli', 91)

## Jellyfish

In [8]:
import sys
!{sys.executable} -m pip install jellyfish

Collecting jellyfish
  Downloading https://files.pythonhosted.org/packages/61/3f/60ac86fb43dfbf976768e80674b5538e535f6eca5aa7806cf2fdfd63550f/jellyfish-0.6.1.tar.gz (132kB)
Building wheels for collected packages: jellyfish
  Running setup.py bdist_wheel for jellyfish: started
  Running setup.py bdist_wheel for jellyfish: finished with status 'done'
  Stored in directory: C:\Users\nirantk\AppData\Local\pip\Cache\wheels\9c\6f\33\92bb9a4b4562a60ba6a80cedbab8907e48bc7a8b1f369ea0ae
Successfully built jellyfish
Installing collected packages: jellyfish
Successfully installed jellyfish-0.6.1


In [9]:
import jellyfish
# Each example is a (reference spelling, variant spelling) pair that the
# distance helpers below unpack and compare.
correct_example = ('Narendra Modi', 'Narendra Modi')  # identical strings
damodardas_example = ('Narendra Modi', 'Narendra D. Modi')  # extra middle initial "D. " inserted
modi_typo_example = ('Narendra Modi', 'Narendar Modi')  # adjacent letters swapped: "ra" -> "ar"
gujarat_typo_example = ('Gujarat', 'Gujrat')  # one letter ("a") dropped

# Shared list: the helper functions defined later use it as their default `examples` argument.
examples = [correct_example, damodardas_example, modi_typo_example, gujarat_typo_example]

In [10]:
def calculate_distance(function, examples=examples):
    """Print each example next to the score `function` gives it.

    Args:
        function: callable invoked with the elements of one example
            unpacked as positional arguments.
        examples: iterable of example tuples; defaults to the module-level
            `examples` list as bound when this function is defined.

    Returns:
        None. One line per example is written to stdout.
    """
    for pair in examples:
        score = function(*pair)
        print(f'{pair}: {score}')

In [11]:
calculate_distance(jellyfish.levenshtein_distance)

('Narendra Modi', 'Narendra Modi'): 0
('Narendra Modi', 'Narendra D. Modi'): 3
('Narendra Modi', 'Narendar Modi'): 2
('Gujarat', 'Gujrat'): 1


In [12]:
calculate_distance(jellyfish.damerau_levenshtein_distance)

('Narendra Modi', 'Narendra Modi'): 0
('Narendra Modi', 'Narendra D. Modi'): 3
('Narendra Modi', 'Narendar Modi'): 1
('Gujarat', 'Gujrat'): 1


In [13]:
calculate_distance(jellyfish.hamming_distance)

('Narendra Modi', 'Narendra Modi'): 0
('Narendra Modi', 'Narendra D. Modi'): 7
('Narendra Modi', 'Narendar Modi'): 2
('Gujarat', 'Gujrat'): 4


In [14]:
# Despite the helper's name, the value printed here behaves as a similarity:
# the identical pair scores 1.0 and every other pair scores below it.
calculate_distance(jellyfish.jaro_distance)

('Narendra Modi', 'Narendra Modi'): 1.0
('Narendra Modi', 'Narendra D. Modi'): 0.9375
('Narendra Modi', 'Narendar Modi'): 0.9743589743589745
('Gujarat', 'Gujrat'): 0.8968253968253969


In [21]:
# Despite the helper's name, the value printed here behaves as a similarity:
# the identical pair scores 1.0 and every other pair scores below it.
calculate_distance(jellyfish.jaro_winkler)

('Narendra Modi', 'Narendra Modi'): 1.0
('Narendra Modi', 'Narendra D. Modi'): 0.9625
('Narendra Modi', 'Narendar Modi'): 0.9846153846153847
('Gujarat', 'Gujrat'): 0.9277777777777778


### Phonetic Word Similarity

#### What is a phonetic encoding?

In [15]:
jellyfish.soundex('Jellyfish')

'J412'

In [16]:
jellyfish.nysiis('Jellyfish')

'JALYF'

In [17]:
jellyfish.metaphone('Jellyfish')

'JLFX'

In [18]:
jellyfish.match_rating_codex('Jellyfish')

'JLLFSH'

#### Metaphone + Levenshtein

In [20]:
jellyfish.levenshtein_distance(jellyfish.metaphone('write'), jellyfish.metaphone('right'))

0

In [21]:
# adding a few examples to show how cool this is
# The shared `examples` list is extended in place, so functions that captured it
# as a default argument see the additions. The membership check keeps this cell
# idempotent: re-running it does not append duplicate rows.
for extra_pair in [('write', 'right'), ('Mangalore', 'Bangalore'), ('Delhi', 'Dilli')]:
    if extra_pair not in examples:
        examples.append(extra_pair)

In [28]:
def calculate_phonetic_distance(phonetic_func, distance_func, examples=examples):
    """Print a table comparing the phonetic encodings of each example pair.

    Args:
        phonetic_func: callable mapping a string to its phonetic encoding.
        distance_func: callable taking the two encodings and returning the
            value shown in the last column.
        examples: iterable of pairs, indexed as (first word, second word);
            defaults to the module-level `examples` list as bound when this
            function is defined.

    Returns:
        None. A header line and one row per example are written to stdout.
    """
    print("Word\t\tSound\t\tWord\t\t\tSound\t\tPhonetic Distance")
    for pair in examples:
        word, variant = pair[0], pair[1]
        word_code = phonetic_func(word)
        variant_code = phonetic_func(variant)
        gap = distance_func(word_code, variant_code)
        # Left-align every field to a fixed width, then join the columns with tabs.
        columns = (
            f'{word:<10}',
            f'{word_code:<10}',
            f'{variant:<20}',
            f'{variant_code:<10}',
            f'{gap:<10}',
        )
        print('\t'.join(columns))
        
calculate_phonetic_distance(phonetic_func=jellyfish.metaphone, distance_func=jellyfish.levenshtein_distance)        

Word		Sound		Word			Sound		Phonetic Distance
Narendra Modi	NRNTR MT  	Narendra Modi       	NRNTR MT  	0         
Narendra Modi	NRNTR MT  	Narendra D. Modi    	NRNTR T MT	2         
Narendra Modi	NRNTR MT  	Narendar Modi       	NRNTR MT  	0         
Gujarat   	KJRT      	Gujrat              	KJRT      	0         
write     	RT        	right               	RT        	0         
Mangalore 	MNKLR     	Bangalore           	BNKLR     	1         
Delhi     	TLH       	Dilli               	TL        	1         


#### American Soundex

In [29]:
calculate_phonetic_distance(phonetic_func=jellyfish.soundex, distance_func=jellyfish.levenshtein_distance)        

Word		Sound		Word			Sound		Phonetic Distance
Narendra Modi	N653      	Narendra Modi       	N653      	0         
Narendra Modi	N653      	Narendra D. Modi    	N653      	0         
Narendra Modi	N653      	Narendar Modi       	N653      	0         
Gujarat   	G263      	Gujrat              	G263      	0         
write     	W630      	right               	R230      	2         
Mangalore 	M524      	Bangalore           	B524      	1         
Delhi     	D400      	Dilli               	D400      	0         


## Updating the Original Corpus with FlashText

In [22]:
import sys
!{sys.executable} -m pip install flashtext

Collecting flashtext
  Downloading https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz
Building wheels for collected packages: flashtext
  Running setup.py bdist_wheel for flashtext: started
  Running setup.py bdist_wheel for flashtext: finished with status 'done'
  Stored in directory: C:\Users\nirantk\AppData\Local\pip\Cache\wheels\37\db\d7\fe74f7cb8e5c3afed90fe6f4967c933a6f13d81ab6b3d3128c
Successfully built flashtext
Installing collected packages: flashtext
Successfully installed flashtext-2.7


In [31]:
from flashtext.keyword import KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Delhi', 'NCR') # two separate arguments (not a tuple): the term to find, then the name reported for it
keyword_processor.add_keyword('Bombay', 'Mumbai')
# extract_keywords reports the mapped names of the terms found in the sentence
keywords_found = keyword_processor.extract_keywords('I love the food in Delhi and the people in Bombay')
keywords_found
# ['NCR', 'Mumbai']

['NCR', 'Mumbai']

How about we replace them now?

In [32]:
from flashtext.keyword import KeywordProcessor
# Fresh processor with the same term -> replacement pairs as the extraction example
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Delhi', 'NCR')
keyword_processor.add_keyword('Bombay', 'Mumbai')
# replace_keywords returns the sentence with each registered term swapped for its mapped name
replaced_sentence = keyword_processor.replace_keywords('I love the food in Delhi and the people in Bombay')
replaced_sentence
# 'I love the food in NCR and the people in Mumbai'

'I love the food in NCR and the people in Mumbai'