### Detect unusual words in text

In [42]:
import nltk
from nltk import word_tokenize


In [2]:
# Sample informal, chat/SMS-style messages used as input to the unusual-word check.
# NOTE: sent1 is a triple-quoted literal, so its line break and the leading
# indentation of its second line are part of the string value.
sent1 = """Just forced myself to eat a slice. I'm really not hungry tho. 
           Mark is getting worried. He knows I'm sick when I turn down pizza. Lol"""
sent2 = "I call you later, don't have nw. If urgnt, sms me."
sent3 = "Watching a telugu movie..wat abt u?"


In [7]:
def find_unusual_words(text):
    """Return the words of ``text`` that are missing from NLTK's ``words`` corpus.

    Args:
        text: iterable of string tokens; tokens that are not purely
            alphabetic are ignored.

    Returns:
        Alphabetically sorted list of distinct lower-cased tokens that do not
        occur (case-insensitively) in ``nltk.corpus.words``. No stemming is
        applied, so the exact token must be present in the corpus to count
        as known.
    """
    # Lower-case both sides so the comparison is case-insensitive.
    candidate_words = {token.lower() for token in text if token.isalpha()}
    known_words = {entry.lower() for entry in nltk.corpus.words.words()}
    return sorted(candidate_words.difference(known_words))


In [8]:

# Split sent1 with wordpunct_tokenize and print the tokens that find_unusual_words reports.
print(find_unusual_words(nltk.wordpunct_tokenize(sent1)))

['knows', 'lol']


In [10]:
# Split sent2 with wordpunct_tokenize and print the tokens that find_unusual_words reports.
print(find_unusual_words(nltk.wordpunct_tokenize(sent2)))

['nw', 'sms', 'urgnt']


In [12]:

# Split sent3 with wordpunct_tokenize and print the tokens that find_unusual_words reports.
print(find_unusual_words(nltk.wordpunct_tokenize(sent3)))

['abt']


#### Detect possible spelling mistakes

In [26]:
# Suggest dictionary words for each unusual token: a vocabulary word is a
# candidate when its edit distance to the token is less than half the
# token's length.
unusual_words_found = ['knows', 'lol', 'nw', 'sms', 'urgnt', 'abt']
from nltk.metrics import edit_distance
# Maps each unusual token to its list of candidate corrections; a token with
# no candidate gets no entry at all.
possible_suggestions = {}
english_vocab_set = set(w.lower() for w in nltk.corpus.words.words())
for unusual_word in unusual_words_found:
    # The threshold depends only on the token, so compute it once per token.
    max_dist = len(unusual_word) / 2
    for word in english_vocab_set:
        # Edit distance can never be smaller than the difference in length,
        # so such words cannot qualify; skip the expensive computation.
        if abs(len(word) - len(unusual_word)) >= max_dist:
            continue
        if edit_distance(unusual_word, word) < max_dist:
            possible_suggestions.setdefault(unusual_word, []).append(word)

# Raises KeyError if no candidate was found for "lol".
print(possible_suggestions["lol"])
                

['tol', 'lola', 'pol', 'loo', 'lob', 'sol', 'lop', 'loa', 'kol', 'loll', 'lot', 'vol', 'lof', 'col', 'log', 'dol', 'gol', 'low', 'loy', 'lox', 'lou', 'lod', 'lo', 'lolo']


#### Detect names of people in the text


In [69]:
def names_in_text(text):
    """Return the tokens of ``text`` that appear in NLTK's ``names`` corpus.

    Args:
        text: iterable of string tokens (for example the output of
            ``word_tokenize``); tokens that are not purely alphabetic are
            ignored.

    Returns:
        List of the distinct tokens found in ``male.txt`` or ``female.txt``,
        in order of first appearance in ``text``. Matching is case-sensitive:
        a token must equal a corpus entry exactly.
    """
    # Build one lookup set up front so each token costs a single hash lookup
    # instead of a scan through both corpus word sequences.
    known_names = set(nltk.corpus.names.words('male.txt'))
    known_names.update(nltk.corpus.names.words('female.txt'))

    names = []
    seen = set()
    for w in text:
        # Skip non-alphabetic tokens and repeats; tracking repeats explicitly
        # keeps the output order deterministic (first occurrence wins).
        if not w.isalpha() or w in seen:
            continue
        seen.add(w)
        if w in known_names:
            names.append(w)
    return names
# Try names_in_text on two sample sentences tokenized with word_tokenize.
# NOTE: this rebinds sent1 and sent2, replacing the messages assigned to
# those names earlier in the file.
sent1 = "John and Mary go to the church every Sunday"
sent2 = "No man has ever seen the dark side of the Moon"
print(names_in_text(word_tokenize(sent1)))
print(names_in_text(word_tokenize(sent2)))



['John', 'Mary']
[]


In [77]:
# Tour of the WordNet interface: synsets, lemma names, hypernyms/hyponyms,
# similarity scores, synonyms/antonyms and morphy lookups.
from nltk.corpus import wordnet as wn
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
# Get all possible meanings (synsets) of the word "dog"
print(wn.synsets("dog"))
# Get all lemma names of the synset 'dog.n.01'
print(dog.lemma_names())
# Get all hypernyms of 'dog.n.01'
print(wn.synset('dog.n.01').hypernyms())
# A hypernym is the more generic term, whereas a hyponym is a more specific term.
# For 'dog.n.01', the hypernyms are 'canine' and 'domestic_animal'.
# Get all hyponyms of 'dog.n.01'
print(wn.synset('dog.n.01').hyponyms())
# Some of the hyponyms are "pug", "puppy", "lapdog", etc.
# Get the path similarity between two synsets - a score based on the shortest
# path that connects them in the taxonomy.

print(cat.path_similarity(dog)) # Returns a value between 0 and 1; the higher the number, the more similar the two synsets
# Wu and Palmer similarity method (described by the string literal below).
""" Produces similarity values based on their Least Common Subsumer (most specific ancestor node) and 
   the maximum depth in the taxonomy"""
# NOTE(review): the return value of this call is neither printed nor assigned.
cat.wup_similarity(dog)
# Collect the lemma names, other than "good" itself, from every synset of
# "good"; duplicates are kept.
synonyms = []
for syn in wn.synsets("good"):
    for word in syn.lemmas():
        if word.name() != "good":
            synonyms.append(word.name())
print(synonyms)
# Collect antonyms via the same lemmas: for each lemma other than "good" that
# has antonyms, keep the name of its first antonym only.
# NOTE(review): antonyms is not printed anywhere in this cell.
antonyms = []
for syn in wn.synsets("good"):
    for word in syn.lemmas():
        if word.name() != "good" and word.antonyms():
            antonyms.append( word.antonyms()[0].name())
            
# morphy looks up the base form of a word; per the NLTK docs the optional
# second argument restricts the lookup to one part of speech (verbs here).
print(wn.morphy("working",wn.VERB))
print(wn.morphy("denied",wn.VERB))
print(wn.morphy("abaci")) 



[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
['dog', 'domestic_dog', 'Canis_familiaris']
[Synset('canine.n.02'), Synset('domestic_animal.n.01')]
[Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), Synset('pug.n.01'), Synset('puppy.n.01'), Synset('spitz.n.01'), Synset('toy_dog.n.01'), Synset('working_dog.n.01')]
0.2
['goodness', 'goodness', 'commodity', 'trade_good', 'full', 'estimable', 'honorable', 'respectable', 'beneficial', 'just', 'upright', 'adept', 'expert', 'practiced', 'proficient', 'skillful', 'skilful', 'dear', 'near', 'dependable', 'safe', 'secure', 'right', 'ripe', 'well', '