# Spell Correction

##### Non-word spelling mistake correction

In [1]:
from autocorrect import spell

In [2]:
# Feed a few misspelled non-words to autocorrect and print the suggestion for each.
for misspelled in ('caaaar', 'mussage', 'survice', 'hte'):
    print(spell(misspelled))

caesar
message
service
the


In [3]:
import re

# Squeeze every run of consecutive 'a' characters down to a single 'a',
# then show the squeezed word and what the spell checker makes of it.
collapsed = re.sub('a+', 'a', 'caaar')
print(collapsed)
print(spell(collapsed))

car
car


# Sentence Tokenizer

In [4]:
import nltk

# Split a two-sentence paragraph into individual sentences.
word_data = (
    "Tokenization is a way to split text into tokens. "
    "These tokens could be paragraphs, sentences, or individual words."
)
nltk_tokens = nltk.sent_tokenize(word_data)
display(nltk_tokens)

['Tokenization is a way to split text into tokens.',
 'These tokens could be paragraphs, sentences, or individual words.']

# Word Tokenizer (Unigrams)

In [5]:
import nltk

# Unigrams are just the individual word tokens of the sentence.
word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.tokenize.word_tokenize(word_data)
display(nltk_tokens)

['The',
 'best',
 'performance',
 'can',
 'bring',
 'in',
 'sky',
 'high',
 'success',
 '.']

# Bigrams

In [6]:
import nltk

# Tokenize the sentence, then pair every token with its successor.
word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.word_tokenize(word_data)
bigram_pairs = list(nltk.bigrams(nltk_tokens))
display(bigram_pairs)

[('The', 'best'),
 ('best', 'performance'),
 ('performance', 'can'),
 ('can', 'bring'),
 ('bring', 'in'),
 ('in', 'sky'),
 ('sky', 'high'),
 ('high', 'success'),
 ('success', '.')]

# Trigrams

In [7]:
import nltk

# Tokenize the sentence, then slide a window of three tokens across it.
word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.word_tokenize(word_data)
trigram_triples = list(nltk.trigrams(nltk_tokens))
display(trigram_triples)

[('The', 'best', 'performance'),
 ('best', 'performance', 'can'),
 ('performance', 'can', 'bring'),
 ('can', 'bring', 'in'),
 ('bring', 'in', 'sky'),
 ('in', 'sky', 'high'),
 ('sky', 'high', 'success'),
 ('high', 'success', '.')]

# NGrams

In [8]:
import nltk

word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.word_tokenize(word_data)

# nltk.ngrams generalizes bigrams/trigrams to any window size; show n=4 and n=5.
for heading, window in (('Four Grams', 4), ('\nFive Grams', 5)):
    print(heading)
    display(list(nltk.ngrams(nltk_tokens, window)))

Four Grams


[('The', 'best', 'performance', 'can'),
 ('best', 'performance', 'can', 'bring'),
 ('performance', 'can', 'bring', 'in'),
 ('can', 'bring', 'in', 'sky'),
 ('bring', 'in', 'sky', 'high'),
 ('in', 'sky', 'high', 'success'),
 ('sky', 'high', 'success', '.')]


Five Grams


[('The', 'best', 'performance', 'can', 'bring'),
 ('best', 'performance', 'can', 'bring', 'in'),
 ('performance', 'can', 'bring', 'in', 'sky'),
 ('can', 'bring', 'in', 'sky', 'high'),
 ('bring', 'in', 'sky', 'high', 'success'),
 ('in', 'sky', 'high', 'success', '.')]

# POS Tagging


Part-of-speech (POS) tagging is the process of marking up each word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context—i.e., its relationship with adjacent and related words in a phrase, sentence, or paragraph.

Use `nltk.help.upenn_tagset()` to see the possible tags.

* IN  : preposition or conjunction, subordinating
* JJ  : adjective or numeral, ordinal
* JJR : adjective, comparative
* JJS : adjective, superlative
* NN  : noun, common, singular or mass
* NNP : noun, proper, singular
* NNPS: noun, proper, plural
* NNS : noun, common, plural
* RB  : adverb
* RBR : adverb, comparative
* RBS : adverb, superlative
* VB  : verb, base form
* VBD : verb, past tense

In [9]:
import nltk

# Tokenize a short sentence, then label each token with its part-of-speech tag.
name_tokens = nltk.word_tokenize('My name is Krishna')
nltk.pos_tag(name_tokens)

[('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Krishna', 'NNP')]

In [10]:
# POS-tag a longer sentence and display the (token, tag) pairs.
s = (
    'Natural language processing is the ability of a computer program '
    'to understand human language as it is spoken.'
)
pos = nltk.pos_tag(nltk.word_tokenize(s))
display(pos)


[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('ability', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('computer', 'NN'),
 ('program', 'NN'),
 ('to', 'TO'),
 ('understand', 'VB'),
 ('human', 'JJ'),
 ('language', 'NN'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('spoken', 'VBN'),
 ('.', '.')]

# NER Tagging

In [11]:
# Named-entity recognition runs on POS-tagged tokens, so tokenize and tag first.
sentence = 'Tommorrow I will be at Mumbai.'
tokenized = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokenized)
display(tagged)

# ne_chunk returns a tree; print the whole tree followed by each entity subtree.
namedEnt = nltk.ne_chunk(tagged)
print(*namedEnt.subtrees(), sep='\n')

# Render the chunk tree graphically.
namedEnt.draw()

[('Tommorrow', 'NN'),
 ('I', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('at', 'IN'),
 ('Mumbai', 'NNP'),
 ('.', '.')]

(S
  Tommorrow/NN
  I/PRP
  will/MD
  be/VB
  at/IN
  (ORGANIZATION Mumbai/NNP)
  ./.)
(ORGANIZATION Mumbai/NNP)


# Stemming

In [12]:
from nltk.stem import PorterStemmer

# Reduce an inflected word to its stem with the Porter algorithm.
stemmer = PorterStemmer()
painting_stem = stemmer.stem('painting')
print(painting_stem)

paint


In [13]:
from nltk.stem import PorterStemmer

# A stem is not guaranteed to be a dictionary word, as this example shows.
stemmer = PorterStemmer()
increases_stem = stemmer.stem('increases')
print(increases_stem)

increas


# Lemmatization

In [14]:
from nltk.stem import WordNetLemmatizer

# Lemmatization maps a word to its WordNet base form.
lemmatizer = WordNetLemmatizer()
increases_lemma = lemmatizer.lemmatize('increases')
print(increases_lemma)

increase


In [15]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Without a pos argument the word is treated as a noun (the default);
# passing pos="v" asks for the verb lemma instead.
as_noun = lemmatizer.lemmatize('playing')
as_verb = lemmatizer.lemmatize('playing', pos="v")
print(as_noun)
print(as_verb)

playing
play


# Stop word removal

In [16]:
from nltk.corpus import stopwords

# Load NLTK's English stop-word list and show its contents.
stpwrd = stopwords.words('english')
print(stpwrd)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
# Lower-case the sentence and split it on single spaces to get the tokens.
sentence = "I Love Chocolates"
tokens = sentence.lower().split(' ')
print(tokens)

# Drop every token that appears in the stop-word list `stpwrd`.
# Build a new list rather than calling tokens.remove() inside a loop over
# tokens: removing during iteration shifts the remaining items left, so the
# element right after each removed one is skipped (e.g. in "i am a boy" the
# stop word "am" would survive).
tokens = [token for token in tokens if token not in stpwrd]
tokens

['i', 'love', 'chocolates']


['love', 'chocolates']

# TF-IDF

In [18]:
# Tiny two-document corpus.
documents = [
    'I Love India',
    'India sport',
]

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Learn the vocabulary and IDF weights (ignoring English stop words) and
# produce the TF-IDF document-term matrix in a single call.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=documents)

In [21]:
import pandas as pd

# Label each TF-IDF column with its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per term.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,india,love,sport
0,0.579739,0.814802,0.0
1,0.579739,0.0,0.814802


# Synonyms

In [22]:
from nltk.corpus import wordnet

# Look up every WordNet sense of "pain" and inspect the first one.
syn = wordnet.synsets("pain")
first_sense = syn[0]
print(first_sense.definition())
print(first_sense.examples())

a symptom of some physical hurt or disorder
['the patient developed severe pain and distension']


In [23]:
from nltk.corpus import wordnet

# Gather the lemma names from every sense of "pain"; the set removes duplicates.
synonyms = []
for syn in wordnet.synsets('pain'):
    synonyms.extend(lemma.name() for lemma in syn.lemmas())
display(set(synonyms))

{'ail',
 'anguish',
 'annoyance',
 'bother',
 'botheration',
 'hurt',
 'hurting',
 'infliction',
 'nuisance',
 'pain',
 'pain_in_the_ass',
 'pain_in_the_neck',
 'pain_sensation',
 'painful_sensation',
 'painfulness',
 'trouble'}

# Antonyms

In [24]:
from nltk.corpus import wordnet

# For each lemma of each sense of "small", keep the name of its first
# antonym when it has one; the set removes duplicates.
antonyms = []
for syn in wordnet.synsets("small"):
    for lemma in syn.lemmas():
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
display(set(antonyms))

{'big', 'large'}