# Text preprocessing using NLTK
1. Read the data
2. Convert the data to tokens
3. Keep only the alphanumeric tokens
4. Convert the tokens to lower case
5. Remove the English stop words

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import re

In [2]:
def re_exp(text):
    """Strip ASCII punctuation from every whitespace-separated word in ``text``.

    The text is split on whitespace and each piece has all characters from
    ``string.punctuation`` removed. One entry is returned per piece, so a
    piece made up only of punctuation becomes an empty string.
    """
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned_words = []
    for raw_word in text.split():
        cleaned_words.append(punctuation_pattern.sub('', raw_word))
    return cleaned_words

In [3]:
# Step 1: read the data.
# The context manager closes the file handle as soon as the text is read;
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
filename = r"metamorphosis-clean.txt"
with open(filename, "rt", encoding='utf-8-sig') as fp:
    text = fp.read()

# Step 2: convert to tokens, then strip punctuation characters from each token.
words = word_tokenize(text)
sen = " ".join(words)
words = re_exp(sen)

# Steps 3-4: keep only alphanumeric tokens (this also drops the empty strings
# left behind by punctuation-only tokens) and lower-case them.
tokens = [word for word in words if word.isalnum()]
lower_tokens = [word.lower() for word in tokens]

# Step 5: remove English stop words.
# Membership is tested against a set built once, rather than scanning the
# stop-word list for every token; eng_stopwords itself stays a list.
eng_stopwords = stopwords.words("english")
stopword_set = set(eng_stopwords)
no_punctuations_stopwords_tokens = [
    token for token in lower_tokens if token not in stopword_set
]
no_punctuations_stopwords_tokens

['one',
 'morning',
 'gregor',
 'samsa',
 'woke',
 'troubled',
 'dreams',
 'found',
 'transformed',
 'bed',
 'horrible',
 'vermin',
 'lay',
 'armourlike',
 'back',
 'lifted',
 'head',
 'little',
 'could',
 'see',
 'brown',
 'belly',
 'slightly',
 'domed',
 'divided',
 'arches',
 'stiff',
 'sections',
 'bedding',
 'hardly',
 'able',
 'cover',
 'seemed',
 'ready',
 'slide',
 'moment',
 'many',
 'legs',
 'pitifully',
 'thin',
 'compared',
 'size',
 'rest',
 'waved',
 'helplessly',
 'looked',
 'happened',
 'thought',
 'nt',
 'dream',
 'room',
 'proper',
 'human',
 'room',
 'although',
 'little',
 'small',
 'lay',
 'peacefully',
 'four',
 'familiar',
 'walls',
 'collection',
 'textile',
 'samples',
 'lay',
 'spread',
 'table',
 'samsa',
 'travelling',
 'salesman',
 'hung',
 'picture',
 'recently',
 'cut',
 'illustrated',
 'magazine',
 'housed',
 'nice',
 'gilded',
 'frame',
 'showed',
 'lady',
 'fitted',
 'fur',
 'hat',
 'fur',
 'boa',
 'sat',
 'upright',
 'raising',
 'heavy',
 'fur',
 'muf

# stem words

- fishing, fished, fisher =====> fish
- In NLP we build a vocabulary, i.e. a dictionary
- We create our own dictionary

In [4]:
from nltk.stem import PorterStemmer

# Reduce a few variants of "python" to their Porter stems.
ps = PorterStemmer()
words = ["python", "pythoner", "pythonly", "pythoned"]
stem_words = list(map(ps.stem, words))
stem_words


['python', 'python', 'pythonli', 'python']

In [5]:
# Step 1: read the data.
# The context manager closes the file handle as soon as the text is read;
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
filename = r"metamorphosis-clean.txt"
with open(filename, "rt", encoding='utf-8-sig') as fp:
    text = fp.read()

# Step 2: convert to tokens, then strip punctuation characters from each token.
words = word_tokenize(text)
sen = " ".join(words)
words = re_exp(sen)

# Steps 3-4: keep only alphanumeric tokens (this also drops the empty strings
# left behind by punctuation-only tokens) and lower-case them.
tokens = [word for word in words if word.isalnum()]
lower_tokens = [word.lower() for word in tokens]

# Step 5: remove English stop words.
# Membership is tested against a set built once, rather than scanning the
# stop-word list for every token; eng_stopwords itself stays a list.
eng_stopwords = stopwords.words("english")
stopword_set = set(eng_stopwords)
no_punctuations_stopwords_tokens = [
    token for token in lower_tokens if token not in stopword_set
]
print(no_punctuations_stopwords_tokens[:10])

# Step 6: reduce every remaining token to its Porter stem.
ps = PorterStemmer()
no_pun_stop_stem_words = [ps.stem(w) for w in no_punctuations_stopwords_tokens]
print(no_pun_stop_stem_words[:10])

['one', 'morning', 'gregor', 'samsa', 'woke', 'troubled', 'dreams', 'found', 'transformed', 'bed']
['one', 'morn', 'gregor', 'samsa', 'woke', 'troubl', 'dream', 'found', 'transform', 'bed']


# Part of speech

In [6]:
from nltk import pos_tag

# Tokenize the sample sentence, then label each token with its part-of-speech tag.
text = "He received best actor award"
sentence_tokens = word_tokenize(text)
pos_tag(sentence_tokens)

[('He', 'PRP'),
 ('received', 'VBD'),
 ('best', 'JJS'),
 ('actor', 'NN'),
 ('award', 'NN')]

In [8]:
import nltk
# Download the NLTK 'tagsets' data package (a no-op when it is already up to date).
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\91956\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [9]:
# Print the description and example words for every tag in the Penn Treebank tag set.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

# lemmatizer

In [10]:
from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()

# Lemmatize each inflected form as a verb (pos='v') and print one lemma per line.
verb_forms = ('having', 'fishing', 'have', 'had', 'fished', 'fishes')
print(*(l.lemmatize(form, pos='v') for form in verb_forms), sep='\n')

have
fish
have
have
fish
fish


# word net

In [11]:
from nltk.corpus import wordnet
# Collect every WordNet synset (word sense) listed for the word 'good'.
syns = wordnet.synsets('good')
# Display the list as the cell result.
syns

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [24]:
# Look up one specific sense by its synset name ('good.n.01'),
# then print its definition and its usage examples.
word1=wordnet.synset('good.n.01')
print(word1.definition())
print(word1.examples())

benefit
['for your own good', "what's the good of worrying?"]


In [25]:
# Look up the sense named 'good.s.20',
# then print its definition and its usage examples.
word1=wordnet.synset('good.s.20')
print(word1.definition())
print(word1.examples())

not left to spoil
['the meat is still good']


In [26]:
# Look up the sense named 'good.s.15',
# then print its definition and its usage examples.
word1=wordnet.synset('good.s.15')
print(word1.definition())
print(word1.examples())

capable of pleasing
['good looks']


- re package
- tokenization (punkt)

- normalization - convert to lower case

- stopwords - corpus

- stemming 

- lemmatization

- pos (pos_tag)

- wordnet (corpus)