# **Text Preprocessing 1**
## **Tokenization --> Stemming --> Lemmatization**  

In [None]:
import nltk
# Fetch the complete NLTK data collection (corpora, tokenizer models, taggers, ...).
# NOTE(review): the cells visible in this part of the notebook only seem to need
# the tokenizer models, 'stopwords' and 'wordnet'; 'all' is a large download --
# consider narrowing it once the requirements of later cells are confirmed.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

**Tokenization**

In [None]:
from nltk.tokenize import NLTKWordTokenizer

In [None]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer
# Sample text shared by the tokenizer demos: it mixes a currency amount,
# a double space, single newlines and a blank line.
# NOTE(review): the "... " at the start of the second line looks like a doctest
# continuation prompt that was pasted along with the example; it is kept
# because the recorded outputs of the following cells include it.
s = (
    "Good muffins cost $3.88\nin New York.  Please buy me\n"
    "... two of them.\n\nThanks."
)
# Word-level tokenization: '$' and sentence-final periods become separate
# tokens while the decimal number '3.88' stays in one piece.
word_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 '...',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [None]:
# Same text, but split at every punctuation boundary, so '3.88' comes out
# as '3', '.', '88' (compare with word_tokenize above).
wordpunct_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3',
 '.',
 '88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 '...',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [None]:
# Sentence-level tokenization: returns one string per sentence, with any
# newline inside a sentence left untouched.
sent_tokenize(s)

['Good muffins cost $3.88\nin New York.',
 'Please buy me\n... two of them.',
 'Thanks.']

In [None]:
# span_tokenize reports (start, end) character offsets into `s` for each
# whitespace-delimited token instead of the token text; list() materialises
# the spans so the cell can display them.
list(WhitespaceTokenizer().span_tokenize(s))

[(0, 4),
 (5, 12),
 (13, 17),
 (18, 23),
 (24, 26),
 (27, 30),
 (31, 36),
 (38, 44),
 (45, 48),
 (49, 51),
 (52, 55),
 (56, 59),
 (60, 62),
 (63, 68),
 (70, 77)]

**Stop Words**

In [None]:
from nltk.corpus import stopwords

# Make sure the stop-word corpus is present; it was already fetched by the
# earlier nltk.download('all'), so this is only a cheap re-check.
nltk.download('stopwords')
# Print the whole English stop-word list for inspection.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Build a set once so each membership test is O(1).
stop_words = set(stopwords.words('english'))
# The stop-word entries are lowercase, so tokens are lower-cased for the
# membership test only; without this, capitalised stop words such as "The"
# or "In" would slip through. Surviving tokens keep their original casing.
filtered_sentence = [w for w in word_tokenize(s) if w.lower() not in stop_words]

In [None]:
# Display the tokens of `s` that are not English stop words.
filtered_sentence

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 '...',
 'two',
 '.',
 'Thanks',
 '.']

In [None]:
# Sample paragraph fed to the stemming and lemmatization cells below.
# The literal starts with a newline and keeps its hard line breaks.
corpus = '''
The morning light filtered through the dense canopy of ancient redwood trees, casting a soft,
emerald glow on the forest floor. Moss-covered logs and delicate ferns created a lush carpet beneath the towering giants,
their bark rough and weathered from centuries of standing sentinel in this quiet woodland.
A gentle breeze whispered through the branches, carrying the faint scent of damp earth and pine,
while a distant stream murmured its endless song, weaving between the massive tree trunks and
creating a serene symphony of natural sounds that had remained unchanged for generations.'''

**Stemming**

In [None]:
from nltk.stem import PorterStemmer

In [None]:
# Porter stemmer instance used by the loop below
ps = PorterStemmer()

# Lower-case the sample paragraph and split it into word/punctuation tokens
example_words = word_tokenize(corpus.lower())

# Two left-aligned, 20-character-wide columns: each token next to its stem
print(f"{'--Word--':20}{'--Stem--':20}")
for word in example_words:
    print(f"{word:20}{ps.stem(word):20}")


--Word--            --Stem--            
the                 the                 
morning             morn                
light               light               
filtered            filter              
through             through             
the                 the                 
dense               dens                
canopy              canopi              
of                  of                  
ancient             ancient             
redwood             redwood             
trees               tree                
,                   ,                   
casting             cast                
a                   a                   
soft                soft                
,                   ,                   
emerald             emerald             
glow                glow                
on                  on                  
the                 the                 
forest              forest              
floor               floor               
.               

**Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Create the WordNet-based lemmatizer that the lemmatization loop below reuses.
wnl = WordNetLemmatizer()

In [None]:
# Tokenize the lower-cased sample paragraph
word_tokens = word_tokenize(corpus.lower())

# Two left-aligned, 20-character-wide columns: each token next to its lemma.
# Every token is lemmatized as a verb (pos='v'), whatever its actual part of speech.
print(f"{'--Word--':20}{'--Lemma--':20}")
for word in word_tokens:
    print(f"{word:20}{wnl.lemmatize(word, pos='v'):20}")

--Word--            --Lemma--           
the                 the                 
morning             morning             
light               light               
filtered            filter              
through             through             
the                 the                 
dense               dense               
canopy              canopy              
of                  of                  
ancient             ancient             
redwood             redwood             
trees               tree                
,                   ,                   
casting             cast                
a                   a                   
soft                soft                
,                   ,                   
emerald             emerald             
glow                glow                
on                  on                  
the                 the                 
forest              forest              
floor               floor               
.               

# **Text Preprocessing 2**
*   OHE (One-Hot Encoding)
*   Bag of Words (BoW)
*   TF-IDF
*   Word2Vec



