# Testing NLP Libraries
* We'll see brief examples of how to use the NLTK and spaCy libraries
* We'll perform exercises on PoS tagging, NER, tokenization, lemmatization and stemming

# Tokenization Examples

In [1]:
# Importing libraries and downloading contents
# `names` is the NLTK corpus object queried in the next cell.
import nltk
from nltk.corpus import names
# Fetches the NLTK data collection called 'popular'. Per the log below,
# packages that are already present are reported as up-to-date. This call is
# deliberately the last expression so its return value is shown as the cell
# output.
nltk.download('popular')



[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\52556\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\52556\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\52556\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\52556\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\52556\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

In [2]:
# Checking the first 10 names
# Load the word list a single time and reuse it for both the preview and the
# count, instead of calling names.words() twice and rebuilding the same list.
all_names = names.words()
print(all_names[:10])
print(f"Our names corpus has: {len(all_names)} names")


['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']
Our names corpus has: 7944 names


In [3]:
# Word-level tokenization with NLTK
from nltk.tokenize import word_tokenize

# Multi-line sample text; the line breaks and indentation are part of the
# string and are reused by the sentence-segmentation cell further down.
sent = """I am reading a book.
    It is Python Machine Learning By Example,
    3rd edition."""

sent_tokens = word_tokenize(sent)
print(sent_tokens)


['I', 'am', 'reading', 'a', 'book', '.', 'It', 'is', 'Python', 'Machine', 'Learning', 'By', 'Example', ',', '3rd', 'edition', '.']


In [4]:
# Harder tokenization case: a contraction plus dotted abbreviations
sent2 = "I've been to U.K. and U.S.A"
sent2_tokens = word_tokenize(sent2)
print(sent2_tokens)

['I', "'ve", 'been', 'to', 'U.K.', 'and', 'U.S.A']


In [5]:
# Tokenizing the same sentence (sent2) with spaCy for comparison
import spacy

# Load the 'en_core_web_sm' pipeline once; `nlp` is then called on raw text.
nlp = spacy.load('en_core_web_sm')
tokens2 = nlp(sent2)

# Collect the text of every token produced by the pipeline, then show it.
token_texts = [tok.text for tok in tokens2]
print(token_texts)


['I', "'ve", 'been', 'to', 'U.K.', 'and', 'U.S.A']


In [6]:
# Sentence-level segmentation of the multi-line sample text
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(sent)
print(sentences)


['I am reading a book.', 'It is Python Machine Learning By Example,\n    3rd edition.']


## PoS Tagging
* Using 