# üß© Part of Speech: POS

In [1]:
!pip install textblob




[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Initializing

In [2]:
from textblob import TextBlob as txb
import nltk

nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

#### Actual Code

In [3]:
text = ("Codespeedy is programming blog. " "Blog posts contain articles and tutorials on Python, CSS and even much more")

tb = txb(text)
print(tb.tags)

[('Codespeedy', 'NNP'), ('is', 'VBZ'), ('programming', 'VBG'), ('blog', 'NN'), ('Blog', 'NNP'), ('posts', 'NNS'), ('contain', 'VBP'), ('articles', 'NNS'), ('and', 'CC'), ('tutorials', 'NNS'), ('on', 'IN'), ('Python', 'NNP'), ('CSS', 'NNP'), ('and', 'CC'), ('even', 'RB'), ('much', 'RB'), ('more', 'JJR')]


<style>
details {
    margin-bottom: 10px;
}

summary {
    cursor: pointer;
    outline: none;
    font-weight: bold;
}

p {
    margin-left: 20px;
}
</style>


In [4]:
# Challenge Question (POS)

tokens = nltk.word_tokenize(text)

# Perform part-of-speech tagging
pos_tags = nltk.pos_tag(tokens)

# Print the tagged tokens
print(pos_tags)

[('Codespeedy', 'NNP'), ('is', 'VBZ'), ('programming', 'VBG'), ('blog', 'NN'), ('.', '.'), ('Blog', 'NNP'), ('posts', 'VBZ'), ('contain', 'VBP'), ('articles', 'NNS'), ('and', 'CC'), ('tutorials', 'NNS'), ('on', 'IN'), ('Python', 'NNP'), (',', ','), ('CSS', 'NNP'), ('and', 'CC'), ('even', 'RB'), ('much', 'RB'), ('more', 'JJR')]


### üìù Docs

In [5]:
# Docs

nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## üóíÔ∏è Named Entity Recognition

In [6]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

text = ' Jack Nelson worked for Microsoft and attended a conference in Italy. I Study at Lambton college in Toronto.'

# Tokenize article into senteces:
sentences = sent_tokenize(text)
print('Tokenize article into senteces:',sentences)
print('____________________\n')

# Tokenize each sentence into words:
token_sentences = [word_tokenize(sent) for sent in sentences]
print('Tokenize each sentence into words:',token_sentences)
print('____________________\n')

# Tag each tokenized sentence into POS:
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences]
print('Tag each tokenized sentence into POS:',pos_sentences)
print('____________________\n')

# Create Names Entity Chunks:
chunked = nltk.ne_chunk_sents(pos_sentences, binary=True)

for sent in chunked:
    for chunk in sent:
        print(chunk)


Tokenize article into senteces: [' Jack Nelson worked for Microsoft and attended a conference in Italy.', 'I Study at Lambton college in Toronto.']
____________________

Tokenize each sentence into words: [['Jack', 'Nelson', 'worked', 'for', 'Microsoft', 'and', 'attended', 'a', 'conference', 'in', 'Italy', '.'], ['I', 'Study', 'at', 'Lambton', 'college', 'in', 'Toronto', '.']]
____________________

Tag each tokenized sentence into POS: [[('Jack', 'NNP'), ('Nelson', 'NNP'), ('worked', 'VBD'), ('for', 'IN'), ('Microsoft', 'NNP'), ('and', 'CC'), ('attended', 'VBD'), ('a', 'DT'), ('conference', 'NN'), ('in', 'IN'), ('Italy', 'NNP'), ('.', '.')], [('I', 'PRP'), ('Study', 'VBP'), ('at', 'IN'), ('Lambton', 'NNP'), ('college', 'NN'), ('in', 'IN'), ('Toronto', 'NNP'), ('.', '.')]]
____________________

(NE Jack/NNP Nelson/NNP)
('worked', 'VBD')
('for', 'IN')
(NE Microsoft/NNP)
('and', 'CC')
('attended', 'VBD')
('a', 'DT')
('conference', 'NN')
('in', 'IN')
(NE Italy/NNP)
('.', '.')
('I', 'PRP')


# üî¨: What kind of data is returned by the ne_chunk_sents

In [15]:
chunked = nltk.ne_chunk_sents(pos_sentences, binary=True)

types = []
values = []
for sent in chunked:
    for chunk in sent:
        types.append(type(chunk))
        values.append(chunk)
        print(type(chunk),'---------\t',chunk)

<class 'nltk.tree.tree.Tree'> ---------	 (NE Jack/NNP Nelson/NNP)
<class 'tuple'> ---------	 ('worked', 'VBD')
<class 'tuple'> ---------	 ('for', 'IN')
<class 'nltk.tree.tree.Tree'> ---------	 (NE Microsoft/NNP)
<class 'tuple'> ---------	 ('and', 'CC')
<class 'tuple'> ---------	 ('attended', 'VBD')
<class 'tuple'> ---------	 ('a', 'DT')
<class 'tuple'> ---------	 ('conference', 'NN')
<class 'tuple'> ---------	 ('in', 'IN')
<class 'nltk.tree.tree.Tree'> ---------	 (NE Italy/NNP)
<class 'tuple'> ---------	 ('.', '.')
<class 'tuple'> ---------	 ('I', 'PRP')
<class 'tuple'> ---------	 ('Study', 'VBP')
<class 'tuple'> ---------	 ('at', 'IN')
<class 'nltk.tree.tree.Tree'> ---------	 (NE Lambton/NNP)
<class 'tuple'> ---------	 ('college', 'NN')
<class 'tuple'> ---------	 ('in', 'IN')
<class 'nltk.tree.tree.Tree'> ---------	 (NE Toronto/NNP)
<class 'tuple'> ---------	 ('.', '.')


## üì£ Challenge: Minimize the steps using TextBlob

In [16]:
# Challenge: Minimize the steps using TextBlob

pos_sentences_txb = txb(text) # txb is imported name for TextBlob

print('Text:',text)


# Create Named Entity Chunks:
chunked = nltk.ne_chunk_sents(pos_sentences, binary=True)

print('\nChunked Sentences Type:',type(chunked))
print('\n ** Named Entity Chunks **\n')

chunksList = [] # a list to save the chunked(generator) data into a list so that it can be used in the next step
for sent in chunked:
    for chunk in sent:
        chunksList.append(chunk) # append in the list
        print(chunk)

Text:  Jack Nelson worked for Microsoft and attended a conference in Italy. I Study at Lambton college in Toronto.

Chunked Sentences Type: <class 'generator'>

 ** Named Entity Chunks **

(NE Jack/NNP Nelson/NNP)
('worked', 'VBD')
('for', 'IN')
(NE Microsoft/NNP)
('and', 'CC')
('attended', 'VBD')
('a', 'DT')
('conference', 'NN')
('in', 'IN')
(NE Italy/NNP)
('.', '.')
('I', 'PRP')
('Study', 'VBP')
('at', 'IN')
(NE Lambton/NNP)
('college', 'NN')
('in', 'IN')
(NE Toronto/NNP)
('.', '.')


### ‚ö†Ô∏èüö®

In [25]:
# For some reason, looping through chunked sentences wont do any loops, Therefore in the above cell, stored the chunks
# into another <List> to use in the upcoming cells

for sent in chunked:
    for chunk in sent:
        print(chunk)

### üîç Filter NE only

In [26]:
print('List of chunks:',chunksList,'\n\n')


# Only show NE (Named entity)
for chunk in chunksList:
    if hasattr(chunk,'label') and chunk.label() == 'NE':
        print(chunk)

List of chunks: [Tree('NE', [('Jack', 'NNP'), ('Nelson', 'NNP')]), ('worked', 'VBD'), ('for', 'IN'), Tree('NE', [('Microsoft', 'NNP')]), ('and', 'CC'), ('attended', 'VBD'), ('a', 'DT'), ('conference', 'NN'), ('in', 'IN'), Tree('NE', [('Italy', 'NNP')]), ('.', '.'), ('I', 'PRP'), ('Study', 'VBP'), ('at', 'IN'), Tree('NE', [('Lambton', 'NNP')]), ('college', 'NN'), ('in', 'IN'), Tree('NE', [('Toronto', 'NNP')]), ('.', '.')] 


(NE Jack/NNP Nelson/NNP)
(NE Microsoft/NNP)
(NE Italy/NNP)
(NE Lambton/NNP)
(NE Toronto/NNP)


# üìã Word Collocations

In [27]:
# import nltk

nltk.download('webtext')
from nltk.corpus import webtext

[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


In [34]:
# Let's check how our corpus look like

words = [w.lower() for w in webtext.words('grail.txt')] # ‚ÑπÔ∏è grail.txt is fetched from the 'webtext' not from local
output = ' '.join(words)

print('\nüñãÔ∏è Length of words:', len(words),'\n\n')
print('Text:',output)


üñãÔ∏è Length of words: 16967 


