# NLTK Tutorial

The following is based on tutorials by sentdex. The first can be found [here](https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/); the link to the second is missing (TODO: add its URL).

## Tokenization

Tokenization is the process of splitting a larger entity into smaller ones: for example, a paragraph can be split into sentences, and a sentence can be split into words. When a sentence is split into words, each word is a token.

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize 

#### Sentence Tokenization

In [2]:
# A short paragraph; note that "Mr." contains a period that does not end a sentence.
example_text = "Hello Mr. John, I am a line of text. Nice to meet you!"
sentences = sent_tokenize(example_text)
print(sentences)

['Hello Mr. John, I am a line of text.', 'Nice to meet you!']


#### Word Tokenization

**Note**: Special characters are treated as separate tokens when sentences are tokenized to get words.

In [3]:
example_sentence = "Hi! I am a sentence, right here."
# Tokenize the new sentence first, then the earlier paragraph; one token list per line.
for text in (example_sentence, example_text):
    print(word_tokenize(text))

['Hi', '!', 'I', 'am', 'a', 'sentence', ',', 'right', 'here', '.']
['Hello', 'Mr.', 'John', ',', 'I', 'am', 'a', 'line', 'of', 'text', '.', 'Nice', 'to', 'meet', 'you', '!']


## Stopwords

In [4]:
from nltk.corpus import stopwords

In [5]:
# Show the English stopword list that ships with NLTK.
english_stopwords = stopwords.words("english")
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Removing stopwords from a Sentence**

In [6]:
stop_words = stopwords.words("english")
# Set copy for constant-time membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)
word_tokens = word_tokenize(example_text)
# The comparison is case-sensitive, so a capitalized token such as 'I' is kept
# even though 'i' is a stopword; compare w.lower() instead to drop those too.
filtered_sent = [w for w in word_tokens if w not in stop_word_set]
print(example_text)
print(filtered_sent)

Hello Mr. John, I am a line of text. Nice to meet you!
['Hello', 'Mr.', 'John', ',', 'I', 'line', 'text', '.', 'Nice', 'meet', '!']


## Stemming

In [7]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
example_words = ["wash", "washing", "washed", "washer", "ghostly"]

# Stem each word and print the results one per line, in input order.
stems = [ps.stem(word) for word in example_words]
print("\n".join(stems))

wash
wash
wash
washer
ghostli


*Stemming a Sentence*

In [8]:
random_sent = "The first man-made moon will launch from Xichang Satellite Launch Center in Sichuan, with three more to follow in 2022 if the first test goes well, said Wu Chunfeng, head of Tian Fu New Area Science Society, the organization responsible for the project."
random_sent_tokens = word_tokenize(random_sent)
# Stem every token (punctuation tokens included) and print one result per line.
print(*map(ps.stem, random_sent_tokens), sep="\n")

the
first
man-mad
moon
will
launch
from
xichang
satellit
launch
center
in
sichuan
,
with
three
more
to
follow
in
2022
if
the
first
test
goe
well
,
said
Wu
chunfeng
,
head
of
tian
Fu
new
area
scienc
societi
,
the
organ
respons
for
the
project
.


## Part Of Speech Tagging

### POS Tag list

* CC	coordinating conjunction
* CD	cardinal number
* DT	determiner
* EX	existential there (like: "there is" ... think of it like "there exists")
* FW	foreign word
* IN	preposition/subordinating conjunction
* JJ	adjective	'big'
* JJR	adjective, comparative	'bigger'
* JJS	adjective, superlative	'biggest'
* LS	list marker	1)
* MD	modal	could, will
* NN	noun, singular 'desk'
* NNS	noun plural	'desks'
* NNP	proper noun, singular	'Harrison'
* NNPS	proper noun, plural	'Americans'
* PDT	predeterminer	'all the kids'
* POS	possessive ending	parent's
* PRP	personal pronoun	I, he, she
* PRP\$	possessive pronoun	my, his, hers
* RB	adverb	very, silently,
* RBR	adverb, comparative	better
* RBS	adverb, superlative	best
* RP	particle	give up
* TO	to	go 'to' the store.
* UH	interjection	errrrrrrrm
* VB	verb, base form	take
* VBD	verb, past tense	took
* VBG	verb, gerund/present participle	taking
* VBN	verb, past participle	taken
* VBP	verb, sing. present, non-3rd person	take
* VBZ	verb, 3rd person sing. present	takes
* WDT	wh-determiner	which
* WP	wh-pronoun	who, what
* WP\$	possessive wh-pronoun	whose
* WRB	wh-adverb	where, when


In [9]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import nltk

# Two speeches from NLTK's state_union corpus: one to train the sentence
# tokenizer on, and one to run the trained tokenizer against.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Build a Punkt sentence tokenizer from train_text, then split sample_text
# into a list of sentence strings.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def pos_tag(tokens, limit=10):
    """Print the part-of-speech tags of the leading sentences in `tokens`.

    Note: this notebook-level helper shares its name with `nltk.pos_tag`,
    which it calls through the `nltk` module to do the actual tagging.

    Args:
        tokens: Sequence of sentence strings (must support slicing).
        limit: How many sentences from the start of `tokens` to tag.
            Defaults to 10; pass None to tag every sentence.

    Returns:
        None. One list of (word, tag) tuples is printed per sentence.
    """
    for sentence in tokens[:limit]:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        print(tagged)

pos_tag(tokenized)

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

## Chunking

In [None]:
import nltk.draw
# Re-tokenize the sample speech so this cell starts from the full sentence list.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_chunk(tokens, limit=None):
    """POS-tag each sentence, chunk it with a regex grammar, and draw the tree.

    The grammar groups any run of adverbs (RB*) and verbs (VB*), followed by
    one or more proper nouns (NNP) and an optional noun (NN), into a node
    labelled "Chunk".

    Args:
        tokens: Iterable of sentence strings.
        limit: Optional maximum number of sentences to process. The default
            None processes every sentence. When a limit is given, `tokens`
            must support slicing.

    Returns:
        None. `draw()` opens a GUI window for every processed sentence, so a
        small `limit` is advisable for long texts.
    """
    # The grammar does not depend on the sentence, so build the parser once
    # instead of once per loop iteration.
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)

    if limit is not None:
        tokens = tokens[:limit]

    for sentence in tokens:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        chunked = chunkParser.parse(tagged)
        chunked.draw()

process_chunk(tokenized)

## Named Entity Recognition

In [None]:
def process_content(tokens=None, start=5):
    """Run named-entity recognition on each sentence and draw the result.

    Args:
        tokens: Sequence of sentence strings. When None (the default), the
            notebook-level `tokenized` list is used, so the zero-argument
            call keeps working.
        start: Index of the first sentence to process. Defaults to 5.

    Returns:
        None. `draw()` opens a GUI window for every processed sentence.
        Any exception stops the loop and is reported on stdout instead of
        being raised.
    """
    if tokens is None:
        # Resolved at call time, so the current value of `tokenized` is used.
        tokens = tokenized
    try:
        for sentence in tokens[start:]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            # binary=True is passed straight through to nltk.ne_chunk.
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()
    except Exception as e:
        # Best-effort demo: report the failure rather than abort the notebook.
        # The type name is included because str(e) alone can be uninformative
        # (for example, a KeyError prints only the missing key).
        print("{}: {}".format(type(e).__name__, e))


process_content()

## Lemmatizing

In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Each entry holds the positional arguments for one lemmatize() call:
# the word, optionally followed by a part-of-speech code.
lemmatize_args = [
    ("cats",),
    ("cacti",),
    ("geese",),
    ("rocks",),
    ("python",),
    ("better", "a"),
    ("best", "a"),
    ("run",),
    ("run", "v"),
]

for args in lemmatize_args:
    print(lemmatizer.lemmatize(*args))

cat
cactus
goose
rock
python
good
best
run
run
