# NLTK with Python

### Asst. Prof. Hadi Saboohi

### _Mohammed K Jumaah_ & _Mohammed Q Kareem_

---

# Tokenizing

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Sample passage used as input for the sentence and word tokenizers.
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

In [None]:
# Split the sample passage into sentences and display them.
sent_tokenize(example_string)

In [None]:
# Split the same passage into word-level tokens and display them.
word_tokenize(example_string)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Short quote used as input for the stop-word filtering example.
worf_quote = "Sir, I protest. I am not a merry man!"

In [None]:
# Tokenize the quote; the bare name on the last line displays the tokens.
words_in_quote = word_tokenize(worf_quote)
words_in_quote

# Filtering Stop Words

In [None]:
# Load NLTK's English stop-word list into a set for quick membership tests.
stop_words = set(stopwords.words("english"))

In [None]:
# Keep only the tokens whose case-folded form is not in stop_words.
# casefold() is used so capitalised tokens such as "I" still match the
# lowercase entries of the stop-word set.
filtered_list = [
    word for word in words_in_quote if word.casefold() not in stop_words
]
filtered_list

# Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Create a Porter stemmer and tokenize a sentence that contains several
# inflections of "discover" to use as stemming input.
stemmer = PorterStemmer()
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""
words = word_tokenize(string_for_stemming)
words

In [None]:
# Apply the stemmer to every token, preserving token order.
stemmed_words = list(map(stemmer.stem, words))
stemmed_words

# Tagging Parts of Speech

In [None]:
from nltk.tokenize import word_tokenize
# Quote used as input for the part-of-speech tagging example.
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""
words_in_sagan_quote = word_tokenize(sagan_quote)

import nltk
# Tag each token with its part of speech and display the (word, tag) pairs.
nltk.pos_tag(words_in_sagan_quote)

In [None]:
# Show NLTK's built-in help for the Penn Treebank ("upenn") tag set.
nltk.help.upenn_tagset()

In [None]:
# Tokenize and POS-tag an excerpt from "Jabberwocky" to see how the tagger
# handles words it is unlikely to have seen before.
jabberwocky_excerpt = """
'Twas brillig, and the slithy toves did gyre and gimble in the wabe:
all mimsy were the borogoves, and the mome raths outgrabe."""
words_in_excerpt = word_tokenize(jabberwocky_excerpt)
nltk.pos_tag(words_in_excerpt)

# Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer
# Create a WordNet lemmatizer and lemmatize a single plural word.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("scarves")

In [None]:
# Tokenize a full sentence to use as lemmatizer input.
# NOTE: this rebinds the global `words` that the stemming cells also assign.
string_for_lemmatizing = "The friends of DeSoto love scarves."
words = word_tokenize(string_for_lemmatizing)
words

In [None]:
# Lemmatize every token with the lemmatizer's default part of speech.
lemmatized_words = list(map(lemmatizer.lemmatize, words))
lemmatized_words

In [None]:
# Lemmatize "worst" twice: once with the default part of speech and once
# explicitly as an adjective (pos="a"). The two results are combined into a
# tuple because a notebook cell only displays its final expression, so a bare
# first call would be evaluated and silently discarded.
(
    lemmatizer.lemmatize("worst"),
    lemmatizer.lemmatize("worst", pos="a"),
)

# Chunking

In [None]:
from nltk.tokenize import word_tokenize
# Quote used as input for the chunking, chinking and NER examples.
lotr_quote = "It's a dangerous business, Frodo, going out your door."
words_in_lotr_quote = word_tokenize(lotr_quote)
words_in_lotr_quote

In [None]:
# POS-tag the tokens; the chunk parsers below take these tagged tokens as input.
lotr_pos_tags = nltk.pos_tag(words_in_lotr_quote)
lotr_pos_tags

In [None]:
# Noun-phrase chunk rule: an optional determiner (<DT>?), any number of
# adjectives (<JJ>*), then a noun (<NN>).
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(lotr_pos_tags)
# draw() renders the parse tree graphically (needs a display environment).
tree.draw()
# Redisplay the tagged tokens so they can be compared against the tree.
lotr_pos_tags

# Chinking

In [None]:
# Chinking grammar with two rules applied in order:
#   {<.*>+}  chunk every run of one or more tokens, whatever their tag;
#   }<JJ>{   then chink (remove) adjectives from those chunks.
grammar = """
Chunk: {<.*>+}
       }<JJ>{"""
chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(lotr_pos_tags)
# draw() renders the parse tree graphically (needs a display environment).
tree.draw()

# Named Entity Recognition (NER)

In [None]:
# Run NLTK's named-entity chunker on the tagged tokens and draw the result.
tree = nltk.ne_chunk(lotr_pos_tags)
tree.draw()

In [None]:
# Same chunker with binary=True; extract_ne() later in this file relies on
# this mode labelling entity subtrees "NE".
tree = nltk.ne_chunk(lotr_pos_tags, binary=True)
tree.draw()

In [None]:
# Longer sample passage used as input for named-entity extraction.
quote = """
Men like Schiaparelli watched the red planet—it is odd, by-the-bye, that
for countless centuries Mars has been the star of war—but failed to
interpret the fluctuating appearances of the markings they mapped so well.
All that time the Martians must have been getting ready.

During the opposition of 1894 a great light was seen on the illuminated
part of the disk, first at the Lick Observatory, then by Perrotin of Nice,
and then by other observers. English readers heard of it first in the
issue of Nature dated August 2."""

def extract_ne(quote):
    """Return the set of named-entity strings found in *quote*.

    The text is word-tokenized, POS-tagged and passed through
    ``nltk.ne_chunk`` with ``binary=True``. Each subtree labelled ``"NE"``
    is collapsed into one string by joining its tokens with single spaces.
    Because the result is a set, repeated entities appear once and the
    order is unspecified.
    """
    tokens = word_tokenize(quote, language='english')
    tagged_tokens = nltk.pos_tag(tokens)
    chunked = nltk.ne_chunk(tagged_tokens, binary=True)

    entities = set()
    for node in chunked:
        # Skip anything that is not a labelled subtree.
        if not hasattr(node, "label"):
            continue
        if node.label() != "NE":
            continue
        # Each leaf is indexable; element 0 is the token text.
        entities.add(" ".join(leaf[0] for leaf in node))
    return entities
# Run the extractor on the sample passage and display the resulting set.
extract_ne(quote)