In [1]:
# Import the Gutenberg and stopwords databases from the nltk corpus 
from nltk.corpus import gutenberg, stopwords
# Import tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize

# Import nltk and download every resource this notebook relies on:
#   - 'punkt'     : models used by sent_tokenize / word_tokenize
#   - 'stopwords' : corpus read later via stopwords.words('english');
#                   without it a fresh environment raises LookupError
#   - 'gutenberg' : corpus containing the Jane Austen text used below
# NOTE(review): recent NLTK releases may load the tokenizer from 'punkt_tab'
# instead of 'punkt' -- confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
# Kept last so the cell still displays the return value of this call.
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /Users/tberton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/tberton/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# List the identifier of every text shipped with the Gutenberg corpus.
gutenberg_file_ids = gutenberg.fileids()
print(gutenberg_file_ids)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
# Load the raw text of Jane Austen's "Persuasion" as a single string.
# (The original wrapped the file id in parentheses, which is still a plain
# string, not a tuple -- so it is passed directly here.)
persuasion_book = gutenberg.raw('austen-persuasion.txt')

# Show only a short preview: printing the whole novel floods the notebook
# output and buries the cells that follow.
preview_chars = 500
print(persuasion_book[:preview_chars])

In [4]:
# Split Persuasion into sentences and keep one fixed example sentence
# (index 8) to work with in the cells below.
persuasion_sentences = sent_tokenize(persuasion_book)
one_sentence = persuasion_sentences[8]
print(one_sentence)

He had been remarkably handsome
in his youth; and, at fifty-four, was still a very fine man.


In [5]:
# Break the example sentence into word and punctuation tokens.
all_words = word_tokenize(
    one_sentence,
)
print(all_words)

['He', 'had', 'been', 'remarkably', 'handsome', 'in', 'his', 'youth', ';', 'and', ',', 'at', 'fifty-four', ',', 'was', 'still', 'a', 'very', 'fine', 'man', '.']


## NLTK Stopwords

In [6]:
# Load NLTK's English stopword list and keep it as a set so that
# membership checks in the filtering cells are fast.
english_stopwords = stopwords.words('english')
sw = set(english_stopwords)
print(sw)

{'as', 'from', 'below', "shan't", 'wouldn', 'i', 'now', "couldn't", 'while', 'over', 'couldn', 'you', 'if', 'his', 'will', "that'll", 'hers', 'wasn', 'o', 'how', 'he', 'the', 'll', "needn't", 'about', 'so', 'yourself', 'these', 'doing', 'isn', 'at', 'won', 'for', 'were', 'they', 'few', 'with', 'been', 'same', 'what', "you'll", 't', 'off', 'an', 'again', 'd', 'mustn', 'once', 'needn', 'both', 'such', 'very', 'our', "weren't", 'theirs', 'have', 'ourselves', 'ma', 'and', 'between', 'had', "you'd", 'out', 'a', 'be', 'own', 'because', "should've", 'm', 'any', 'did', 're', 'has', 'above', 'most', "hadn't", 'hasn', 'herself', "it's", "mightn't", 'him', 'mightn', 'those', 'their', 'than', 'don', "won't", "wouldn't", 'all', "she's", 'does', 'to', 'should', "aren't", "you're", 'am', 'its', 'which', "shouldn't", 'me', 'who', 's', 'shouldn', 'myself', 'ours', 'is', 'up', 'themselves', 'other', 'too', "wasn't", "don't", 'haven', 'in', 'doesn', 'y', 'into', 'when', 'by', 'was', "mustn't", 'shan', 'w

In [7]:
# Lowercase every token once, then keep only the tokens that are not
# NLTK stopwords.
first_result = [
    token
    for token in map(str.lower, all_words)
    if token not in sw
]
print(first_result)

['remarkably', 'handsome', 'youth', ';', ',', 'fifty-four', ',', 'still', 'fine', 'man', '.']


In [8]:
# We can define our own set of stopwords to add to the default nltk stopwords.
sw_addon = {'still', 'fifty-four'}

# Build the combined stopword set once. The original called
# sw.union(sw_addon) inside the comprehension's condition, which rebuilt
# the whole set for every token in the sentence.
sw_combined = sw.union(sw_addon)

second_result = [word.lower() for word in all_words if word.lower() not in sw_combined]
print(second_result)

['remarkably', 'handsome', 'youth', ';', ',', ',', 'fine', 'man', '.']


## Getting Rid of Non-Alpha Characters using Regular Expressions

In [9]:
# Import regular expressions library
import re

In [10]:
# Replace every character that is neither an ASCII letter nor a space
# (punctuation, digits, newlines, ...) with a single space.
regex = re.compile("[^a-zA-Z ]")
re_clean = re.sub(regex, ' ', one_sentence)
print(re_clean)

He had been remarkably handsome in his youth  and  at fifty four  was still a very fine man 


In [11]:
# Collect every character in the sentence that is neither an ASCII letter
# nor a space, i.e. exactly the characters the substitution above replaces.
non_letter_pattern = re.compile("[^a-zA-Z ]")
re_clean_2 = non_letter_pattern.findall(one_sentence)
print(re_clean_2)

['\n', ';', ',', '-', ',', '.']


In [12]:
# Tokenize the regex-cleaned sentence, then drop every stopword
# (NLTK defaults plus our custom additions).
re_words = word_tokenize(re_clean)

# Build the combined stopword set once instead of calling
# sw.union(sw_addon) again for every token inside the comprehension.
sw_combined = sw.union(sw_addon)

re_result = [word.lower() for word in re_words if word.lower() not in sw_combined]
print(re_result)

['remarkably', 'handsome', 'youth', 'fifty', 'four', 'fine', 'man']
