In [None]:
# This notebook follows the Real Python tutorial on sentiment analysis with NLTK: https://realpython.com/python-nltk-sentiment-analysis/

In [3]:
import nltk

In [4]:
# Fetch only the NLTK corpora and models that this tutorial relies on.
# Packages that are already present locally are reported as up-to-date.
nltk.download(
    [
        "names",
        "stopwords",
        "state_union",
        "twitter_samples",
        "movie_reviews",
        "averaged_perceptron_tagger",
        "vader_lexicon",
        "punkt",
    ]
)

[nltk_data] Downloading package names to /home/rgupta323/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rgupta323/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     /home/rgupta323/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/rgupta323/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/rgupta323/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/rgupta323/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/rgupta

True

In [5]:
# Alphabetic tokens from the NLTK State of the Union corpus
# (str.isalpha() drops punctuation and numeric tokens).
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]
print(words[:10])  # this list still contains stop words

# The word list contains many stop words ("of", "a", "the", ...). They can
# have a negative effect on the analysis, so they are filtered out here.
stopwords = nltk.corpus.stopwords.words("english")
# Membership tests against a list are linear in its length; build a set once so
# that each of the corpus tokens is checked in constant time. `stopwords`
# itself stays a list so any other cell that uses it is unaffected.
stopword_set = set(stopwords)
# Compare in lower case so capitalised tokens such as "A" are matched as well.
words = [w for w in words if w.lower() not in stopword_set]
print("words without stop words: {}".format(words[:20]))

['PRESIDENT', 'HARRY', 'S', 'TRUMAN', 'S', 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION']
words without stop words: ['PRESIDENT', 'HARRY', 'TRUMAN', 'ADDRESS', 'JOINT', 'SESSION', 'CONGRESS', 'April', 'Mr', 'Speaker', 'Mr', 'President', 'Members', 'Congress', 'heavy', 'heart', 'stand', 'friends', 'colleagues', 'Congress']


In [6]:
from pprint import pprint

# A small ad-hoc sample: no corpus is needed when a plain word list is enough.
text = """
For some quick analysis, creating a corpus could be overkill.
If all you need is a word list,
there are simpler ways to achieve that goal."""

# Tokenize the sample (punctuation becomes separate tokens) and pretty-print
# the resulting list wrapped compactly at 79 columns.
pprint(
    nltk.word_tokenize(text),
    compact=True,
    width=79,
)

['For', 'some', 'quick', 'analysis', ',', 'creating', 'a', 'corpus', 'could',
 'be', 'overkill', '.', 'If', 'all', 'you', 'need', 'is', 'a', 'word', 'list',
 ',', 'there', 'are', 'simpler', 'ways', 'to', 'achieve', 'that', 'goal', '.']


In [10]:
# Creating frequency distributions
# A frequency distribution is essentially a table that records how many times
# each word appears in a given text. In NLTK it is implemented as a dedicated
# class, FreqDist.

# Tokenize the sample text with the (default) English tokenizer.
words: list[str] = nltk.word_tokenize(text, language="english")

# Build the frequency distribution object (it behaves much like a Python dict
# mapping each token to its count).
fd = nltk.FreqDist(words)

In [11]:
# Getting some data based on the frequency distribution
print(fd.most_common(3))  # the three most frequent tokens with their counts

# tabulate() prints the table itself and returns None, so it must not be
# wrapped in print() -- doing so emits a stray "None" line after the table.
fd.tabulate(3)

[(',', 2), ('a', 2), ('.', 2)]
, a . 
2 2 2 
None


In [12]:
# Extracting Concordance & Collocations
# A concordance is a collection of word locations along with their context.
# You can use a concordance to find:
#   - how many times a word appears
#   - where each occurrence appears
#   - which words surround each occurrence
#
# With NLTK this is done by calling .concordance() on an nltk.Text object.
# The Text is built from the original corpus words (not a filtered list) so
# that all of the context, including stop words, is still there.
#
# NOTE(review): this rebinds `text`, which earlier held the sample string passed
# to nltk.word_tokenize -- re-running those cells after this one will see an
# nltk.Text instead of a str.

text = nltk.Text(tokens=nltk.corpus.state_union.words())
text.concordance(word="america", lines=5)

Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 
