In [None]:
# Tokenize a sample sentence with NLTK.
import nltk
import string
from nltk import word_tokenize

text = "we'd like to book a flight from boston to london"

# `tokenized_text` is reused by the frequency and POS-tagging cells below.
tokenized_text = nltk.word_tokenize(text)
print(tokenized_text)

In [None]:
from nltk.probability import FreqDist
# Build a frequency distribution over the NLTK tokens from the tokenization cell.
# As the last expression of the cell, the resulting FreqDist is shown as the cell output.
FreqDist(tokenized_text)

In [None]:
# Part-of-speech tag the NLTK tokens; the tagger's result is displayed as the cell output.
# Requires `nltk` and `tokenized_text` from the first cell.
nltk.pos_tag(tokenized_text)

In [None]:
# Tokenize the same sentence with spaCy's small English pipeline.
import spacy
from spacy.lang.en import English

nlp = spacy.load("en_core_web_sm")
text = "we'd like to book a flight from boston to london"

# Running the pipeline yields a Doc; iterating over it gives the individual tokens.
doc = nlp(text)
print([tok.text for tok in doc])

In [None]:
# Collect the token strings of the spaCy Doc into a plain Python list.
words = [tok.text for tok in doc]

In [None]:
# Word-frequency statistics for the spaCy tokens, computed with the standard library.

from collections import Counter

# Start from an empty counter and feed it the token list collected above.
word_freq = Counter()
word_freq.update(words)
print(word_freq)


In [None]:
# spaCy offers POS tagging as well: each token of the Doc exposes its tag as `pos_`.
# Print one "text tag" pair per line.
for tok in doc:
    print(f"{tok.text} {tok.pos_}")

In [None]:
from spacy import displacy

In [None]:
import spacy

# Visualize named entities in a sample sentence with displaCy.
# Requires `displacy` (imported in the previous cell).
nlp = spacy.load("en_core_web_sm")
text = "we'd like to book a flight from boston to new york"
doc = nlp(text)

# No `options` are passed: 'distance' is a setting of the dependency ('dep')
# visualizer and has no effect on the entity ('ent') visualizer.
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Visualize the dependency parse of a short sentence with displaCy.
nlp = spacy.load("en_core_web_sm")
doc = nlp("they get in an accident")

# 'distance' is passed through to the dependency visualizer as a layout option.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options=dict(distance=200),
)

In [None]:
# NLP imports
import nltk
import spacy
from spacy import displacy
# general numerical and visualization imports
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

In [None]:
# Download only the NLTK resources this notebook uses, without opening the
# interactive downloader: a bare nltk.download() waits for user input and
# therefore stalls "Restart Kernel -> Run All".
#   - punkt / punkt_tab: tokenizer data for word_tokenize
#   - averaged_perceptron_tagger / averaged_perceptron_tagger_eng: data for pos_tag
#   - movie_reviews: the corpus analysed below
# NOTE(review): both the older and the newer resource ids are requested because
# which one is looked up depends on the installed NLTK version -- confirm and trim.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "movie_reviews",
):
    nltk.download(resource)

In [None]:
# Import the training data: the NLTK movie review corpus, as tokenized sentences.
from nltk.corpus import movie_reviews

sents = movie_reviews.sents()
print(sents)
# Example output (kept as a comment so it is not evaluated as code):
# [['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church',
# 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get',
# 'into', 'an', 'accident', '.'], ...]

# Pick a single sentence to inspect.
sample = sents[9]
print(sample)
# Example output (kept as a comment so it does not become the cell's Out value):
# ['they', 'seem', 'to', 'have', 'taken', 'this', 'pretty', 'neat',
# 'concept', ',', 'but', 'executed', 'it', 'terribly', '.']

In [None]:
# Display the 25 most frequent words in the movie review corpus.
words = movie_reviews.words()

# Lowercase every token and keep only purely alphabetic ones before counting.
word_counts = nltk.FreqDist(word.lower() for word in words if word.isalpha())
top_words = word_counts.most_common(25)

# Series indexed by word with the count as value; also reused by the word cloud cell.
all_fdist = pd.Series(dict(top_words))

# Draw the bar chart on an explicit Axes and configure it through that Axes.
fig, ax = plt.subplots(figsize=(10, 10))
all_plot = sns.barplot(x=all_fdist.index, y=all_fdist.values, ax=ax)
ax.set_title("Frequency -- Top 25 Words in the Movie Review Corpus", fontsize=30)
ax.set_xlabel("Words", fontsize=30)
ax.set_ylabel("Frequency", fontsize=30)

# Rotate the x tick labels once, after plotting, so the setting applies to the
# word labels created by the bar plot.
ax.tick_params(axis="x", labelrotation=60)
plt.show()

In [None]:
# Render the top-word frequencies (`all_fdist`) as a word cloud.
from wordcloud import WordCloud

# Configure the cloud first ...
wordcloud = WordCloud(
    background_color="white",
    max_words=25,
    relative_scaling=0,
    width=600,
    height=300,
    max_font_size=150,
    colormap="Dark2",
    min_font_size=10,
)
# ... then lay out the words from the precomputed word -> count mapping.
wordcloud.generate_from_frequencies(all_fdist)

# Show the generated image without axes.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Part-of-speech frequency in the movie review corpus.
# NOTE: this tags every sentence of the corpus, so the cell may take a while to run.
movie_reviews_sentences = movie_reviews.sents()
tagged_sentences = nltk.pos_tag_sents(movie_reviews_sentences)

# Tally all tags in a single pass. Each tagged sentence is a sequence of
# (word, tag) pairs; only the tag is counted. Building one Counter directly
# avoids re-creating the running total for every sentence.
total_counts = Counter(
    tag for sentence in tagged_sentences for _word, tag in sentence
)

# (tag, count) pairs ordered from most to least frequent.
sorted_tag_list = total_counts.most_common()

# Two-column frame: column 0 holds the tag, column 1 its count.
all_tags = pd.DataFrame(sorted_tag_list)
most_common_tags = all_tags.head(18)

# Draw the bar chart on an explicit Axes and label it through that Axes.
fig, ax = plt.subplots(figsize=(15, 15))
all_plot = sns.barplot(x=most_common_tags[0], y=most_common_tags[1], ax=ax)
ax.tick_params(axis="x", labelrotation=70)
ax.set_title("Part of Speech Frequency in Movie Review Corpus", fontsize=30)
ax.set_xlabel("Part of Speech", fontsize=30)
ax.set_ylabel("Frequency", fontsize=30)
plt.show()
