In [None]:
# Import Libraries

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, RegexpParser
from nltk.chunk.api import ChunkParserI
from nltk.corpus import stopwords
from nltk import FreqDist
import nltk
import re
import os
from IPython.display import Image, display
from nltk.draw import TreeWidget
from nltk.draw.util import CanvasFrame
import subprocess
from nltk.book import text8

In [None]:
# Tokenization

# Load the whole story once; the context manager closes the file handle as
# soon as the contents are in memory instead of leaving it open for the
# lifetime of the kernel.
with open("story.txt", "r") as f:
    text = f.read()

# Sent Tokenization
print(sent_tokenize(text))


In [None]:
# Tokenization

# Word Tokenization: print the token list produced from `text`, which must
# already be bound by an earlier cell.
print(nltk.word_tokenize(text))

In [None]:
# Lower Case Conversion

# Lower-case the story and replace every character that is not an ASCII
# letter or digit with a space. The result is stored under its own name so the
# raw `text` is not clobbered: re-running an earlier cell still sees the
# original story.
normalized_text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

# A plain whitespace split is enough here because punctuation was blanked out
# by the substitution above.
words = normalized_text.split()
print(words)

In [None]:
# Stop Word

# Re-read the story so this cell does not depend on what earlier cells did to
# `text`; the context manager guarantees the handle is closed.
with open("story.txt") as f:
    text = f.read()

stop_words = set(stopwords.words("english"))
words_in_quote = word_tokenize(text)

# Fold each token's case before the membership test so capitalisation does not
# hide a stop word (NLTK's English stop-word list is lower-case).
filtered_list = [word for word in words_in_quote if word.casefold() not in stop_words]

print(filtered_list)

In [None]:
# Stemming

stemmer = PorterStemmer()

# Re-read the story; the context manager closes the file handle afterwards.
with open("story.txt") as f:
    text = f.read()

# Stem every token (punctuation tokens included) with the Porter stemmer.
words = word_tokenize(text)
stemmed_words = [stemmer.stem(word) for word in words]

print(stemmed_words)

In [None]:
# Lemmatization

# NOTE: this cell does not load the story itself; it relies on `text` still
# being bound by an earlier cell.
lemmatizer = WordNetLemmatizer()

words = word_tokenize(text)
# Apply the lemmatizer to each token with its default arguments.
lemmatized_words = list(map(lemmatizer.lemmatize, words))

print(lemmatized_words)

In [None]:
# Chunking

# Noun-phrase pattern: an optional determiner, any number of adjectives, then
# one noun tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Relies on `text` bound by an earlier cell.
words = word_tokenize(text)
lotr_pos_tags = pos_tag(words)
chunk_parser = RegexpParser(grammar)
tree = chunk_parser.parse(lotr_pos_tags)

# Render the resulting chunk tree with NLTK's tree viewer.
tree.draw()


In [None]:
# Chinking

# Two rules: first put every token into one chunk, then chink (cut out) the
# adjectives so they end up outside the chunks.
grammar = """
    Chunk: {<.*>+}
           }<JJ>{"""

# Relies on `text` bound by an earlier cell.
words = word_tokenize(text)
lotr_pos_tags = pos_tag(words)
chunk_parser = RegexpParser(grammar)

tree = chunk_parser.parse(lotr_pos_tags)
tree.draw()

In [None]:
# Using Named Entity Recognition (NER)

def extract_ne(text):
    """Tokenize and POS-tag ``text``, chunk its named entities, and draw the tree.

    Args:
        text: Raw text to analyse.

    Returns:
        None. The chunk tree is only drawn, not returned.
    """
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    # binary=True labels every entity simply as NE instead of assigning a
    # category such as PERSON or ORGANIZATION.
    tree = nltk.ne_chunk(tags, binary=True)
    tree.draw()


# Read the story through a context manager so the file handle is closed
# before the (potentially long-lived) tree viewer is opened.
with open("story.txt") as f:
    text = f.read()
extract_ne(text)

In [None]:
# Making a Dispersion Plot

# Read the story through a context manager so the file handle is closed.
with open("story.txt") as f:
    text = f.read()
words = word_tokenize(text)

# NOTE(review): `text8` is a corpus imported from nltk.book, not the story.
# This call therefore plots where each story token occurs inside `text8`, with
# one target per token (duplicates included). If the intent is to plot a few
# chosen words across the story itself, build `nltk.Text(words)` and pass a
# short list of target words instead - confirm the intent before changing it.
text8.dispersion_plot(words)


In [None]:
# Making a Frequency Distribution

# Relies on `words` and `stop_words` bound by earlier cells.
# Keep only tokens whose case-folded form is not a stop word.
meaningful_words = list(
    filter(lambda token: token.casefold() not in stop_words, words)
)
frequency_distribution = FreqDist(meaningful_words)

# Cumulative plot of the 20 most frequent remaining tokens.
frequency_distribution.plot(20, cumulative=True)

In [None]:
# Parse tree or Syntax Tree generation

# Read the story through a context manager so the file handle is closed.
with open("story.txt") as f:
    text = f.read()
tagged = pos_tag(word_tokenize(text))

# The grammar is a raw string: it contains the regex escape `\.`, which is an
# invalid escape sequence in a normal string literal and triggers a warning on
# newer Python versions. The resulting string value is unchanged.
#
# Stages, in order:
#   NP - chunk everything, chink out tags starting with '.', 'V' or 'I', then
#        split a chunk in front of every determiner (DT).
#   PP - a preposition (IN) followed by an NP.
#   VP - a verb followed by any number of NPs or PPs.
chunker = RegexpParser(r""" 
NP: {<.*>*}   
}<[\.VI].*>+{       
<.*>}{<DT>        
PP: {<IN><NP>}        
VP: {<VB.*><NP|PP>*}
""" )
output = chunker.parse(tagged)
print(output)


In [None]:
# Parse tree or Syntax Tree generation

# Draw the tree currently bound to `output`. This cell does not create it, so
# the cell that assigns `output` must have been run first.
output.draw()

In [None]:
# POS Tagging

# Read the story through a context manager so the file handle is closed.
with open("story.txt") as f:
    text = f.read()
stop_words = set(stopwords.words('english'))
tokenized = sent_tokenize(text)

# Process the story one sentence at a time: tokenize, drop stop words, tag,
# chunk, and draw one tree per sentence.
for sentence in tokenized:
    wordsList = nltk.word_tokenize(sentence)
    # Case-fold before the membership test so capitalised stop words (for
    # example at the start of a sentence) are removed too, matching the
    # stop-word filter used in the other cells.
    wordsList = [w for w in wordsList if w.casefold() not in stop_words]
    tagged = nltk.pos_tag(wordsList)
    print(tagged)
    # `chunker` is not built here; the cell that constructs it must have been
    # run before this one.
    output = chunker.parse(tagged)
    output.draw()