In [1]:
# Install NLTK if you haven't already: pip install nltk
import nltk
from nltk.corpus import gutenberg

# Download the Gutenberg corpus (you only need to do this once)
nltk.download('gutenberg')

# Extract text content from different media (e.g., books from Project Gutenberg)
emma = gutenberg.raw('austen-emma.txt')
shakespeare = gutenberg.raw('shakespeare-hamlet.txt')

print("Text from Emma by Jane Austen:")
print(emma[:300])

print("\nText from Hamlet by William Shakespeare:")
print(shakespeare[:300])


Text from Emma by Jane Austen:
[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was t

Text from Hamlet by William Shakespeare:
[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo and Francisco two Centinels.

  Barnardo. Who's there?
  Fran. Nay answer me: Stand & vnfold
your selfe

   Bar. Long liue the King

   Fran. Barnardo?
  Bar. He

   Fran. You come most carefully vpon 


[nltk_data] Error loading gutenberg: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample text for sentence splitting and tokenization
text = "NLTK is a powerful library for natural language processing. It makes it easy to work with human language data."

# Sentence splitting
sentences = sent_tokenize(text)
print("Sentences:", sentences)

# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)


Sentences: ['NLTK is a powerful library for natural language processing.', 'It makes it easy to work with human language data.']
Tokens: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '.', 'It', 'makes', 'it', 'easy', 'to', 'work', 'with', 'human', 'language', 'data', '.']


In [3]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Sample words for stemming and lemmatization
words = ['running', 'flies', 'better', 'happily', 'cats']

# Stemming
porter_stemmer = PorterStemmer()
stemmed_words = [porter_stemmer.stem(word) for word in words]
print("Stemmed Words:", stemmed_words)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized Words:", lemmatized_words)


Stemmed Words: ['run', 'fli', 'better', 'happili', 'cat']
Lemmatized Words: ['running', 'fly', 'better', 'happily', 'cat']


In [4]:
from nltk.corpus import stopwords

# Sample text for stop word removal
text = "NLTK is a powerful library for natural language processing. It makes it easy to work with human language data."

# Tokenization
tokens = word_tokenize(text)

# Remove stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens (after stop word removal):", filtered_tokens)


Filtered Tokens (after stop word removal): ['NLTK', 'powerful', 'library', 'natural', 'language', 'processing', '.', 'makes', 'easy', 'work', 'human', 'language', 'data', '.']


In [5]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Sample text for POS tagging
text = "NLTK is a powerful library for natural language processing. It makes it easy to work with human language data."

# Tokenization
tokens = word_tokenize(text)

# POS tagging
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


POS Tags: [('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.'), ('It', 'PRP'), ('makes', 'VBZ'), ('it', 'PRP'), ('easy', 'JJ'), ('to', 'TO'), ('work', 'VB'), ('with', 'IN'), ('human', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('.', '.')]


In [6]:
from nltk import RegexpParser
from nltk.tokenize import word_tokenize

# Sample text for chunking
text = "NLTK is a powerful library for natural language processing. It makes it easy to work with human language data."

# Tokenization
tokens = word_tokenize(text)

# POS tagging
pos_tags = pos_tag(tokens)

# Define a chunking grammar
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}   # Chunk NP (noun phrase)
    PP: {<IN><NP>}         # Chunk PP (prepositional phrase)
"""

# Create a chunk parser
chunk_parser = RegexpParser(grammar)

# Perform chunking
chunks = chunk_parser.parse(pos_tags)
chunks.draw()  # This will open a window to display the chunks (requires Tkinter)
