## 1. Installation and Exploring Features of NLTK Tools

In [1]:
!pip install nltk



In [4]:
import nltk
# Fetch the NLTK data packages this notebook relies on. Each call is a no-op
# apart from a log message when the package is already up to date.
# Tokenizer data (punkt_tab is unpacked under tokenizers/ in the download log).
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword lists, read later through nltk.corpus.stopwords.
nltk.download('stopwords')
# Gutenberg corpus, read later through nltk.corpus.gutenberg.
# As the last expression in the cell, this call's return value is what the
# cell displays.
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

## 2. Word, Sentence, and Paragraph Tokenizers

In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize

text = """
Natural Language Processing (NLP) is a field of computer science.
It focuses on the interaction between computers and human language.
This involves understanding, interpreting, and generating human language.
"""

# Build three views of the same sample text.
# NOTE(review): split("\n") breaks on every newline, so the "paragraphs" here
# are individual lines, and the leading/trailing newline of `text` produces
# empty strings at both ends of the list.
paragraph_tokens = text.split("\n")
sentence_tokens = sent_tokenize(text)
word_tokens = word_tokenize(text)

# Print each view under its label; the "\n" prefixes put a blank line between
# the sections.
for label, tokens in (
    ("Word Tokens:", word_tokens),
    ("\nSentence Tokens:", sentence_tokens),
    ("\nParagraph Tokens:", paragraph_tokens),
):
    print(label, tokens)

Word Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', '.', 'It', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'human', 'language', '.', 'This', 'involves', 'understanding', ',', 'interpreting', ',', 'and', 'generating', 'human', 'language', '.']

Sentence Tokens: ['\nNatural Language Processing (NLP) is a field of computer science.', 'It focuses on the interaction between computers and human language.', 'This involves understanding, interpreting, and generating human language.']

Paragraph Tokens: ['', 'Natural Language Processing (NLP) is a field of computer science. ', 'It focuses on the interaction between computers and human language.', 'This involves understanding, interpreting, and generating human language.', '']


## 3. Corpus Word Count and Distinct Words

In [6]:
from nltk.corpus import gutenberg
# Load the token sequence for the austen-emma.txt file of the Gutenberg corpus.
words = gutenberg.words("austen-emma.txt")

# Total token count, then the number of unique token strings. The set compares
# raw strings, so differently-cased forms of a word are counted separately.
token_total = len(words)
distinct_total = len(set(words))

print("Total words in corpus:", token_total)
print("Distinct words:", distinct_total)

Total words in corpus: 192427
Distinct words: 7811


## 4. Removing Stop Words from Text

In [7]:
from nltk.corpus import stopwords
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words("english"))

# Keep every token whose lowercase form is not a stopword. The token itself is
# stored unchanged, so original casing and punctuation tokens are preserved.
filtered_words = []
for token in word_tokens:
    if token.lower() in stop_words:
        continue
    filtered_words.append(token)

print("Filtered Words (without stopwords):", filtered_words)

Filtered Words (without stopwords): ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'field', 'computer', 'science', '.', 'focuses', 'interaction', 'computers', 'human', 'language', '.', 'involves', 'understanding', ',', 'interpreting', ',', 'generating', 'human', 'language', '.']


## 5. Top 10 Frequent Words (Excluding Stopwords)

In [8]:
from collections import Counter

def top_frequent_non_stopwords(text, n=10):
    """Return the ``n`` most common alphabetic, non-stopword tokens of ``text``.

    The text is lowercased before tokenizing, so counting is case-insensitive.
    Tokens that are not purely alphabetic (punctuation, numbers, mixed tokens)
    and tokens found in the English stopword list are ignored.

    Args:
        text: The string to analyse.
        n: How many entries to return (default 10).

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(n)``.
    """
    english_stopwords = set(stopwords.words("english"))

    # Tally qualifying tokens in reading order.
    counts = Counter()
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in english_stopwords:
            counts[token] += 1

    return counts.most_common(n)

# Run the helper on the sample `text` with the default n=10 and print the
# resulting list of (word, count) pairs.
print(top_frequent_non_stopwords(text))

[('language', 3), ('human', 2), ('natural', 1), ('processing', 1), ('nlp', 1), ('field', 1), ('computer', 1), ('science', 1), ('focuses', 1), ('interaction', 1)]
