## 1. Installation and Exploring Features of NLTK Tools

In [None]:
!pip install nltk

In [None]:
import nltk

# Data needed by the cells below: tokenizer models, stop-word lists, and the
# Gutenberg corpus.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; fetch both so the tokenizers work regardless of the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('gutenberg')

## 2. Word, Sentence, and Paragraph Tokenizers

In [None]:
import re

from nltk.tokenize import word_tokenize, sent_tokenize

text = """
Natural Language Processing (NLP) is a field of computer science. 
It focuses on the interaction between computers and human language.
This involves understanding, interpreting, and generating human language.
"""

word_tokens = word_tokenize(text)
sentence_tokens = sent_tokenize(text)

# A paragraph is a run of text delimited by one or more blank lines. Splitting
# on a single "\n" would return individual lines (plus empty strings for the
# leading/trailing newline of the triple-quoted literal), so split on blank
# lines instead and drop empty pieces.
paragraph_tokens = [
    paragraph.strip()
    for paragraph in re.split(r"\n\s*\n", text)
    if paragraph.strip()
]

print("Word Tokens:", word_tokens)
print("\nSentence Tokens:", sentence_tokens)
print("\nParagraph Tokens:", paragraph_tokens)

## 3. Corpus Word Count and Distinct Words

In [None]:
from nltk.corpus import gutenberg

# Token sequence for the selected Gutenberg file.
words = gutenberg.words("austen-emma.txt")

total_word_count = len(words)
# set() compares tokens exactly, so differently-cased forms count as distinct.
distinct_word_count = len(set(words))

print("Total words in corpus:", total_word_count)
print("Distinct words:", distinct_word_count)

## 4. Removing Stop Words from Text

In [None]:
from nltk.corpus import stopwords

# Set membership keeps the per-token lookup cheap.
stop_words = set(stopwords.words("english"))

# Keep each token from the tokenizer cell whose lower-cased form is not a stop
# word; the token itself is kept with its original casing.
filtered_words = [
    token
    for token in word_tokens
    if token.lower() not in stop_words
]
print("Filtered Words (without stopwords):", filtered_words)

## 5. Top 10 Frequent Words (Excluding Stopwords)

In [None]:
from collections import Counter

def top_frequent_non_stopwords(text, n=10, language="english"):
    """Return the ``n`` most common alphabetic, non-stop-word tokens in ``text``.

    The text is lower-cased before tokenizing, so counts are case-insensitive.
    Tokens that are not purely alphabetic (punctuation, numbers, mixed tokens)
    are discarded, as are stop words for ``language``.

    Args:
        text: The string to analyse.
        n: How many entries to return; passed straight to
            ``Counter.most_common``.
        language: Language name handed to both ``word_tokenize`` and
            ``stopwords.words``; it must be available in both the tokenizer
            models and the stop-word corpus. Defaults to ``"english"``.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least common.
    """
    tokens = word_tokenize(text.lower(), language=language)
    stop_words = set(stopwords.words(language))
    content_words = (
        token for token in tokens if token.isalpha() and token not in stop_words
    )
    return Counter(content_words).most_common(n)


print(top_frequent_non_stopwords(text))