Develop a text preprocessing and analysis application using NLTK for tokenization, POS 
tagging, and basic NLP tasks. 

1. Install & Import Libraries

In [1]:
!pip install nltk





[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from collections import Counter


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

2. Input Text

In [4]:
text = """
Natural Language Processing enables machines to understand 
and analyze human language efficiently.
"""


3. Sentence Tokenization

In [5]:
sentences = sent_tokenize(text)
sentences


['\nNatural Language Processing enables machines to understand \nand analyze human language efficiently.']

4. Word Tokenization

In [6]:
words = word_tokenize(text.lower())
words


['natural',
 'language',
 'processing',
 'enables',
 'machines',
 'to',
 'understand',
 'and',
 'analyze',
 'human',
 'language',
 'efficiently',
 '.']

5. Remove Stopwords & Punctuation

In [7]:
stop_words = set(stopwords.words('english'))

clean_words = [
    word for word in words 
    if word not in stop_words and word not in string.punctuation
]

clean_words


['natural',
 'language',
 'processing',
 'enables',
 'machines',
 'understand',
 'analyze',
 'human',
 'language',
 'efficiently']

6. Stemming

In [8]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in clean_words]
stemmed_words

['natur',
 'languag',
 'process',
 'enabl',
 'machin',
 'understand',
 'analyz',
 'human',
 'languag',
 'effici']

7.Lemmatization

In [9]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in clean_words]
lemmatized_words


['natural',
 'language',
 'processing',
 'enables',
 'machine',
 'understand',
 'analyze',
 'human',
 'language',
 'efficiently']

8. POS Tagging

In [10]:
pos_tags = pos_tag(lemmatized_words)
pos_tags


[('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('enables', 'VBZ'),
 ('machine', 'NN'),
 ('understand', 'NN'),
 ('analyze', 'NN'),
 ('human', 'JJ'),
 ('language', 'NN'),
 ('efficiently', 'RB')]

9. Basic NLP Analysis

In [11]:
#Word Frequency
word_freq = Counter(lemmatized_words)
word_freq

Counter({'language': 2,
         'natural': 1,
         'processing': 1,
         'enables': 1,
         'machine': 1,
         'understand': 1,
         'analyze': 1,
         'human': 1,
         'efficiently': 1})

In [12]:
#Most Common Words
word_freq.most_common(5)

[('language', 2),
 ('natural', 1),
 ('processing', 1),
 ('enables', 1),
 ('machine', 1)]

10.Complete Pipeline Function (Reusable)

In [13]:
def nlp_pipeline(text):
    words = word_tokenize(text.lower())
    clean = [w for w in words if w not in stop_words and w not in string.punctuation]
    lemmas = [lemmatizer.lemmatize(w) for w in clean]
    tags = pos_tag(lemmas)
    return tags


In [14]:
nlp_pipeline("NLTK is widely used for text preprocessing and NLP tasks.")


[('nltk', 'RB'),
 ('widely', 'RB'),
 ('used', 'VBN'),
 ('text', 'NN'),
 ('preprocessing', 'VBG'),
 ('nlp', 'JJ'),
 ('task', 'NN')]