In [2]:
import nltk

# Fetch every NLTK resource the preprocessing cell relies on, so the notebook
# survives Restart Kernel -> Run All on a machine with an empty nltk_data dir:
#   'punkt'                      -> nltk.word_tokenize
#   'averaged_perceptron_tagger' -> nltk.pos_tag
#   'stopwords'                  -> stopwords.words('english')
#   'wordnet'                    -> WordNetLemmatizer
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm the resource names
# against the installed NLTK version.
nltk.download(['punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample document
document = "This is a sample document for pre-processing. It includes tokenization, POS tagging, stop words removal, stemming, and lemmatization."

# Tokenization: split the raw text into word and punctuation tokens.
tokens = nltk.word_tokenize(document)

# POS Tagging: tag every token (punctuation included) with its part of speech.
pos_tags = nltk.pos_tag(tokens)

# Stop Words Removal: case-insensitive match against NLTK's English list.
# Punctuation tokens are not stop words, so they are kept.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Stemming: reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Lemmatization: no POS argument is passed, so the lemmatizer's default part
# of speech is applied to every token.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

# The vectorizers take an iterable of document strings; the corpus here is a
# single document rebuilt from the tokens.
corpus = [' '.join(tokens)]

# Term Frequency (TF): with use_idf=False and norm=None the matrix holds the
# raw per-document term counts.
tf_vectorizer = TfidfVectorizer(use_idf=False, norm=None)
tf_matrix = tf_vectorizer.fit_transform(corpus)

# Inverse Document Frequency (IDF)
# fit_transform() returns the TF-IDF document-term matrix, NOT the IDF
# weights; the learned per-term IDF vector lives in the fitted `idf_` attribute.
idf_vectorizer = TfidfVectorizer(use_idf=True, norm=None)
idf_matrix = idf_vectorizer.fit_transform(corpus)
idf_values = idf_vectorizer.idf_

# Terms in matrix-column order. Both vectorizers are fitted on the same corpus
# with the same tokenization settings, so this ordering applies to both.
terms = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)

# Print results
print("Original Document:\n", document)
print("\nTokenization:\n", tokens)
print("\nPOS Tagging:\n", pos_tags)
print("\nStop Words Removal:\n", filtered_tokens)
print("\nStemming:\n", stemmed_tokens)
print("\nLemmatization:\n", lemmatized_tokens)
print("\nVocabulary (column order):\n", terms)
print("\nTerm Frequency (TF):\n", tf_matrix.toarray())
# With scikit-learn's default smoothed formula, ln((1 + n) / (1 + df)) + 1,
# a one-document corpus gives every term an IDF of exactly 1.0, which is why
# the TF-IDF matrix below equals the TF matrix. Fit on several documents to
# obtain discriminative weights.
print("\nInverse Document Frequency (IDF):\n", idf_values)
print("\nTF-IDF:\n", idf_matrix.toarray())


Original Document:
 This is a sample document for pre-processing. It includes tokenization, POS tagging, stop words removal, stemming, and lemmatization.

Tokenization:
 ['This', 'is', 'a', 'sample', 'document', 'for', 'pre-processing', '.', 'It', 'includes', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']

POS Tagging:
 [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('document', 'NN'), ('for', 'IN'), ('pre-processing', 'NN'), ('.', '.'), ('It', 'PRP'), ('includes', 'VBZ'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]

Stop Words Removal:
 ['sample', 'document', 'pre-processing', '.', 'includes', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'lemmatization', '.']

