In [2]:
import nltk
# Download the NLTK data packages this notebook relies on. Each call logs its
# progress with an "[nltk_data]" prefix and is harmless to re-run: packages
# that are already present are reported as up-to-date.
nltk.download('punkt')                       # tokenizer models (word_tokenize)
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('wordnet')                     # WordNet data (WordNetLemmatizer)
# Last expression of the cell: its return value is what the cell displays.
nltk.download('stopwords')                   # stop word lists (stopwords.words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Prathamesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Prathamesh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Prathamesh\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prathamesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import wordnet
import math

# Sample document.
# Adjacent string literals are joined by the parser, so the sentence stays one
# string without the run of indentation spaces that a backslash continuation
# *inside* the quotes would embed in the text.
sample_document = (
    "Text analytics is the process of analyzing unstructured text data to extract "
    "relevant information. It involves several preprocessing steps such as tokenization, "
    "POS tagging, stop words removal, stemming, and lemmatization."
)

# Tokenization: split the raw text into word and punctuation tokens.
tokens = word_tokenize(sample_document)

# POS Tagging: run on the full token list so the tagger sees complete sentences.
pos_tags = nltk.pos_tag(tokens)

# Stopwords Removal
# The stop word comparison is case-insensitive. Tokens with no alphanumeric
# character (',' and '.') are dropped as well, so that punctuation is not
# carried into stemming, lemmatization and the term-frequency counts.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    word for word in tokens
    if word.lower() not in stop_words and any(ch.isalnum() for ch in word)
]

# Stemming
ps = PorterStemmer()
stemmed_tokens = [ps.stem(word) for word in filtered_tokens]

# Lemmatization
# Tokens are lowercased first so that 'Text' and 'text' become the same term.
# Every token is lemmatized as a verb (pos='v').
# NOTE(review): pos_tags is available if per-token POS selection is wanted.
wnl = WordNetLemmatizer()
lemmatized_tokens = [wnl.lemmatize(word.lower(), pos='v') for word in filtered_tokens]

# Term Frequency Calculation: occurrences of each normalized term in the sample.
tf = FreqDist(lemmatized_tokens)

# Inverse Document Frequency Calculation
def idf(term, documents):
    """Return the inverse document frequency of ``term`` over ``documents``.

    A document counts as containing the term when ``term in doc`` is true.
    For a list or set of tokens that is an exact-token test; for a plain
    string it is a case-sensitive substring test.

    Returns ``log(len(documents) / n)`` where ``n`` is the number of
    documents containing the term, or ``0`` when no document contains it
    (which also covers an empty collection).
    """
    matching_docs = [doc for doc in documents if term in doc]
    if not matching_docs:
        # Guard against division by zero for unseen terms.
        return 0
    return math.log(len(documents) / len(matching_docs))

# Example collection of documents
documents = [
    "Text analytics is the process of analyzing unstructured text data to extract relevant information.",
    "Text analytics involves several preprocessing steps such as tokenization, POS tagging, stop words removal, stemming, and lemmatization.",
    "Text analytics helps in extracting insights from large volumes of text data for various applications.",
]

# Reduce every document to the set of normalized terms it contains: lowercase,
# stop words and punctuation-only tokens removed, lemmatized as verbs.
# Document frequency must be counted against these sets rather than the raw
# strings: `term in "<raw text>"` is a case-sensitive substring test, so a
# lowercased lemma such as 'pos' or 'analyze' would never be found in the
# original sentence.
doc_term_sets = [
    {
        wnl.lemmatize(word.lower(), pos='v')
        for word in word_tokenize(doc)
        if word.lower() not in stop_words and any(ch.isalnum() for ch in word)
    }
    for doc in documents
]

# Calculate IDF for each term in the collection of documents
# Each distinct term is scored once; the vocabulary is sorted so that the
# printed order does not depend on set iteration order.
vocabulary = sorted(set().union(*doc_term_sets))
idf_scores = {term: idf(term, doc_term_sets) for term in vocabulary}

# Print results
print("Tokenization:", tokens)
print("\nPOS Tagging:", pos_tags)
print("\nStopwords Removal:", filtered_tokens)
print("\nStemming:", stemmed_tokens)
print("\nLemmatization:", lemmatized_tokens)
print("\nTerm Frequency:", tf.most_common())
print("\nInverse Document Frequency:")
for term, score in idf_scores.items():
    print(term, ":", score)


Tokenization: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'analyzing', 'unstructured', 'text', 'data', 'to', 'extract', 'relevant', 'information', '.', 'It', 'involves', 'several', 'preprocessing', 'steps', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']

POS Tagging: [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('analyzing', 'VBG'), ('unstructured', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('to', 'TO'), ('extract', 'VB'), ('relevant', 'JJ'), ('information', 'NN'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('several', 'JJ'), ('preprocessing', 'VBG'), ('steps', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]

St